The following pull request was submitted through Github. It can be accessed and reviewed at: https://github.com/lxc/lxd/pull/2191
This e-mail was sent by the LXC bot; direct replies will not reach the author unless they happen to be subscribed to this list. === Description (from pull-request) ===
From bde987ddbf167f5e848d001b4e28d828d6b093b9 Mon Sep 17 00:00:00 2001 From: Tycho Andersen <tycho.ander...@canonical.com> Date: Thu, 7 Jul 2016 22:28:32 +0000 Subject: [PATCH 1/2] make client.websocket a public API We'll use this in the next patch. Signed-off-by: Tycho Andersen <tycho.ander...@canonical.com> --- client.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/client.go b/client.go index 3567127..3e0d791 100644 --- a/client.go +++ b/client.go @@ -482,7 +482,7 @@ func (c *Client) delete(base string, args interface{}, rtype ResponseType) (*Res return HoistResponse(resp, rtype) } -func (c *Client) websocket(operation string, secret string) (*websocket.Conn, error) { +func (c *Client) Websocket(operation string, secret string) (*websocket.Conn, error) { query := url.Values{"secret": []string{secret}} url := c.BaseWSURL + path.Join(operation, "websocket") + "?" + query.Encode() return WebsocketDial(c.websocketDialer, url) @@ -1513,7 +1513,7 @@ func (c *Client) Exec(name string, cmd []string, env map[string]string, if controlHandler != nil { var control *websocket.Conn if wsControl, ok := fds["control"]; ok { - control, err = c.websocket(resp.Operation, wsControl.(string)) + control, err = c.Websocket(resp.Operation, wsControl.(string)) if err != nil { return -1, err } @@ -1522,7 +1522,7 @@ func (c *Client) Exec(name string, cmd []string, env map[string]string, go controlHandler(c, control) } - conn, err := c.websocket(resp.Operation, fds["0"].(string)) + conn, err := c.Websocket(resp.Operation, fds["0"].(string)) if err != nil { return -1, err } @@ -1535,7 +1535,7 @@ func (c *Client) Exec(name string, cmd []string, env map[string]string, conns := make([]*websocket.Conn, 3) dones := make([]chan bool, 3) - conns[0], err = c.websocket(resp.Operation, fds[strconv.Itoa(0)].(string)) + conns[0], err = c.Websocket(resp.Operation, fds[strconv.Itoa(0)].(string)) if err != nil { return -1, err } @@ -1545,7 +1545,7 @@ func (c *Client) 
Exec(name string, cmd []string, env map[string]string, outputs := []io.WriteCloser{stdout, stderr} for i := 1; i < 3; i++ { - conns[i], err = c.websocket(resp.Operation, fds[strconv.Itoa(i)].(string)) + conns[i], err = c.Websocket(resp.Operation, fds[strconv.Itoa(i)].(string)) if err != nil { return -1, err } From b8078bc35703bdf09062adfc3491bbc70ba2aa14 Mon Sep 17 00:00:00 2001 From: Tycho Andersen <tycho.ander...@canonical.com> Date: Thu, 7 Jul 2016 02:35:44 +0000 Subject: [PATCH 2/2] resume dumped container on failed restore This commit implements "freezing" the dumped container until we are sure that the restore was successful. Because of various issues (namely, TCP socket repair mode and windowing, but also various other problems), we can't simply implement some kind of --leave-frozen option in CRIU. Instead, we have CRIU do a callback when the dump is done but the container is still frozen, so that we can then try the restore and see if it succeeds. Signed-off-by: Tycho Andersen <tycho.ander...@canonical.com> --- lxd/container.go | 7 +++- lxd/container_lxc.go | 14 +++++-- lxd/main.go | 23 +++++++++++ lxd/migrate.go | 115 ++++++++++++++++++++++++++++++++++++++++++++++++--- 4 files changed, 147 insertions(+), 12 deletions(-) diff --git a/lxd/container.go b/lxd/container.go index dd1038c..8242de7 100644 --- a/lxd/container.go +++ b/lxd/container.go @@ -271,7 +271,10 @@ type container interface { // Snapshots & migration Restore(sourceContainer container) error - Migrate(cmd uint, stateDir string, function string, stop bool) error + /* actionScript here is a script called action.sh in the stateDir, to + * be passed to CRIU as --action-script + */ + Migrate(cmd uint, stateDir string, function string, stop bool, actionScript bool) error Snapshots() ([]container, error) // Config handling @@ -457,7 +460,7 @@ func containerCreateAsSnapshot(d *Daemon, args containerArgs, sourceContainer co * after snapshotting will fail. 
*/ - err = sourceContainer.Migrate(lxc.MIGRATE_DUMP, stateDir, "snapshot", false) + err = sourceContainer.Migrate(lxc.MIGRATE_DUMP, stateDir, "snapshot", false, false) if err != nil { os.RemoveAll(sourceContainer.StatePath()) return nil, err diff --git a/lxd/container_lxc.go b/lxd/container_lxc.go index 8761e3e..2cb3521 100644 --- a/lxd/container_lxc.go +++ b/lxd/container_lxc.go @@ -1198,7 +1198,7 @@ func (c *containerLXC) Start(stateful bool) error { return fmt.Errorf("Container has no existing state to restore.") } - err := c.Migrate(lxc.MIGRATE_RESTORE, c.StatePath(), "snapshot", false) + err := c.Migrate(lxc.MIGRATE_RESTORE, c.StatePath(), "snapshot", false, false) if err != nil && !c.IsRunning() { return err } @@ -1364,7 +1364,7 @@ func (c *containerLXC) Stop(stateful bool) error { } // Checkpoint - err = c.Migrate(lxc.MIGRATE_DUMP, stateDir, "snapshot", true) + err = c.Migrate(lxc.MIGRATE_DUMP, stateDir, "snapshot", true, false) if err != nil { return err } @@ -1740,7 +1740,7 @@ func (c *containerLXC) Restore(sourceContainer container) error { // If the container wasn't running but was stateful, should we restore // it as running? 
if shared.PathExists(c.StatePath()) { - if err := c.Migrate(lxc.MIGRATE_RESTORE, c.StatePath(), "snapshot", false); err != nil { + if err := c.Migrate(lxc.MIGRATE_RESTORE, c.StatePath(), "snapshot", false, false); err != nil { return err } @@ -2661,7 +2661,7 @@ func findCriu(host string) error { return nil } -func (c *containerLXC) Migrate(cmd uint, stateDir string, function string, stop bool) error { +func (c *containerLXC) Migrate(cmd uint, stateDir string, function string, stop bool, actionScript bool) error { if err := findCriu(function); err != nil { return err } @@ -2747,11 +2747,17 @@ func (c *containerLXC) Migrate(cmd uint, stateDir string, function string, stop return err } + script := "" + if actionScript { + script = filepath.Join(stateDir, "action.sh") + } + opts := lxc.MigrateOptions{ Stop: stop, Directory: stateDir, Verbose: true, PreservesInodes: preservesInodes, + ActionScript: script, } migrateErr = c.c.Migrate(cmd, opts) diff --git a/lxd/main.go b/lxd/main.go index 9da7563..4eb4372 100644 --- a/lxd/main.go +++ b/lxd/main.go @@ -158,6 +158,8 @@ func run() error { fmt.Printf(" Start a container\n") fmt.Printf(" callhook\n") fmt.Printf(" Call a container hook\n") + fmt.Printf(" migratedumpsuccess\n") + fmt.Printf(" Indicate that a migration dump was successful\n") fmt.Printf(" netcat\n") fmt.Printf(" Mirror a unix socket to stdin/stdout\n") } @@ -237,6 +239,8 @@ func run() error { os.Exit(ret) case "netcat": return Netcat(os.Args[1:]) + case "migratedumpsuccess": + return cmdMigrateDumpSuccess(os.Args[1:]) } } @@ -1050,3 +1054,22 @@ func printnet() error { return nil } + +func cmdMigrateDumpSuccess(args []string) error { + if len(args) != 3 { + return fmt.Errorf("bad migrate dump success args %s", args) + } + + c, err := lxd.NewClient(&lxd.DefaultConfig, "local") + if err != nil { + return err + } + + conn, err := c.Websocket(args[1], args[2]) + if err != nil { + return err + } + conn.Close() + + return c.WaitForSuccess(args[1]) +} diff --git 
a/lxd/migrate.go b/lxd/migrate.go index 14e9c00..5377a61 100644 --- a/lxd/migrate.go +++ b/lxd/migrate.go @@ -11,6 +11,7 @@ import ( "net/http" "net/url" "os" + "path/filepath" "strconv" "strings" "sync" @@ -228,7 +229,28 @@ func (s *migrationSourceWs) Connect(op *operation, r *http.Request, w http.Respo return nil } -func (s *migrationSourceWs) Do(op *operation) error { +func writeActionScript(directory string, operation string, secret string) error { + script := fmt.Sprintf(`#!/bin/sh -e +if [ "$CRTOOLS_SCRIPT_ACTION" = "post-dump" ]; then + %s migratedumpsuccess %s %s +fi +`, execPath, operation, secret) + + f, err := os.Create(filepath.Join(directory, "action.sh")) + if err != nil { + return err + } + defer f.Close() + + if err := f.Chmod(0500); err != nil { + return err + } + + _, err = f.WriteString(script) + return err +} + +func (s *migrationSourceWs) Do(migrateOp *operation) error { <-s.allConnected criuType := CRIUType_CRIU_RSYNC.Enum() @@ -326,15 +348,98 @@ func (s *migrationSourceWs) Do(op *operation) error { return abort(fmt.Errorf("Formats other than criu rsync not understood")) } + /* What happens below is slightly convoluted. Due to various + * complications with networking, there's no easy way for criu + * to exit and leave the container in a frozen state for us to + * somehow resume later. + * + * Instead, we use what criu calls an "action-script", which is + * basically a callback that lets us know when the dump is + * done. (Unfortunately, we can't pass arguments, just an + * executable path, so we write a custom action script with the + * real command we want to run.) + * + * This script then hangs until the migration operation either + * finishes successfully or fails, and exits 1 or 0, which + * causes criu to either leave the container running or kill it + * as we asked. 
+ */ + dumpDone := make(chan bool, 1) + actionScriptOpSecret, err := shared.RandomCryptoString() + if err != nil { + return abort(err) + } + + actionScriptOp, err := operationCreate( + operationClassWebsocket, + nil, + nil, + func(op *operation) error { + _, err := migrateOp.WaitFinal(-1) + if err != nil { + return err + } + + if migrateOp.status != shared.Success { + return fmt.Errorf("restore failed: %s", op.status.String()) + } + return nil + }, + nil, + func(op *operation, r *http.Request, w http.ResponseWriter) error { + secret := r.FormValue("secret") + if secret == "" { + return fmt.Errorf("missing secret") + } + + if secret != actionScriptOpSecret { + return os.ErrPermission + } + + c, err := shared.WebsocketUpgrader.Upgrade(w, r, nil) + if err != nil { + return err + } + + dumpDone <- true + + closeMsg := websocket.FormatCloseMessage(websocket.CloseNormalClosure, "") + return c.WriteMessage(websocket.CloseMessage, closeMsg) + }, + ) + if err != nil { + return abort(err) + } + checkpointDir, err := ioutil.TempDir("", "lxd_checkpoint_") if err != nil { return abort(err) } - defer os.RemoveAll(checkpointDir) - err = s.container.Migrate(lxc.MIGRATE_DUMP, checkpointDir, "migration", true) + if err := writeActionScript(checkpointDir, actionScriptOp.url, actionScriptOpSecret); err != nil { + os.RemoveAll(checkpointDir) + return abort(err) + } + + _, err = actionScriptOp.Run() if err != nil { + os.RemoveAll(checkpointDir) + return abort(err) + } + + migrateDone := make(chan error, 1) + go func() { + defer os.RemoveAll(checkpointDir) + migrateDone <- s.container.Migrate(lxc.MIGRATE_DUMP, checkpointDir, "migration", true, true) + }() + + select { + /* the checkpoint failed, let's just abort */ + case err = <-migrateDone: return abort(err) + /* the dump finished, let's continue on to the restore */ + case <-dumpDone: + shared.Debugf("Dump finished, continuing with restore...") } /* @@ -361,8 +466,6 @@ func (s *migrationSourceWs) Do(op *operation) error { return err } 
- // TODO: should we add some config here about automatically restarting - // the container migrate failure? What about the failures above? if !*msg.Success { return fmt.Errorf(*msg.Message) } @@ -559,7 +662,7 @@ func (c *migrationSink) do() error { } if c.live { - err = c.container.Migrate(lxc.MIGRATE_RESTORE, imagesDir, "migration", false) + err = c.container.Migrate(lxc.MIGRATE_RESTORE, imagesDir, "migration", false, false) if err != nil { restore <- err return
_______________________________________________ lxc-devel mailing list lxc-devel@lists.linuxcontainers.org http://lists.linuxcontainers.org/listinfo/lxc-devel