Dduvall has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/238839

Change subject: Support atomic promotion and rollback
......................................................................

Support atomic promotion and rollback

Implemented a new directory structure that supports a more atomic
promotion process and automated rollbacks. Each project revision is
deployed into its own directory under `revs/` and a `current` symlink,
created just before service restart, controls which revision is
currently promoted.

The 'fetch' stage now utilizes a local `cache` directory for the remote
repo, which is then used to clone locally into the revision-specific
directory. This process should be disk-space efficient as long as both
locations are on the same local filesystem, as git hardlinks objects by
default for local clones. If it proves problematic, we can investigate
using something like `git-new-workdir`.

Rollback support depends on two additional symlinks (`.in-progress` and
`.done`), both located in the root deploy directory, to maintain state
for the current deployment across target nodes. The `.in-progress`
symlink is created following a successful 'fetch'; upon a successful
'check', the `.done` symlink is updated to point to the new revision and
the `.in-progress` symlink is removed.

If any failure occurs during the `promote` and `check` stages, the user
is prompted to roll back. The actual rollback process simply resolves the
last successful revision by looking at the `.done` symlink and performs
a 'promote' to that revision.

Bug: T109514
Change-Id: I49132b68339e489729009532c9e19a7e50f5d6fe
---
M scap/cli.py
M scap/main.py
M scap/tasks.py
M scap/utils.py
4 files changed, 222 insertions(+), 49 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/tools/scap 
refs/changes/39/238839/1

diff --git a/scap/cli.py b/scap/cli.py
index c9a14a1..4d08b7b 100644
--- a/scap/cli.py
+++ b/scap/cli.py
@@ -111,6 +111,21 @@
 
         return sorted_versions
 
+    def ask(self, question, default):
+        """Provides a y/n prompt if the controlling terminal is interactive.
+
+        :param question: Prompt message to display
+        :param default: Default answer to use in the case of a non-interactive
+                        terminal
+        :returns: str User input or default value
+        """
+
+        if not sys.stdout.isatty():
+            return default
+
+        ans = raw_input('{} [{}]: '.format(question, default)).strip()
+        return ans.lower() if ans else default
+
     def _parse_arguments(self, argv):
         """Parse command line arguments.
 
diff --git a/scap/main.py b/scap/main.py
index 4125e7d..a647421 100644
--- a/scap/main.py
+++ b/scap/main.py
@@ -578,23 +578,55 @@
 
 class DeployLocal(cli.Application):
     """Deploy service code via git"""
-
     STAGES = ['fetch', 'promote', 'check']
+    EX_STAGES = ['rollback']
 
-    @cli.argument('stage', metavar='STAGE', choices=STAGES,
+    rev = None
+    deploy_dir = None
+    cache_dir = None
+    revs_dir = None
+    rev_dir = None
+    user = None
+    progress_flag = None
+    done_flag = None
+
+    @cli.argument('stage', metavar='STAGE', choices=STAGES + EX_STAGES,
         help='Stage of the deployment to execute')
     def main(self, *extra_args):
+        self.rev = self.config['git_rev']
+
+        self.deploy_dir = os.path.normpath("{0}/{1}".format(
+            self.config['git_deploy_dir'], self.config['git_repo']))
+
+        self.cache_dir = os.path.join(self.deploy_dir, 'cache')
+        self.revs_dir = os.path.join(self.deploy_dir, 'revs')
+        self.rev_dir = os.path.join(self.revs_dir, self.rev)
+        self.cur_link = os.path.join(self.deploy_dir, 'current')
+
+        self.progress_flag = os.path.join(self.deploy_dir, '.in-progress')
+        self.done_flag = os.path.join(self.deploy_dir, '.done')
+
+        self.user = self.config['git_repo_user']
+
         getattr(self, self.arguments.stage)()
 
     def fetch(self):
+        """Fetch the specified revision of the remote repo.
+
+        The given repo is cloned into the cache directory and a new working
+        directory for the given revision is created under revs/{rev}.
+
+        At the end of this stage, the .in-progress link is created to signal
+        the possibility for future rollback.
+        """
+
         repo = self.config['git_repo']
-        repo_user = self.config['git_repo_user']
         server = self.config['git_server']
-        rev = self.config['git_rev']
         has_submodules = self.config['git_submodules']
 
-        location = os.path.normpath("{0}/{1}".format(
-            self.config['git_deploy_dir'], repo))
+        # create deployment directories if they don't already exist
+        for d in [self.cache_dir, self.revs_dir]:
+            utils.mkdir_p(d, logger=self.get_logger())
 
         # only supports http from tin for the moment
         scheme = 'http'
@@ -604,20 +636,98 @@
         logger = self.get_logger()
         logger.debug('Fetching from: {}'.format(url))
 
-        tasks.git_fetch(location, url, repo_user, logger)
-        tasks.git_checkout(location, rev, has_submodules, repo_user, logger)
+        # clone/fetch from the repo to the cache directory
+        tasks.git_fetch(self.cache_dir, url,
+                        user=self.user, logger=logger)
+
+        # clone/fetch from the local cache directory to the revision directory
+        tasks.git_fetch(self.rev_dir, self.cache_dir,
+                        user=self.user,
+                        logger=logger)
+
+        # checkout the given revision
+        tasks.git_checkout(self.rev_dir, self.rev,
+                           submodules=has_submodules,
+                           user=self.user)
+
+        # link the .in-progress flag to the rev directory
+        self._link_rev_dir(self.progress_flag)
 
     def promote(self):
+        """Promote the current deployment.
+
+        Switches the `current` symlink to the current revision directory and
+        restarts the configured service.
+        """
+
         service = self.config.get('service_name', None)
+
+        self._link_rev_dir(self.cur_link)
 
         if service is not None:
             tasks.restart_service(service, user=self.config['git_repo_user'])
 
     def check(self):
+        """Verifies whether the promotion was successful.
+
+        Probes the configured service port to measure whether it successfully
+        restarted.
+
+        At the end of this stage, the .done link is created and the
+        .in-progress link is removed.
+        """
+
         port = self.config.get('service_port', None)
 
         if port is not None:
             tasks.check_port(int(port))
+
+        # move .done flag and remove the .in-progress flag
+        self._link_rev_dir(self.done_flag)
+        self._remove_progress_link()
+
+    def rollback(self):
+        """Performs a rollback to the last deployed revision.
+
+        The rollback stage expects an .in-progress symlink to point to the
+        revision directory for the currently running deployment. If the link
+        doesn't exist, it's assumed that the current deployment errored at an
+        early enough stage where a rollback isn't necessary.
+
+        It also looks for a .done symlink that points to the revision
+        directory for the last successful deployment. If this link doesn't
+        exist, a rollback isn't possible. If it does exist, the current
+        revision directory is replaced with the target of the link and the
+        promote stage is re-run.
+        """
+
+        logger = self.get_logger()
+
+        if not os.path.exists(self.progress_flag):
+            logger.info('No rollback necessary. Skipping')
+            return 0
+
+        if not os.path.exists(self.done_flag):
+            raise RuntimeError('there is no previous revision to rollback to')
+
+        rev_dir = os.path.realpath(self.done_flag)
+        rev = os.path.basename(rev_dir)
+
+        if not os.path.isdir(rev_dir):
+            msg = 'rollback failed due to missing rev directory {}'
+            raise RuntimeError(msg.format(rev_dir))
+
+        logger.info('Rolling back to revision {}'.format(rev))
+        self.rev = rev
+        self.rev_dir = rev_dir
+        self.promote()
+        self._remove_progress_link()
+
+    def _link_rev_dir(self, symlink_path):
+        tasks.move_symlink(self.rev_dir, symlink_path, user=self.user)
+
+    def _remove_progress_link(self):
+        tasks.remove_symlink(self.progress_flag, user=self.user)
 
 
 class Deploy(cli.Application):
@@ -626,14 +736,38 @@
     Uses local .scaprc as config for each host in cluster
     """
 
+    MAX_BATCH_SIZE = 80
+
+    DEPLOY_CONF = [
+        'git_deploy_dir',
+        'git_repo_user',
+        'git_server',
+        'git_scheme',
+        'git_repo',
+        'git_rev',
+        'git_submodules',
+        'service_name',
+        'service_port',
+    ]
+
+    repo = []
+    targets = []
+
     @cli.argument('-r', '--rev', default='HEAD', help='Revision to deploy')
+    @cli.argument('-s', '--stages', choices=DeployLocal.STAGES,
+                  help='Deployment stages to execute. Used only for testing.')
     @cli.argument('-l', '--limit-hosts', default='all',
                   help='Limit deploy to hosts matching expression')
     def main(self, *extra_args):
         logger = self.get_logger()
-        repo = self.config['git_repo']
+        self.repo = self.config['git_repo']
         deploy_dir = self.config['git_deploy_dir']
         cwd = os.getcwd()
+
+        if self.arguments.stages:
+            stages = self.arguments.stages.split(',')
+        else:
+            stages = DeployLocal.STAGES
 
         in_deploy_dir = os.path.commonprefix([cwd, deploy_dir]) == deploy_dir
 
@@ -646,24 +780,19 @@
                 'Script must be run from deployment repository under {}'
                     .format(deploy_dir))
 
-        targets = utils.get_target_hosts(
+        self.targets = utils.get_target_hosts(
             self.arguments.limit_hosts,
             utils.read_hosts_file(self.config['dsh_targets'])
         )
 
         logger.info(
             'Deploy will run on the following targets: \n\t- {}'.format(
-                '\n\t- '.join(targets)
+                '\n\t- '.join(self.targets)
             )
         )
 
-        # batch_size not required, don't allow a batch_size > 80 if set
-        # TODO allow batch sizes to be configured per stage
-        batch_size = self.config.get('batch_size', 80)
-        batch_size = 80 if batch_size >= 80 else batch_size
-
         with utils.lock(self.config['lock_file']):
-            with log.Timer('deploy_' + repo):
+            with log.Timer('deploy_' + self.repo):
                 timestamp = datetime.utcnow()
                 tag = utils.git_next_deploy_tag(location=cwd)
                 commit = utils.git_sha(location=cwd, rev=self.arguments.rev)
@@ -685,37 +814,47 @@
                 # apache server
                 tasks.git_update_server_info(self.config['git_submodules'])
 
-                deploy_conf = [
-                    'git_deploy_dir',
-                    'git_repo_user',
-                    'git_server',
-                    'git_scheme',
-                    'git_repo',
-                    'git_rev',
-                    'git_submodules',
-                    'service_name',
-                    'service_port',
-                ]
-
-                deploy_local_cmd = [self.get_script_path('deploy-local')]
-
-                deploy_local_cmd.extend([
-                    "-D '{}:{}'".format(x, self.config.get(x))
-                    for x in deploy_conf
-                    if self.config.get(x) is not None
-                ])
-
-                for stage in DeployLocal.STAGES:
-                    deploy_stage_cmd = deploy_local_cmd + [stage]
-                    logger.debug('Running cmd {}'.format(deploy_stage_cmd))
-                    deploy_stage = ssh.Job(
-                        hosts=targets, user=self.config['ssh_user'])
-                    deploy_stage.command(deploy_stage_cmd)
-                    deploy_stage.progress('deploy_{}_{}'.format(repo, stage))
-                    succeeded, failed = deploy_stage.run(batch_size=batch_size)
-
-                    if failed:
-                        logger.warning('%d targets had deploy errors', failed)
-                        return 1
+                for stage in stages:
+                    ret = self.execute_stage(stage)
+                    if ret > 0:
+                        self.execute_rollback(stage)
+                        return ret
 
         return 0
+
+    def execute_rollback(self, stage):
+        prompt = "Stage '{}' failed. Perform rollback?".format(stage)
+
+        if self.ask(prompt, 'y') == 'y':
+            return self.execute_stage('rollback')
+
+        return 0
+
+    def execute_stage(self, stage):
+        logger = self.get_logger()
+        deploy_local_cmd = [self.get_script_path('deploy-local')]
+        batch_size = self._get_batch_size()
+
+        deploy_local_cmd.extend([
+            "-D '{}:{}'".format(x, self.config.get(x))
+            for x in self.DEPLOY_CONF
+            if self.config.get(x) is not None
+        ])
+
+        deploy_stage_cmd = deploy_local_cmd + [stage]
+        logger.debug('Running cmd {}'.format(deploy_stage_cmd))
+        deploy_stage = ssh.Job(
+            hosts=self.targets, user=self.config['ssh_user'])
+        deploy_stage.command(deploy_stage_cmd)
+        deploy_stage.progress('deploy_{}_{}'.format(self.repo, stage))
+        succeeded, failed = deploy_stage.run(batch_size=batch_size)
+
+        if failed:
+            logger.warning('%d targets had deploy errors', failed)
+            return 1
+
+        return 0
+
+    def _get_batch_size(self):
+        size = self.config.get('batch_size', self.MAX_BATCH_SIZE)
+        return min(size, self.MAX_BATCH_SIZE)
diff --git a/scap/tasks.py b/scap/tasks.py
index 6c5f28b..b1860b5 100644
--- a/scap/tasks.py
+++ b/scap/tasks.py
@@ -710,3 +710,18 @@
         errno.ENOTCONN,
         "Specified port {} is not accepting connections".format(port_number)
     )
+
+
+def move_symlink(source, dest, user='mwdeploy', logger=None):
+    common_path = os.path.commonprefix([source, dest])
+    rsource = os.path.relpath(source, common_path)
+    rdest = os.path.relpath(dest, common_path)
+
+    with utils.cd(common_path):
+        utils.sudo_check_call(user,
+                              "ln -sfT '{}' '{}'".format(rsource, rdest),
+                              logger)
+
+
+def remove_symlink(path, user='mwdeploy', logger=None):
+    utils.sudo_check_call(user, "rm '{}'".format(path), logger)
diff --git a/scap/utils.py b/scap/utils.py
index fb25edb..2e61c24 100644
--- a/scap/utils.py
+++ b/scap/utils.py
@@ -566,6 +566,10 @@
             os.path.isfile(os.path.join(git_path, 'HEAD')))
 
 
+def mkdir_p(path, user=get_real_username(), logger=None):
+    sudo_check_call(user, "mkdir -p '{}'".format(path), logger=logger)
+
+
 @inside_git_dir
 def git_sha(location, rev):
     """Returns SHA1 for things like HEAD or HEAD~~"""

-- 
To view, visit https://gerrit.wikimedia.org/r/238839
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I49132b68339e489729009532c9e19a7e50f5d6fe
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/tools/scap
Gerrit-Branch: master
Gerrit-Owner: Dduvall <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to