On Thu, 2010-10-21 at 13:28 -0700, Gregory Smith wrote:
> Allow limiting of the number of hosts that go through the scheduler's
> periodic Repair Failed -> Verifying cleanup cycle at once.  When you've
> got an infrastructure issue that kicks all your hosts into repair
> failed, launching reverifies on all of them at once can overwhelm the
> server.

Hi Greg, looks good to me. I had even prepared an updated version of
your patch with the fix cobbled together, but I just remembered that you
guys at Google have an internal branch and some sort of tracker of the
changes, so instead I am going to commit the original patches one after
the other.

Thanks,

Lucas

> Signed-off-by: Gregory Smith <[email protected]>
> 
> --- autotest/global_config.ini        2010-07-14 12:42:00.000000000 -0700
> +++ autotest/global_config.ini        2010-10-21 13:16:19.000000000 -0700
> @@ -80,6 +80,7 @@
>  gc_stats_interval_mins: 360
>  # set nonzero to enable periodic reverification of all dead hosts
>  reverify_period_minutes: 0
> +reverify_max_hosts_at_once: 0 
>  drone_sets_enabled: False
>  # default_drone_set_name: This is required if drone sets are enabled.
>  default_drone_set_name:
> --- autotest/scheduler/monitor_db_cleanup.py  2010-02-18 12:57:56.000000000 
> -0800
> +++ autotest/scheduler/monitor_db_cleanup.py  2010-10-21 13:16:19.000000000 
> -0700
> @@ -3,7 +3,7 @@
>  """
>  
> 
> -import datetime, time, logging
> +import datetime, time, logging, random
>  from autotest_lib.database import database_connection
>  from autotest_lib.frontend.afe import models
>  from autotest_lib.scheduler import email_manager, scheduler_config
> @@ -162,6 +162,15 @@
>          return (self._last_reverify_time + reverify_period_sec) <= 
> time.time()
>  
> 
> +    def _choose_subset_of_hosts_to_reverify(self, hosts):
> +        """Given hosts needing verification, return a subset to reverify."""
> +        if (scheduler_config.reverify_max_hosts_at_once > 0 and
> +            len(hosts) > scheduler_config.reverify_max_hosts_at_once):
> +            return random.sample(hosts,
> +                                 scheduler_config.reverify_max_hosts_at_once)
> +        return sorted(hosts)
> +
> +
>      def _reverify_dead_hosts(self):
>          if not self._should_reverify_hosts_now():
>              return
> @@ -177,8 +186,11 @@
>          if not hosts:
>              return
>  
> -        logging.info('Reverifying dead hosts %s'
> -                     % ', '.join(host.hostname for host in hosts))
> +        hosts = list(hosts)
> +        total_hosts = len(hosts)
> +        hosts = self._choose_subset_of_hosts_to_reverify(hosts)
> +        logging.info('Reverifying dead hosts (%d of %d) %s', len(hosts),
> +                     total_hosts, ', '.join(host.hostname for host in hosts))
>          for host in hosts:
>              models.SpecialTask.schedule_special_task(
>                      host=host, task=models.SpecialTask.Task.VERIFY)
> --- autotest/scheduler/monitor_db_cleanup_test.py     2010-01-27 
> 22:21:03.000000000 -0800
> +++ autotest/scheduler/monitor_db_cleanup_test.py     2010-10-21 
> 13:16:19.000000000 -0700
> @@ -5,7 +5,7 @@
>  from autotest_lib.frontend import setup_django_environment
>  from autotest_lib.database import database_connection
>  from autotest_lib.frontend.afe import frontend_test_utils, models
> -from autotest_lib.scheduler import monitor_db_cleanup
> +from autotest_lib.scheduler import monitor_db_cleanup, scheduler_config
>  from autotest_lib.client.common_lib import host_protections
>  
>  class UserCleanupTest(unittest.TestCase, 
> frontend_test_utils.FrontendTestMixin):
> @@ -23,6 +23,8 @@
>  
> 
>      def test_reverify_dead_hosts(self):
> +        # unlimited reverifies
> +        self.god.stub_with(scheduler_config, 'reverify_max_hosts_at_once', 0)
>          for i in (0, 1, 2):
>              self.hosts[i].status = models.Host.Status.REPAIR_FAILED
>              self.hosts[i].save()
> @@ -43,5 +45,31 @@
>          self.assertEquals(tasks[0].task, models.SpecialTask.Task.VERIFY)
>  
> 
> +    def test_reverify_dead_hosts_limits(self):
> +        # limit the number of reverifies
> +        self.god.stub_with(scheduler_config, 'reverify_max_hosts_at_once', 2)
> +        for i in (0, 1, 2, 3, 4, 5):
> +            self.hosts[i].status = models.Host.Status.REPAIR_FAILED
> +            self.hosts[i].save()
> +
> +        self.hosts[1].locked = True
> +        self.hosts[1].save()
> +
> +        self.hosts[2].protection = host_protections.Protection.DO_NOT_VERIFY
> +        self.hosts[2].save()
> +
> +        self.god.stub_with(self.cleanup, '_should_reverify_hosts_now',
> +                           lambda : True)
> +        self.cleanup._reverify_dead_hosts()
> +
> +        tasks = models.SpecialTask.objects.all()
> +        # four hosts need reverifying but our max limit was set to 2
> +        self.assertEquals(len(tasks), 2)
> +        self.assertTrue(tasks[0].host.id in (1, 4, 5, 6))
> +        self.assertTrue(tasks[1].host.id in (1, 4, 5, 6))
> +        self.assertEquals(tasks[0].task, models.SpecialTask.Task.VERIFY)
> +        self.assertEquals(tasks[1].task, models.SpecialTask.Task.VERIFY)
> +
> +
>  if __name__ == '__main__':
>      unittest.main()
> --- autotest/scheduler/scheduler_config.py    2010-01-10 11:48:43.000000000 
> -0800
> +++ autotest/scheduler/scheduler_config.py    2010-10-21 13:16:19.000000000 
> -0700
> @@ -16,6 +16,7 @@
>                'secs_to_wait_for_atomic_group_hosts':
>                    'secs_to_wait_for_atomic_group_hosts',
>                'reverify_period_minutes': 'reverify_period_minutes',
> +              'reverify_max_hosts_at_once': 'reverify_max_hosts_at_once',
>               }
> _______________________________________________
> Autotest mailing list
> [email protected]
> http://test.kernel.org/cgi-bin/mailman/listinfo/autotest


_______________________________________________
Autotest mailing list
[email protected]
http://test.kernel.org/cgi-bin/mailman/listinfo/autotest

Reply via email to