On Thu, 2010-10-21 at 13:28 -0700, Gregory Smith wrote: > Allow limiting of the number of hosts that go through the scheduler's > periodic Repair Failed -> Verifying cleanup cycle at once. When you've > got an infrastructure issue that kicks all your hosts into repair > failed, launching reverifies on all of them at once can overwhelm the > server.
Hi Greg, looks good to me. I had even prepared an updated version of your patch with the fix cobbled together, but I just remembered that you guys at Google have an internal branch and some sort of tracker of the changes, so instead I am going to commit the original patches one after the other. Thanks, Lucas > Signed-off-by: Gregory Smith <[email protected]> > > --- autotest/global_config.ini 2010-07-14 12:42:00.000000000 -0700 > +++ autotest/global_config.ini 2010-10-21 13:16:19.000000000 -0700 > @@ -80,6 +80,7 @@ > gc_stats_interval_mins: 360 > # set nonzero to enable periodic reverification of all dead hosts > reverify_period_minutes: 0 > +reverify_max_hosts_at_once: 0 > drone_sets_enabled: False > # default_drone_set_name: This is required if drone sets are enabled. > default_drone_set_name: > --- autotest/scheduler/monitor_db_cleanup.py 2010-02-18 12:57:56.000000000 > -0800 > +++ autotest/scheduler/monitor_db_cleanup.py 2010-10-21 13:16:19.000000000 > -0700 > @@ -3,7 +3,7 @@ > """ > > > -import datetime, time, logging > +import datetime, time, logging, random > from autotest_lib.database import database_connection > from autotest_lib.frontend.afe import models > from autotest_lib.scheduler import email_manager, scheduler_config > @@ -162,6 +162,15 @@ > return (self._last_reverify_time + reverify_period_sec) <= > time.time() > > > + def _choose_subset_of_hosts_to_reverify(self, hosts): > + """Given hosts needing verification, return a subset to reverify.""" > + if (scheduler_config.reverify_max_hosts_at_once > 0 and > + len(hosts) > scheduler_config.reverify_max_hosts_at_once): > + return random.sample(hosts, > + scheduler_config.reverify_max_hosts_at_once) > + return sorted(hosts) > + > + > def _reverify_dead_hosts(self): > if not self._should_reverify_hosts_now(): > return > @@ -177,8 +186,11 @@ > if not hosts: > return > > - logging.info('Reverifying dead hosts %s' > - % ', '.join(host.hostname for host in hosts)) > + hosts = list(hosts) > + total_hosts = 
len(hosts) > + hosts = self._choose_subset_of_hosts_to_reverify(hosts) > + logging.info('Reverifying dead hosts (%d of %d) %s', len(hosts), > + total_hosts, ', '.join(host.hostname for host in hosts)) > for host in hosts: > models.SpecialTask.schedule_special_task( > host=host, task=models.SpecialTask.Task.VERIFY) > --- autotest/scheduler/monitor_db_cleanup_test.py 2010-01-27 > 22:21:03.000000000 -0800 > +++ autotest/scheduler/monitor_db_cleanup_test.py 2010-10-21 > 13:16:19.000000000 -0700 > @@ -5,7 +5,7 @@ > from autotest_lib.frontend import setup_django_environment > from autotest_lib.database import database_connection > from autotest_lib.frontend.afe import frontend_test_utils, models > -from autotest_lib.scheduler import monitor_db_cleanup > +from autotest_lib.scheduler import monitor_db_cleanup, scheduler_config > from autotest_lib.client.common_lib import host_protections > > class UserCleanupTest(unittest.TestCase, > frontend_test_utils.FrontendTestMixin): > @@ -23,6 +23,8 @@ > > > def test_reverify_dead_hosts(self): > + # unlimited reverifies > + self.god.stub_with(scheduler_config, 'reverify_max_hosts_at_once', 0) > for i in (0, 1, 2): > self.hosts[i].status = models.Host.Status.REPAIR_FAILED > self.hosts[i].save() > @@ -43,5 +45,31 @@ > self.assertEquals(tasks[0].task, models.SpecialTask.Task.VERIFY) > > > + def test_reverify_dead_hosts_limits(self): > + # limit the number of reverifies > + self.god.stub_with(scheduler_config, 'reverify_max_hosts_at_once', 2) > + for i in (0, 1, 2, 3, 4, 5): > + self.hosts[i].status = models.Host.Status.REPAIR_FAILED > + self.hosts[i].save() > + > + self.hosts[1].locked = True > + self.hosts[1].save() > + > + self.hosts[2].protection = host_protections.Protection.DO_NOT_VERIFY > + self.hosts[2].save() > + > + self.god.stub_with(self.cleanup, '_should_reverify_hosts_now', > + lambda : True) > + self.cleanup._reverify_dead_hosts() > + > + tasks = models.SpecialTask.objects.all() > + # four hosts need reverifying but our 
max limit was set to 2 > + self.assertEquals(len(tasks), 2) > + self.assertTrue(tasks[0].host.id in (1, 4, 5, 6)) > + self.assertTrue(tasks[1].host.id in (1, 4, 5, 6)) > + self.assertEquals(tasks[0].task, models.SpecialTask.Task.VERIFY) > + self.assertEquals(tasks[1].task, models.SpecialTask.Task.VERIFY) > + > + > if __name__ == '__main__': > unittest.main() > --- autotest/scheduler/scheduler_config.py 2010-01-10 11:48:43.000000000 > -0800 > +++ autotest/scheduler/scheduler_config.py 2010-10-21 13:16:19.000000000 > -0700 > @@ -16,6 +16,7 @@ > 'secs_to_wait_for_atomic_group_hosts': > 'secs_to_wait_for_atomic_group_hosts', > 'reverify_period_minutes': 'reverify_period_minutes', > + 'reverify_max_hosts_at_once': 'reverify_max_hosts_at_once', > } > _______________________________________________ > Autotest mailing list > [email protected] > http://test.kernel.org/cgi-bin/mailman/listinfo/autotest _______________________________________________ Autotest mailing list [email protected] http://test.kernel.org/cgi-bin/mailman/listinfo/autotest
