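Allow limiting the number of hosts that go through the scheduler's
periodic Repair Failed -> Verifying cleanup cycle at once.  When an
infrastructure issue kicks all of your hosts into Repair Failed,
launching reverifies on all of them at once can overwhelm the server.
The new reverify_max_hosts_at_once setting caps how many hosts are
reverified per cycle (a random subset is chosen when more are
eligible); the default of 0 means no limit.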

Signed-off-by: Gregory Smith <[email protected]>

--- autotest/global_config.ini  2010-07-14 12:42:00.000000000 -0700
+++ autotest/global_config.ini  2010-10-21 13:16:19.000000000 -0700
@@ -80,6 +80,7 @@
 gc_stats_interval_mins: 360
 # set nonzero to enable periodic reverification of all dead hosts
 reverify_period_minutes: 0
+reverify_max_hosts_at_once: 0 
 drone_sets_enabled: False
 # default_drone_set_name: This is required if drone sets are enabled.
 default_drone_set_name:
--- autotest/scheduler/monitor_db_cleanup.py    2010-02-18 12:57:56.000000000 -0800
+++ autotest/scheduler/monitor_db_cleanup.py    2010-10-21 13:16:19.000000000 -0700
@@ -3,7 +3,7 @@
 """
 
 
-import datetime, time, logging
+import datetime, time, logging, random
 from autotest_lib.database import database_connection
 from autotest_lib.frontend.afe import models
 from autotest_lib.scheduler import email_manager, scheduler_config
@@ -162,6 +162,15 @@
         return (self._last_reverify_time + reverify_period_sec) <= time.time()
 
 
+    def _choose_subset_of_hosts_to_reverify(self, hosts):
+        """Given hosts needing verification, return a subset to reverify."""
+        if (scheduler_config.reverify_max_hosts_at_once > 0 and
+            len(hosts) > scheduler_config.reverify_max_hosts_at_once):
+            return random.sample(hosts,
+                                 scheduler_config.reverify_max_hosts_at_once)
+        return sorted(hosts)
+
+
     def _reverify_dead_hosts(self):
         if not self._should_reverify_hosts_now():
             return
@@ -177,8 +186,11 @@
         if not hosts:
             return
 
-        logging.info('Reverifying dead hosts %s'
-                     % ', '.join(host.hostname for host in hosts))
+        hosts = list(hosts)
+        total_hosts = len(hosts)
+        hosts = self._choose_subset_of_hosts_to_reverify(hosts)
+        logging.info('Reverifying dead hosts (%d of %d) %s', len(hosts),
+                     total_hosts, ', '.join(host.hostname for host in hosts))
         for host in hosts:
             models.SpecialTask.schedule_special_task(
                     host=host, task=models.SpecialTask.Task.VERIFY)
--- autotest/scheduler/monitor_db_cleanup_test.py       2010-01-27 22:21:03.000000000 -0800
+++ autotest/scheduler/monitor_db_cleanup_test.py       2010-10-21 13:16:19.000000000 -0700
@@ -5,7 +5,7 @@
 from autotest_lib.frontend import setup_django_environment
 from autotest_lib.database import database_connection
 from autotest_lib.frontend.afe import frontend_test_utils, models
-from autotest_lib.scheduler import monitor_db_cleanup
+from autotest_lib.scheduler import monitor_db_cleanup, scheduler_config
 from autotest_lib.client.common_lib import host_protections
 
 class UserCleanupTest(unittest.TestCase, frontend_test_utils.FrontendTestMixin):
@@ -23,6 +23,8 @@
 
 
     def test_reverify_dead_hosts(self):
+        # unlimited reverifies
+        self.god.stub_with(scheduler_config, 'reverify_max_hosts_at_once', 0)
         for i in (0, 1, 2):
             self.hosts[i].status = models.Host.Status.REPAIR_FAILED
             self.hosts[i].save()
@@ -43,5 +45,31 @@
         self.assertEquals(tasks[0].task, models.SpecialTask.Task.VERIFY)
 
 
+    def test_reverify_dead_hosts_limits(self):
+        # limit the number of reverifies
+        self.god.stub_with(scheduler_config, 'reverify_max_hosts_at_once', 2)
+        for i in (0, 1, 2, 3, 4, 5):
+            self.hosts[i].status = models.Host.Status.REPAIR_FAILED
+            self.hosts[i].save()
+
+        self.hosts[1].locked = True
+        self.hosts[1].save()
+
+        self.hosts[2].protection = host_protections.Protection.DO_NOT_VERIFY
+        self.hosts[2].save()
+
+        self.god.stub_with(self.cleanup, '_should_reverify_hosts_now',
+                           lambda : True)
+        self.cleanup._reverify_dead_hosts()
+
+        tasks = models.SpecialTask.objects.all()
+        # four hosts need reverifying but our max limit was set to 2
+        self.assertEquals(len(tasks), 2)
+        self.assertTrue(tasks[0].host.id in (1, 4, 5, 6))
+        self.assertTrue(tasks[1].host.id in (1, 4, 5, 6))
+        self.assertEquals(tasks[0].task, models.SpecialTask.Task.VERIFY)
+        self.assertEquals(tasks[1].task, models.SpecialTask.Task.VERIFY)
+
+
 if __name__ == '__main__':
     unittest.main()
--- autotest/scheduler/scheduler_config.py      2010-01-10 11:48:43.000000000 -0800
+++ autotest/scheduler/scheduler_config.py      2010-10-21 13:16:19.000000000 -0700
@@ -16,6 +16,7 @@
               'secs_to_wait_for_atomic_group_hosts':
                   'secs_to_wait_for_atomic_group_hosts',
               'reverify_period_minutes': 'reverify_period_minutes',
+              'reverify_max_hosts_at_once': 'reverify_max_hosts_at_once',
              }
_______________________________________________
Autotest mailing list
[email protected]
http://test.kernel.org/cgi-bin/mailman/listinfo/autotest

Reply via email to