On Thu, Oct 28, 2010 at 3:56 PM, Lucas Meneghel Rodrigues
<[email protected]> wrote:

> On Thu, 2010-10-21 at 13:28 -0700, Gregory Smith wrote:
> > Allow limiting of the number of hosts that go through the scheduler's
> > periodic Repair Failed -> Verifying cleanup cycle at once.  When you've
> > got an infrastructure issue that kicks all your hosts into repair
> > failed, launching reverifies on all of them at once can overwhelm the
> > server.
>
> Hi Greg, looks good to me. I had even prepared an updated version of
> your patch with the fix cobbled together, but I just remembered that you
> guys at Google have an internal branch and some sort of tracker of the
> changes, so instead I am going to commit the original patches one after
> the other.
>

Heh, yeah, we have code that pulls changes one by one from Subversion and
reapplies them to our internal repository when we sync up.  It tries to filter
out the ones we were responsible for mailing, to avoid human intervention when
possible.  Thanks!


>
> Thanks,
>
> Lucas
>
> > Signed-off-by: Gregory Smith <[email protected]>
> >
> > --- autotest/global_config.ini        2010-07-14 12:42:00.000000000 -0700
> > +++ autotest/global_config.ini        2010-10-21 13:16:19.000000000 -0700
> > @@ -80,6 +80,7 @@
> >  gc_stats_interval_mins: 360
> >  # set nonzero to enable periodic reverification of all dead hosts
> >  reverify_period_minutes: 0
> > +reverify_max_hosts_at_once: 0
> >  drone_sets_enabled: False
> >  # default_drone_set_name: This is required if drone sets are enabled.
> >  default_drone_set_name:
> > --- autotest/scheduler/monitor_db_cleanup.py  2010-02-18
> 12:57:56.000000000 -0800
> > +++ autotest/scheduler/monitor_db_cleanup.py  2010-10-21
> 13:16:19.000000000 -0700
> > @@ -3,7 +3,7 @@
> >  """
> >
> >
> > -import datetime, time, logging
> > +import datetime, time, logging, random
> >  from autotest_lib.database import database_connection
> >  from autotest_lib.frontend.afe import models
> >  from autotest_lib.scheduler import email_manager, scheduler_config
> > @@ -162,6 +162,15 @@
> >          return (self._last_reverify_time + reverify_period_sec) <=
> time.time()
> >
> >
> > +    def _choose_subset_of_hosts_to_reverify(self, hosts):
> > +        """Given hosts needing verification, return a subset to
> reverify."""
> > +        if (scheduler_config.reverify_max_hosts_at_once > 0 and
> > +            len(hosts) > scheduler_config.reverify_max_hosts_at_once):
> > +            return random.sample(hosts,
> > +
> scheduler_config.reverify_max_hosts_at_once)
> > +        return sorted(hosts)
> > +
> > +
> >      def _reverify_dead_hosts(self):
> >          if not self._should_reverify_hosts_now():
> >              return
> > @@ -177,8 +186,11 @@
> >          if not hosts:
> >              return
> >
> > -        logging.info('Reverifying dead hosts %s'
> > -                     % ', '.join(host.hostname for host in hosts))
> > +        hosts = list(hosts)
> > +        total_hosts = len(hosts)
> > +        hosts = self._choose_subset_of_hosts_to_reverify(hosts)
> > +        logging.info('Reverifying dead hosts (%d of %d) %s',
> len(hosts),
> > +                     total_hosts, ', '.join(host.hostname for host in
> hosts))
> >          for host in hosts:
> >              models.SpecialTask.schedule_special_task(
> >                      host=host, task=models.SpecialTask.Task.VERIFY)
> > --- autotest/scheduler/monitor_db_cleanup_test.py     2010-01-27
> 22:21:03.000000000 -0800
> > +++ autotest/scheduler/monitor_db_cleanup_test.py     2010-10-21
> 13:16:19.000000000 -0700
> > @@ -5,7 +5,7 @@
> >  from autotest_lib.frontend import setup_django_environment
> >  from autotest_lib.database import database_connection
> >  from autotest_lib.frontend.afe import frontend_test_utils, models
> > -from autotest_lib.scheduler import monitor_db_cleanup
> > +from autotest_lib.scheduler import monitor_db_cleanup, scheduler_config
> >  from autotest_lib.client.common_lib import host_protections
> >
> >  class UserCleanupTest(unittest.TestCase,
> frontend_test_utils.FrontendTestMixin):
> > @@ -23,6 +23,8 @@
> >
> >
> >      def test_reverify_dead_hosts(self):
> > +        # unlimited reverifies
> > +        self.god.stub_with(scheduler_config,
> 'reverify_max_hosts_at_once', 0)
> >          for i in (0, 1, 2):
> >              self.hosts[i].status = models.Host.Status.REPAIR_FAILED
> >              self.hosts[i].save()
> > @@ -43,5 +45,31 @@
> >          self.assertEquals(tasks[0].task, models.SpecialTask.Task.VERIFY)
> >
> >
> > +    def test_reverify_dead_hosts_limits(self):
> > +        # limit the number of reverifies
> > +        self.god.stub_with(scheduler_config,
> 'reverify_max_hosts_at_once', 2)
> > +        for i in (0, 1, 2, 3, 4, 5):
> > +            self.hosts[i].status = models.Host.Status.REPAIR_FAILED
> > +            self.hosts[i].save()
> > +
> > +        self.hosts[1].locked = True
> > +        self.hosts[1].save()
> > +
> > +        self.hosts[2].protection =
> host_protections.Protection.DO_NOT_VERIFY
> > +        self.hosts[2].save()
> > +
> > +        self.god.stub_with(self.cleanup, '_should_reverify_hosts_now',
> > +                           lambda : True)
> > +        self.cleanup._reverify_dead_hosts()
> > +
> > +        tasks = models.SpecialTask.objects.all()
> > +        # four hosts need reverifying but our max limit was set to 2
> > +        self.assertEquals(len(tasks), 2)
> > +        self.assertTrue(tasks[0].host.id in (1, 4, 5, 6))
> > +        self.assertTrue(tasks[1].host.id in (1, 4, 5, 6))
> > +        self.assertEquals(tasks[0].task, models.SpecialTask.Task.VERIFY)
> > +        self.assertEquals(tasks[1].task, models.SpecialTask.Task.VERIFY)
> > +
> > +
> >  if __name__ == '__main__':
> >      unittest.main()
> > --- autotest/scheduler/scheduler_config.py    2010-01-10
> 11:48:43.000000000 -0800
> > +++ autotest/scheduler/scheduler_config.py    2010-10-21
> 13:16:19.000000000 -0700
> > @@ -16,6 +16,7 @@
> >                'secs_to_wait_for_atomic_group_hosts':
> >                    'secs_to_wait_for_atomic_group_hosts',
> >                'reverify_period_minutes': 'reverify_period_minutes',
> > +              'reverify_max_hosts_at_once':
> 'reverify_max_hosts_at_once',
> >               }
> > _______________________________________________
> > Autotest mailing list
> > [email protected]
> > http://test.kernel.org/cgi-bin/mailman/listinfo/autotest
>
>
>
_______________________________________________
Autotest mailing list
[email protected]
http://test.kernel.org/cgi-bin/mailman/listinfo/autotest

Reply via email to