Commit 13f108b2 authored by Jelte Jansen's avatar Jelte Jansen
Browse files

Merge branch 'trac1342'

parents fc0a3168 295732d4
......@@ -92,51 +92,6 @@ VERSION = "bind10 20110223 (BIND 10 @PACKAGE_VERSION@)"
# This is for boot_time of Boss
_BASETIME = time.gmtime()
class RestartSchedule:
"""
Keeps state when restarting something (in this case, a process).
When a process dies unexpectedly, we need to restart it. However, if
it fails to restart for some reason, then we should not simply keep
restarting it at high speed.
A more sophisticated algorithm can be developed, but for now we choose
a simple set of rules:
* If a process was been running for >=10 seconds, we restart it
right away.
* If a process was running for <10 seconds, we wait until 10 seconds
after it was started.
To avoid programs getting into lockstep, we use a normal distribution
to avoid being restarted at exactly 10 seconds."""
def __init__(self, restart_frequency=10.0):
self.restart_frequency = restart_frequency
self.run_start_time = None
self.run_stop_time = None
self.restart_time = None
def set_run_start_time(self, when=None):
if when is None:
when = time.time()
self.run_start_time = when
sigma = self.restart_frequency * 0.05
self.restart_time = when + random.normalvariate(self.restart_frequency,
sigma)
def set_run_stop_time(self, when=None):
"""We don't actually do anything with stop time now, but it
might be useful for future algorithms."""
if when is None:
when = time.time()
self.run_stop_time = when
def get_restart_time(self, when=None):
if when is None:
when = time.time()
return max(when, self.restart_time)
class ProcessInfoError(Exception): pass
class ProcessInfo:
......@@ -151,7 +106,6 @@ class ProcessInfo:
self.env = env
self.dev_null_stdout = dev_null_stdout
self.dev_null_stderr = dev_null_stderr
self.restart_schedule = RestartSchedule()
self.uid = uid
self.username = username
self.process = None
......@@ -200,7 +154,6 @@ class ProcessInfo:
env=spawn_env,
preexec_fn=self._preexec_work)
self.pid = self.process.pid
self.restart_schedule.set_run_start_time()
# spawn() and respawn() are the same for now, but in the future they
# may have different functionality
......@@ -240,8 +193,6 @@ class BoB:
self.cc_session = None
self.ccs = None
self.curproc = None
# XXX: Not used now, waits for reintroduction of restarts.
self.dead_processes = {}
self.msgq_socket_file = msgq_socket_file
self.nocache = nocache
self.component_config = {}
......@@ -250,6 +201,9 @@ class BoB:
# inapropriate. But as the code isn't probably completely ready
# for it, we leave it at components for now.
self.components = {}
# Simply list of components that died and need to wait for a
# restart. Components manage their own restart schedule now
self.components_to_restart = []
self.runnable = False
self.uid = setuid
self.username = username
......@@ -821,7 +775,11 @@ class BoB:
# Tell it it failed. But only if it matters (we are
# not shutting down and the component considers itself
# to be running.
component.failed(exit_status);
component_restarted = component.failed(exit_status);
# if the process wants to be restarted, but not just yet,
# it returns False
if not component_restarted:
self.components_to_restart.append(component)
else:
logger.info(BIND10_UNKNOWN_CHILD_PROCESS_ENDED, pid)
......@@ -837,39 +795,22 @@ class BoB:
timeout value.
"""
# TODO: This is an artefact of previous way of handling processes. The
# restart queue is currently empty at all times, so this returns None
# every time it is called (thought is a relict that is obviously wrong,
# it is called and it doesn't hurt).
#
# It is preserved for archeological reasons for the time when we return
# the delayed restarts, most of it might be useful then (or, if it is
# found useless, removed).
next_restart = None
# if we're shutting down, then don't restart
if not self.runnable:
return 0
# otherwise look through each dead process and try to restart
still_dead = {}
still_dead = []
# keep track of the first time we need to check this queue again,
# if at all
next_restart_time = None
now = time.time()
for proc_info in self.dead_processes.values():
restart_time = proc_info.restart_schedule.get_restart_time(now)
if restart_time > now:
if (next_restart is None) or (next_restart > restart_time):
next_restart = restart_time
still_dead[proc_info.pid] = proc_info
else:
logger.info(BIND10_RESURRECTING_PROCESS, proc_info.name)
try:
proc_info.respawn()
self.components[proc_info.pid] = proc_info
logger.info(BIND10_RESURRECTED_PROCESS, proc_info.name, proc_info.pid)
except:
still_dead[proc_info.pid] = proc_info
# remember any processes that refuse to be resurrected
self.dead_processes = still_dead
# return the time when the next process is ready to be restarted
return next_restart
for component in self.components_to_restart:
if not component.restart(now):
still_dead.append(component)
if next_restart_time is None or\
next_restart_time > component.get_restart_time():
next_restart_time = component.get_restart_time()
self.components_to_restart = still_dead
return next_restart_time
# global variables, needed for signal handlers
options = None
......@@ -1052,10 +993,6 @@ def main():
while boss_of_bind.runnable:
# clean up any processes that exited
boss_of_bind.reap_children()
# XXX: As we don't put anything into the processes to be restarted,
# this is really a complicated NOP. But we will try to reintroduce
# delayed restarts, so it stays here for now, until we find out if
# it's useful.
next_restart = boss_of_bind.restart_processes()
if next_restart is None:
wait_time = None
......
......@@ -105,7 +105,6 @@ class TestBoB(unittest.TestCase):
self.assertEqual(bob.cc_session, None)
self.assertEqual(bob.ccs, None)
self.assertEqual(bob.components, {})
self.assertEqual(bob.dead_processes, {})
self.assertEqual(bob.runnable, False)
self.assertEqual(bob.uid, None)
self.assertEqual(bob.username, None)
......@@ -118,7 +117,6 @@ class TestBoB(unittest.TestCase):
self.assertEqual(bob.cc_session, None)
self.assertEqual(bob.ccs, None)
self.assertEqual(bob.components, {})
self.assertEqual(bob.dead_processes, {})
self.assertEqual(bob.runnable, False)
self.assertEqual(bob.uid, None)
self.assertEqual(bob.username, None)
......
......@@ -39,6 +39,7 @@ START_CMD = 'start'
STOP_CMD = 'stop'
STARTED_OK_TIME = 10
COMPONENT_RESTART_DELAY = 10
STATE_DEAD = 'dead'
STATE_STOPPED = 'stopped'
......@@ -99,11 +100,18 @@ class BaseComponent:
but it is vital part of the service (like auth server). If
it fails to start or crashes in less than 10s after the first
startup, the system is brought down. If it crashes later on,
it is restarted.
it is restarted (see below).
* 'dispensable' means the component should be running, but if it
doesn't start or crashes for some reason, the system simply tries
to restart it and keeps running.
For components that are restarted, the restarts are not always
immediate; if the component has run for more than
COMPONENT_RESTART_DELAY (10) seconds, they are restarted right
away. If the component has not run that long, the system waits
until that time has passed (since the last start) until the
component is restarted.
Note that the __init__ method of child class should have these
parameters:
......@@ -134,6 +142,7 @@ class BaseComponent:
self.__state = STATE_STOPPED
self._kind = kind
self._boss = boss
self._original_start_time = None
def start(self):
"""
......@@ -149,6 +158,9 @@ class BaseComponent:
logger.info(BIND10_COMPONENT_START, self.name())
self.__state = STATE_RUNNING
self.__start_time = time.time()
if self._original_start_time is None:
self._original_start_time = self.__start_time
self._restart_time = None
try:
self._start_internal()
except Exception as e:
......@@ -188,6 +200,11 @@ class BaseComponent:
The exit code is used for logging. It might be None.
It calls _failed_internal internally.
Returns True if the process was immediately restarted, returns
False is the process was not restarted, either because
it is considered a core or needed component, or because
the component is to be restarted later.
"""
logger.error(BIND10_COMPONENT_FAILED, self.name(), self.pid(),
exit_code if exit_code is not None else "unknown")
......@@ -199,14 +216,47 @@ class BaseComponent:
# (including it stopped really soon)
if self._kind == 'core' or \
(self._kind == 'needed' and time.time() - STARTED_OK_TIME <
self.__start_time):
self._original_start_time):
self.__state = STATE_DEAD
logger.fatal(BIND10_COMPONENT_UNSATISFIED, self.name())
self._boss.component_shutdown(1)
return False
# This means we want to restart
else:
logger.warn(BIND10_COMPONENT_RESTART, self.name())
# if the component was only running for a short time, don't
# restart right away, but set a time it wants to restarted,
# and return that it wants to be restarted later
self.set_restart_time()
return self.restart()
def set_restart_time(self):
"""Calculates and sets the time this component should be restarted.
Currently, it uses a very basic algorithm; start time +
RESTART_DELAY (10 seconds). This algorithm may be improved upon
in the future.
"""
self._restart_at = self.__start_time + COMPONENT_RESTART_DELAY
def get_restart_time(self):
"""Returns the time at which this component should be restarted."""
return self._restart_at
def restart(self, now = None):
"""Restarts the component if it has a restart_time and if the value
of the restart_time is smaller than 'now'.
If the parameter 'now' is given, its value will be used instead
of calling time.time().
Returns True if the component is restarted, False if not."""
if now is None:
now = time.time()
if self.get_restart_time() is not None and\
self.get_restart_time() < now:
self.start()
return True
else:
return False
def running(self):
"""
......
......@@ -221,11 +221,6 @@ class ComponentTests(BossUtils, unittest.TestCase):
"""
Check the component restarted successfully.
Currently, it is implemented as starting it again right away. This will
change, it will register itself into the restart schedule in boss. But
as the integration with boss is not clear yet, we don't know how
exactly that will happen.
Reset the self.__start_called to False before calling the function when
the component should fail.
"""
......@@ -237,6 +232,16 @@ class ComponentTests(BossUtils, unittest.TestCase):
# Check it can't be started again
self.assertRaises(ValueError, component.start)
def __check_not_restarted(self, component):
"""
Check the component has not (yet) restarted successfully.
"""
self.assertFalse(self._shutdown)
self.assertTrue(self.__start_called)
self.assertFalse(self.__stop_called)
self.assertTrue(self.__failed_called)
self.assertFalse(component.running())
def __do_start_stop(self, kind):
"""
This is a body of a test. It creates a component of given kind,
......@@ -296,7 +301,9 @@ class ComponentTests(BossUtils, unittest.TestCase):
component.start()
self.__check_started(component)
# Pretend the component died
component.failed(1)
restarted = component.failed(1)
# Since it is a core component, it should not be restarted
self.assertFalse(restarted)
# It should bring down the whole server
self.__check_dead(component)
......@@ -312,7 +319,9 @@ class ComponentTests(BossUtils, unittest.TestCase):
self.__check_started(component)
self._timeskip()
# Pretend the component died some time later
component.failed(1)
restarted = component.failed(1)
# Should not be restarted
self.assertFalse(restarted)
# Check the component is still dead
self.__check_dead(component)
......@@ -328,7 +337,9 @@ class ComponentTests(BossUtils, unittest.TestCase):
component.start()
self.__check_started(component)
# Make it fail right away.
component.failed(1)
restarted = component.failed(1)
# Should not have restarted
self.assertFalse(restarted)
self.__check_dead(component)
def test_start_fail_needed_later(self):
......@@ -344,37 +355,65 @@ class ComponentTests(BossUtils, unittest.TestCase):
# Make it fail later on
self.__start_called = False
self._timeskip()
component.failed(1)
restarted = component.failed(1)
# Should have restarted
self.assertTrue(restarted)
self.__check_restarted(component)
def test_start_fail_dispensable(self):
"""
Start and then fail a dispensable component. Should just get restarted.
Start and then fail a dispensable component. Should not get restarted.
"""
# Just ordinary startup
component = self.__create_component('needed')
component = self.__create_component('dispensable')
self.__check_startup(component)
component.start()
self.__check_started(component)
# Make it fail right away
self.__start_called = False
component.failed(1)
self.__check_restarted(component)
restarted = component.failed(1)
# Should signal that it did not restart
self.assertFalse(restarted)
self.__check_not_restarted(component)
def test_start_fail_dispensable(self):
def test_start_fail_dispensable_later(self):
"""
Start and then later on fail a dispensable component. Should just get
restarted.
"""
# Just ordinary startup
component = self.__create_component('needed')
component = self.__create_component('dispensable')
self.__check_startup(component)
component.start()
self.__check_started(component)
# Make it fail later on
self.__start_called = False
self._timeskip()
component.failed(1)
restarted = component.failed(1)
# should signal that it restarted
self.assertTrue(restarted)
# and check if it really did
self.__check_restarted(component)
def test_start_fail_dispensable_restart_later(self):
"""
Start and then fail a dispensable component, wait a bit and try to
restart. Should get restarted after the wait.
"""
# Just ordinary startup
component = self.__create_component('dispensable')
self.__check_startup(component)
component.start()
self.__check_started(component)
# Make it fail immediately
restarted = component.failed(1)
# should signal that it did not restart
self.assertFalse(restarted)
self.__check_not_restarted(component)
self._timeskip()
# try to restart again
restarted = component.restart()
# should signal that it restarted
self.assertTrue(restarted)
# and check if it really did
self.__check_restarted(component)
def test_fail_core(self):
......@@ -402,14 +441,56 @@ class ComponentTests(BossUtils, unittest.TestCase):
def test_fail_dispensable(self):
"""
Failure to start a dispensable component. The exception should get
through, but it should be restarted.
through, but it should be restarted after a time skip.
"""
component = self.__create_component('dispensable')
self.__check_startup(component)
component._start_internal = self.__fail_to_start
self.assertRaises(TestError, component.start)
# tell it to see if it must restart
restarted = component.restart()
# should not have restarted yet
self.assertFalse(restarted)
self.__check_not_restarted(component)
self._timeskip()
# tell it to see if it must restart and do so, with our vision of time
restarted = component.restart()
# should have restarted now
self.assertTrue(restarted)
self.__check_restarted(component)
def test_component_start_time(self):
"""
Check that original start time is set initially, and remains the same
after a restart, while the internal __start_time does change
"""
# Just ordinary startup
component = self.__create_component('dispensable')
self.__check_startup(component)
self.assertIsNone(component._original_start_time)
component.start()
self.__check_started(component)
self.assertIsNotNone(component._original_start_time)
self.assertIsNotNone(component._BaseComponent__start_time)
original_start_time = component._original_start_time
start_time = component._BaseComponent__start_time
# Not restarted yet, so they should be the same
self.assertEqual(original_start_time, start_time)
self._timeskip()
# Make it fail
restarted = component.failed(1)
# should signal that it restarted
self.assertTrue(restarted)
# and check if it really did
self.__check_restarted(component)
# original start time should not have changed
self.assertEqual(original_start_time, component._original_start_time)
# but actual start time should
self.assertNotEqual(start_time, component._BaseComponent__start_time)
def test_bad_kind(self):
"""
Test the component rejects nonsensical kinds. This includes bad
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment