Commit b85213cd authored by Michal 'vorner' Vaner's avatar Michal 'vorner' Vaner
Browse files

Merge branch 'trac213-incremental-restarts' into trac213-incremental

Conflicts:
	src/bin/bind10/bind10_src.py.in
parents 056a1342 4aa0057d
......@@ -113,27 +113,15 @@ old process was not shut down correctly, and needs to be killed, or
another instance of BIND10, with the same msgq domain socket, is
running, which needs to be stopped.
% BIND10_MSGQ_DAEMON_ENDED b10-msgq process died, shutting down
The message bus daemon has died. This is a fatal error, since it may
leave the system in an inconsistent state. BIND10 will now shut down.
% BIND10_MSGQ_DISAPPEARED msgq channel disappeared
While listening on the message bus channel for messages, it suddenly
disappeared. The msgq daemon may have died. This might lead to an
inconsistent state of the system, and BIND 10 will now shut down.
% BIND10_PROCESS_ENDED_NO_EXIT_STATUS process %1 (PID %2) died: exit status not available
The given process ended unexpectedly, but no exit status is
available. See BIND10_PROCESS_ENDED_WITH_EXIT_STATUS for a longer
description.
% BIND10_PROCESS_ENDED_WITH_EXIT_STATUS process %1 (PID %2) terminated, exit status = %3
The given process ended unexpectedly with the given exit status.
Depending on which module it was, it may simply be restarted, or it
may be a problem that will cause the boss module to shut down too.
The latter happens if it was the message bus daemon, which, if it has
died suddenly, may leave the system in an inconsistent state. BIND10
will also shut down now if it has been run with --brittle.
% BIND10_PROCESS_ENDED process %2 of %1 ended with status %3
This indicates that a previously started process has terminated. The process id
and the component owning the process are indicated, as well as the exit code.
This doesn't distinguish whether the process was supposed to terminate or not.
% BIND10_READING_BOSS_CONFIGURATION reading boss configuration
The boss process is starting up, and will now process the initial
......@@ -187,11 +175,6 @@ which failed is unknown (not one of 'S' for socket or 'B' for bind).
The boss requested a socket from the creator, but the answer is unknown. This
looks like a programmer error.
% BIND10_SOCKCREATOR_CRASHED the socket creator crashed
The socket creator terminated unexpectedly. It is not possible to restart it
(because the boss already gave up root privileges), so the system is going
to terminate.
% BIND10_SOCKCREATOR_EOF eof while expecting data from socket creator
There should be more data from the socket creator, but it closed the socket.
It probably crashed.
......
......@@ -247,12 +247,16 @@ class BoB:
self.cfg_start_dhcp6 = False
self.cfg_start_dhcp4 = False
self.curproc = None
# XXX: Not used now, waits for reintroduction of restarts.
self.dead_processes = {}
self.msgq_socket_file = msgq_socket_file
self.nocache = nocache
self.component_config = {}
self.processes = {}
self.expected_shutdowns = {}
# At some time in the future, a single component may have
# multiple processes. If that happens, the name "components" may be
# inappropriate. But as the code probably isn't completely ready
# for it, we leave it at components for now.
self.components = {}
self.runnable = False
self.uid = setuid
self.username = username
......@@ -262,7 +266,6 @@ class BoB:
self.cmdctl_port = cmdctl_port
self.brittle = brittle
self.wait_time = wait_time
self.sockcreator = None
self._component_configurator = isc.bind10.component.Configurator(self,
isc.bind10.special_component.get_specials())
# The priorities here make them start in the correct order. First
......@@ -355,11 +358,11 @@ class BoB:
return answer
def get_processes(self):
pids = list(self.processes.keys())
pids = list(self.components.keys())
pids.sort()
process_list = [ ]
for pid in pids:
process_list.append([pid, self.processes[pid].name])
process_list.append([pid, self.components[pid].name()])
return process_list
def _get_stats_data(self):
......@@ -408,7 +411,7 @@ class BoB:
"Unknown command")
return answer
def kill_started_processes(self):
def kill_started_components(self):
"""
Called as part of the exception handling when a process fails to
start, this runs through the list of started processes, killing
......@@ -416,12 +419,10 @@ class BoB:
"""
logger.info(BIND10_KILLING_ALL_PROCESSES)
self.stop_creator(True)
for pid in self.processes:
logger.info(BIND10_KILL_PROCESS, self.processes[pid].name)
self.processes[pid].process.kill()
self.processes = {}
for pid in self.components:
logger.info(BIND10_KILL_PROCESS, self.components[pid].name())
self.components[pid].kill(True)
self.components = {}
def read_bind10_config(self):
"""
......@@ -594,26 +595,18 @@ class BoB:
self.log_starting(name, port, address)
newproc = ProcessInfo(name, args, c_channel_env)
newproc.spawn()
# This is now done in register_process()
#self.processes[newproc.pid] = newproc
self.log_started(newproc.pid)
return newproc
def register_process(self, pid, info):
def register_process(self, pid, component):
"""
Put another process into boss to watch over it. When the process
dies, the info.failed() is called with the exit code.
dies, the component.failed() is called with the exit code.
It is expected that the component is an isc.bind10.component.BaseComponent
subclass (or anything having the same interface).
"""
if '_procinfo' in dir(info):
# FIXME: This is temporary and the interface of the component
# doesn't guarantee the existence.
self.processes[pid] = info._procinfo
else:
# XXX: a short term hack. This is the sockcreator.
self.sockcreator = info._SockCreator__creator
self.components[pid] = component
def start_simple(self, name):
"""
......@@ -717,10 +710,10 @@ class BoB:
return self.start_process("b10-xfrin", args, c_channel_env)
def start_all_processes(self):
def start_all_components(self):
"""
Starts up all the processes. Any exception generated during the
starting of the processes is handled by the caller.
Starts up all the components. Any exception generated during the
starting of the components is handled by the caller.
"""
# Start the real core (sockcreator, msgq, cfgmgr)
self._component_configurator.startup(self.__core_components)
......@@ -735,7 +728,7 @@ class BoB:
# configuration may override the "-v" switch set on the command line.
self.read_bind10_config()
# Continue starting the processes. The authoritative server (if
# Continue starting the components. The authoritative server (if
# selected):
component_config = {}
if self.cfg_start_auth:
......@@ -766,7 +759,7 @@ class BoB:
'address': 'Zonemgr' }
self.__propagate_component_config(component_config)
# ... and finally start the remaining processes
# ... and finally start the remaining components
component_config['b10-stats'] = { 'kind': 'dispensable',
'address': 'Stats' }
component_config['b10-stats-httpd'] = { 'kind': 'dispensable',
......@@ -804,13 +797,13 @@ class BoB:
# this is the case we want, where the msgq is not running
pass
# Start all processes. If any one fails to start, kill all started
# processes and exit with an error indication.
# Start all components. If any one fails to start, kill all started
# components and exit with an error indication.
try:
self.c_channel_env = c_channel_env
self.start_all_processes()
self.start_all_components()
except Exception as e:
self.kill_started_processes()
self.kill_started_components()
return "Unable to start " + self.curproc + ": " + str(e)
# Started successfully
......@@ -824,10 +817,6 @@ class BoB:
(in logs, etc), the recipient is the address on msgq.
"""
logger.info(BIND10_STOP_PROCESS, process)
# TODO: Some timeout to solve processes that don't want to die would
# help. We can even store it in the dict, it is used only as a set
self.expected_shutdowns[process] = 1
# Ask the process to die willingly
self.cc_session.group_sendmsg({'command': ['shutdown']}, recipient,
recipient)
......@@ -879,27 +868,26 @@ class BoB:
time.sleep(1)
self.reap_children()
# next try sending a SIGTERM
processes_to_stop = list(self.processes.values())
for proc_info in processes_to_stop:
logger.info(BIND10_SEND_SIGTERM, proc_info.name,
proc_info.pid)
components_to_stop = list(self.components.values())
for component in components_to_stop:
logger.info(BIND10_SEND_SIGTERM, component.name(), component.pid())
try:
proc_info.process.terminate()
component.kill()
except OSError:
# ignore these (usually ESRCH because the child
# finally exited)
pass
# finally, send SIGKILL (unmaskable termination) until everybody dies
while self.processes:
while self.components:
# XXX: some delay probably useful... how much is uncertain
time.sleep(0.1)
self.reap_children()
processes_to_stop = list(self.processes.values())
for proc_info in processes_to_stop:
logger.info(BIND10_SEND_SIGKILL, proc_info.name,
proc_info.pid)
components_to_stop = list(self.components.values())
for component in components_to_stop:
logger.info(BIND10_SEND_SIGKILL, component.name(),
component.pid())
try:
proc_info.process.kill()
component.kill(True)
except OSError:
# ignore these (usually ESRCH because the child
# finally exited)
......@@ -921,40 +909,16 @@ class BoB:
# XXX: should be impossible to get any other error here
raise
if pid == 0: break
if self.sockcreator is not None and self.sockcreator.pid() == pid:
# This is the socket creator, started and terminated
# differently. This can't be restarted.
if self.runnable:
logger.fatal(BIND10_SOCKCREATOR_CRASHED)
self.sockcreator = None
self.runnable = False
elif pid in self.processes:
# One of the processes we know about. Get information on it.
proc_info = self.processes.pop(pid)
proc_info.restart_schedule.set_run_stop_time()
self.dead_processes[proc_info.pid] = proc_info
# Write out message, but only if in the running state:
# During startup and shutdown, these messages are handled
# elsewhere.
if self.runnable:
if exit_status is None:
logger.warn(BIND10_PROCESS_ENDED_NO_EXIT_STATUS,
proc_info.name, proc_info.pid)
else:
logger.warn(BIND10_PROCESS_ENDED_WITH_EXIT_STATUS,
proc_info.name, proc_info.pid,
exit_status)
# Was it a special process?
if proc_info.name == "b10-msgq":
logger.fatal(BIND10_MSGQ_DAEMON_ENDED)
self.runnable = False
# If we're in 'brittle' mode, we want to shutdown after
# any process dies.
if self.brittle:
self.runnable = False
if pid in self.components:
# One of the components we know about. Get information on it.
component = self.components.pop(pid)
logger.info(BIND10_PROCESS_ENDED, component.name(), pid,
exit_status)
if component.running() and self.runnable:
# Tell it that it failed. But only if it matters (we are
# not shutting down and the component considers itself
# to be running).
component.failed(exit_status);
else:
logger.info(BIND10_UNKNOWN_CHILD_PROCESS_ENDED, pid)
......@@ -968,7 +932,16 @@ class BoB:
The values returned can be safely passed into select() as the
timeout value.
"""
# TODO: This is an artefact of the previous way of handling processes. The
# restart queue is currently empty at all times, so this returns None
# every time it is called (though it is a relic that is obviously wrong,
# it is still called and it does no harm).
#
# It is preserved for archeological reasons until we reintroduce
# the delayed restarts; most of it might be useful then (or, if it is
# found useless, it can be removed).
next_restart = None
# if we're shutting down, then don't restart
if not self.runnable:
......@@ -977,10 +950,6 @@ class BoB:
still_dead = {}
now = time.time()
for proc_info in self.dead_processes.values():
if proc_info.name in self.expected_shutdowns:
# We don't restart, we wanted it to die
del self.expected_shutdowns[proc_info.name]
continue
restart_time = proc_info.restart_schedule.get_restart_time(now)
if restart_time > now:
if (next_restart is None) or (next_restart > restart_time):
......@@ -990,7 +959,7 @@ class BoB:
logger.info(BIND10_RESURRECTING_PROCESS, proc_info.name)
try:
proc_info.respawn()
self.processes[proc_info.pid] = proc_info
self.components[proc_info.pid] = proc_info
logger.info(BIND10_RESURRECTED_PROCESS, proc_info.name, proc_info.pid)
except:
still_dead[proc_info.pid] = proc_info
......@@ -1182,6 +1151,10 @@ def main():
while boss_of_bind.runnable:
# clean up any processes that exited
boss_of_bind.reap_children()
# XXX: As we don't put anything into the processes to be restarted,
# this is really a complicated NOP. But we will try to reintroduce
# delayed restarts, so it stays here for now, until we find out if
# it's useful.
next_restart = boss_of_bind.restart_processes()
if next_restart is None:
wait_time = None
......
......@@ -104,7 +104,7 @@ class TestBoB(unittest.TestCase):
self.assertEqual(bob.msgq_socket_file, None)
self.assertEqual(bob.cc_session, None)
self.assertEqual(bob.ccs, None)
self.assertEqual(bob.processes, {})
self.assertEqual(bob.components, {})
self.assertEqual(bob.dead_processes, {})
self.assertEqual(bob.runnable, False)
self.assertEqual(bob.uid, None)
......@@ -122,7 +122,7 @@ class TestBoB(unittest.TestCase):
self.assertEqual(bob.msgq_socket_file, "alt_socket_file")
self.assertEqual(bob.cc_session, None)
self.assertEqual(bob.ccs, None)
self.assertEqual(bob.processes, {})
self.assertEqual(bob.components, {})
self.assertEqual(bob.dead_processes, {})
self.assertEqual(bob.runnable, False)
self.assertEqual(bob.uid, None)
......@@ -221,7 +221,7 @@ class MockBob(BoB):
self.dhcp6 = False
self.dhcp4 = False
self.c_channel_env = {}
self.processes = { }
self.components = { }
self.creator = False
class MockSockCreator(isc.bind10.component.Component):
......@@ -351,58 +351,58 @@ class MockBob(BoB):
# in case he forgets to update the tests.
def stop_msgq(self):
if self.msgq:
del self.processes[2]
del self.components[2]
self.msgq = False
def stop_cfgmgr(self):
if self.cfgmgr:
del self.processes[3]
del self.components[3]
self.cfgmgr = False
def stop_auth(self):
if self.auth:
del self.processes[5]
del self.components[5]
self.auth = False
def stop_resolver(self):
if self.resolver:
del self.processes[6]
del self.components[6]
self.resolver = False
def stop_xfrout(self):
if self.xfrout:
del self.processes[7]
del self.components[7]
self.xfrout = False
def stop_xfrin(self):
if self.xfrin:
del self.processes[8]
del self.components[8]
self.xfrin = False
def stop_zonemgr(self):
if self.zonemgr:
del self.processes[9]
del self.components[9]
self.zonemgr = False
def stop_stats(self):
if self.stats:
del self.processes[10]
del self.components[10]
self.stats = False
def stop_stats_httpd(self):
if self.stats_httpd:
del self.processes[11]
del self.components[11]
self.stats_httpd = False
def stop_cmdctl(self):
if self.cmdctl:
del self.processes[12]
del self.components[12]
self.cmdctl = False
class TestStartStopProcessesBob(unittest.TestCase):
"""
Check that the start_all_processes method starts the right combination
of processes and that the right processes are started and stopped
Check that the start_all_components method starts the right combination
of components and that the right components are started and stopped
according to changes in configuration.
"""
def check_environment_unchanged(self):
......@@ -436,7 +436,7 @@ class TestStartStopProcessesBob(unittest.TestCase):
def check_started_none(self, bob):
"""
Check that the situation is according to configuration where no servers
should be started. Some processes still need to be running.
should be started. Some components still need to be running.
"""
self.check_started(bob, True, False, False)
self.check_environment_unchanged()
......@@ -451,14 +451,14 @@ class TestStartStopProcessesBob(unittest.TestCase):
def check_started_auth(self, bob):
"""
Check the set of processes needed to run auth only is started.
Check the set of components needed to run auth only is started.
"""
self.check_started(bob, True, True, False)
self.check_environment_unchanged()
def check_started_resolver(self, bob):
"""
Check the set of processes needed to run resolver only is started.
Check the set of components needed to run resolver only is started.
"""
self.check_started(bob, True, False, True)
self.check_environment_unchanged()
......@@ -467,14 +467,8 @@ class TestStartStopProcessesBob(unittest.TestCase):
"""
Check if proper combinations of DHCPv4 and DHCPv6 can be started
"""
v4found = 0
v6found = 0
for pid in bob.processes:
if (bob.processes[pid].name == "b10-dhcp4"):
v4found += 1
if (bob.processes[pid].name == "b10-dhcp6"):
v6found += 1
v4found = 'b10-dhcp4' in bob.component_config
v6found = 'b10-dhcp6' in bob.component_config
# there should be exactly one DHCPv4 daemon (if v4==True)
# there should be exactly one DHCPv6 daemon (if v6==True)
......@@ -482,65 +476,65 @@ class TestStartStopProcessesBob(unittest.TestCase):
self.assertEqual(v6==True, v6found==1)
self.check_environment_unchanged()
# Checks the processes started when starting neither auth nor resolver
# Checks the components started when starting neither auth nor resolver
# is specified.
def test_start_none(self):
# Create BoB and ensure correct initialization
bob = MockBob()
self.check_preconditions(bob)
# Start processes and check what was started
# Start components and check what was started
bob.cfg_start_auth = False
bob.cfg_start_resolver = False
bob.start_all_processes()
bob.start_all_components()
self.check_started_none(bob)
# Checks the processes started when starting only the auth process
# Checks the components started when starting only the auth process
def test_start_auth(self):
# Create BoB and ensure correct initialization
bob = MockBob()
self.check_preconditions(bob)
# Start processes and check what was started
# Start components and check what was started
bob.cfg_start_auth = True
bob.cfg_start_resolver = False
bob.start_all_processes()
bob.start_all_components()
self.check_started_auth(bob)
# Checks the processes started when starting only the resolver process
# Checks the components started when starting only the resolver process
def test_start_resolver(self):
# Create BoB and ensure correct initialization
bob = MockBob()
self.check_preconditions(bob)
# Start processes and check what was started
# Start components and check what was started
bob.cfg_start_auth = False
bob.cfg_start_resolver = True
bob.start_all_processes()
bob.start_all_components()
self.check_started_resolver(bob)
# Checks the processes started when starting both auth and resolver process
# Checks the components started when starting both auth and resolver process
def test_start_both(self):
# Create BoB and ensure correct initialization
bob = MockBob()
self.check_preconditions(bob)
# Start processes and check what was started
# Start components and check what was started
bob.cfg_start_auth = True
bob.cfg_start_resolver = True
bob.start_all_processes()
bob.start_all_components()
self.check_started_both(bob)
def test_config_start(self):
"""
Test that the configuration starts and stops processes according
Test that the configuration starts and stops components according
to configuration changes.
"""
......@@ -548,12 +542,12 @@ class TestStartStopProcessesBob(unittest.TestCase):
bob = MockBob()
self.check_preconditions(bob)
# Start processes (nothing much should be started, as in
# Start components (nothing much should be started, as in
# test_start_none)
bob.cfg_start_auth = False
bob.cfg_start_resolver = False
bob.start_all_processes()
bob.start_all_components()
bob.runnable = True
self.check_started_none(bob)
......@@ -613,11 +607,11 @@ class TestStartStopProcessesBob(unittest.TestCase):
bob = MockBob()
self.check_preconditions(bob)
# Start processes (both)
# Start components (both)
bob.cfg_start_auth = True
bob.cfg_start_resolver = True
bob.start_all_processes()
bob.start_all_components()
bob.runnable = True
self.check_started_both(bob)
......@@ -633,7 +627,7 @@ class TestStartStopProcessesBob(unittest.TestCase):
def test_config_not_started_early(self):
"""
Test that processes are not started by the config handler before
Test that components are not started by the config handler before
startup.
"""
bob = MockBob()
......@@ -647,7 +641,7 @@ class TestStartStopProcessesBob(unittest.TestCase):
bob.config_handler({'start_auth': True, 'start_resolver': True})
# Checks that DHCP (v4 and v6) processes are started when expected
# Checks that DHCP (v4 and v6) components are started when expected
def test_start_dhcp(self):
# Create BoB and ensure correct initialization
......@@ -661,7 +655,7 @@ class TestStartStopProcessesBob(unittest.TestCase):
# v4 and v6 disabled
bob.cfg_start_dhcp6 = False
bob.cfg_start_dhcp4 = False
bob.start_all_processes()
bob.start_all_components()
self.check_started_dhcp(bob, False, False)
def test_start_dhcp_v6only(self):
......@@ -676,7 +670,7 @@ class TestStartStopProcessesBob(unittest.TestCase):
# v6 only enabled
bob.cfg_start_dhcp6 = True
bob.cfg_start_dhcp4 = False
bob.start_all_processes()
bob.start_all_components()
self.check_started_dhcp(bob, False, True)
# uncomment when dhcpv4 becomes implemented
......@@ -690,6 +684,12 @@ class TestStartStopProcessesBob(unittest.TestCase):
#bob.cfg_start_dhcp4 = True
#self.check_started_dhcp(bob, True, True)
# Minimal stand-in for a component, used by the tests to register fake
# entries with the boss.
class MockComponent:
def __init__(self, name, pid):
# name and pid are exposed as zero-argument callables (not plain
# attributes), so they are read as obj.name() and obj.pid().
self.name = lambda: name
self.pid = lambda: pid
class TestBossCmd(unittest.TestCase):
def test_ping(self):
"""
......@@ -699,7 +699,7 @@ class TestBossCmd(unittest.TestCase):
answer = bob.command_handler("ping", None)
self.assertEqual(answer, {'result': [0, 'pong']})
def test_show_processes(self):
def test_show_processes_empty(self):
"""
Confirm getting a list of processes works.
"""
......@@ -707,23 +707,16 @@ class TestBossCmd(unittest.TestCase):
answer = bob.command_handler("show_processes", None)
self.assertEqual(answer, {'result': [0, []]})
def test_show_processes_started(self):
def test_show_processes(self):
"""
Confirm getting a list of processes works.
"""
bob = MockBob()
bob.start_all_processes()
bob.register_process(1, MockComponent('first', 1))
bob.register_process(2, MockComponent('second', 2))
answer = bob.command_handler("show_processes", None)
processes = [[1, 'b10-sockcreator'],
[2, 'b10-msgq'],
[3, 'b10-cfgmgr'],
[5, 'b10-auth'],
[7, 'b10-xfrout'],
[8, 'b10-xfrin'],
[9, 'b10-zonemgr'],
[10, 'b10-stats'],
[11, 'b10-stats-httpd'],
[12, 'b10-cmdctl']]