Commit 4c54d523 authored by Shane Kerr
Browse files

Add class to track process information.

Restart killed processes.



git-svn-id: svn://bind10.isc.org/svn/bind10/branches/f2f200910@228 e5f2f494-b856-4b98-b285-d166d9295462
parent 0be9ea50
......@@ -25,16 +25,40 @@ import sys
import re
import errno
import time
import select
from optparse import OptionParser, OptionValueError
import ISC.CC
# This is the version that gets displayed to the user.
__version__ = "v20091028 (Paving the DNS Parking Lot)"
__version__ = "v20091030 (Paving the DNS Parking Lot)"
# Nothing at all to do with the 1990-12-10 article here:
# http://www.subgenius.com/subg-digest/v2/0056.html
class ProcessInfo:
    """Information about a child process, with the ability to (re)start it.

    Attributes:
        name: label for the process (used by callers in log messages).
        args: program name or argument list handed to subprocess.Popen.
        env: environment mapping given to the child process.
        process: the subprocess.Popen object of the most recent spawn.
        pid: the PID of the most recent spawn.
    """

    # Sink for the children's stdout/stderr; opened once and shared by
    # every instance of the class.
    dev_null = open("/dev/null", "w")

    def _spawn(self):
        """Start the process and record its Popen object and PID."""
        self.process = subprocess.Popen(self.args,
                                        stdin=subprocess.PIPE,
                                        stdout=self.dev_null,
                                        stderr=self.dev_null,
                                        close_fds=True,
                                        env=self.env)
        self.pid = self.process.pid

    def __init__(self, name, args, env=None):
        """Remember how to run the process, then start it.

        name -- label for the process
        args -- program name or argument list for subprocess.Popen
        env  -- environment mapping for the child; when omitted the
                child is started with an empty environment

        Any exception raised by subprocess.Popen propagates to the caller.
        """
        self.name = name
        self.args = args
        # Keep a private copy: a mutable default argument would be shared
        # by every instance, and a caller-owned dict could change between
        # the initial spawn and a later respawn().
        self.env = {} if env is None else dict(env)
        self._spawn()

    def respawn(self):
        """Start the process again with the same arguments and environment."""
        self._spawn()
class BoB:
"""Boss of BIND class."""
def __init__(self, c_channel_port=9912, verbose=False):
......@@ -47,11 +71,10 @@ class BoB:
"""
self.verbose = True
self.c_channel_port = c_channel_port
self.cc_process = None
self.cc_session = None
self.processes = {}
self.dead_processes = {}
self.component_processes = {}
self.runnable = False
def startup(self):
"""Start the BoB instance.
......@@ -59,23 +82,17 @@ class BoB:
Returns None if successful, otherwise a string describing the
problem.
"""
dev_null = open("/dev/null", "w")
# start the c-channel daemon
if self.verbose:
sys.stdout.write("Starting msgq using port %d\n" % self.c_channel_port)
c_channel_env = { "ISC_MSGQ_PORT": str(self.c_channel_port), }
try:
c_channel = subprocess.Popen("msgq",
stdin=subprocess.PIPE,
stdout=dev_null,
stderr=dev_null,
close_fds=True,
env=c_channel_env,)
c_channel = ProcessInfo("msgq", "msgq", c_channel_env)
except:
return "Unable to start msgq"
self.processes[c_channel.pid] = c_channel
if self.verbose:
sys.stdout.write("Started msgq with PID %d\n" % c_channel.pid)
sys.stdout.write("Started msgq (PID %d)\n" % c_channel.pid)
# now connect to the c-channel
cc_connect_start = time.time()
......@@ -95,42 +112,30 @@ class BoB:
if self.verbose:
sys.stdout.write("Starting bind-cfgd\n")
try:
bind_cfgd = subprocess.Popen("bind-cfgd",
stdin=dev_null,
stdout=dev_null,
stderr=dev_null,
close_fds=True,
env={},)
bind_cfgd = ProcessInfo("bind-cfgd", "bind-cfgd")
except:
c_channel.kill()
c_channel.process.kill()
return "Unable to start bind-cfgd"
self.processes[bind_cfgd.pid] = bind_cfgd
if self.verbose:
sys.stdout.write("Started bind-cfgd with PID %d\n" % bind_cfgd.pid)
sys.stdout.write("Started bind-cfgd (PID %d)\n" % bind_cfgd.pid)
# start the parking lot
# XXX: this must be read from the configuration manager in the future
# XXX: we hardcode port 5300
if self.verbose:
sys.stdout.write("Starting parkinglot\n")
sys.stdout.write("Starting parkinglot on port 5300\n")
try:
parkinglot = subprocess.Popen(["parkinglot", "-p", "5300",],
stdin=dev_null,
stdout=dev_null,
stderr=dev_null,
close_fds=True,
env={},)
parkinglot = ProcessInfo("parkinglot", ["parkinglot", "-p", "5300"])
except:
c_channel.kill()
bind_cfgd.kill()
return "Unable to start parkinglot"
self.processes[parkinglot.pid] = parkinglot
if self.verbose:
sys.stdout.write("Started parkinglot with PID %d\n" % parkinglot.pid)
sys.stdout.write("Started parkinglot (PID %d)\n" % parkinglot.pid)
# remember our super-important process
self.cc_process = c_channel
self.runnable = True
return None
def stop_all_processes(self):
......@@ -147,36 +152,37 @@ class BoB:
if self.verbose:
sys.stdout.write("Stopping the server.\n")
# first try using the BIND 10 request to stop
if self.cc_session:
try:
self.stop_all_processes()
except:
pass
try:
self.stop_all_processes()
except:
pass
time.sleep(0.1) # XXX: some delay probably useful... how much is uncertain
# next try sending a SIGTERM
processes_to_stop = list(self.processes.values())
unstopped_processes = []
for process in processes_to_stop:
for proc_info in processes_to_stop:
if self.verbose:
sys.stdout.write("Sending SIGTERM to process %d.\n" % process.pid)
sys.stdout.write("Sending SIGTERM to %s (PID %d).\n" %
(proc_info.name, proc_info.pid))
try:
process.terminate()
proc_info.process.terminate()
except OSError as o:
# ignore these (usually ESRCH because the child
# finally exited)
pass
time.sleep(0.1) # XXX: some delay probably useful... how much is uncertain
for process in processes_to_stop:
(pid, exit_status) = os.waitpid(process.pid, os.WNOHANG)
for proc_info in processes_to_stop:
(pid, exit_status) = os.waitpid(proc_info.pid, os.WNOHANG)
if pid == 0:
unstopped_processes.append(process)
unstopped_processes.append(proc_info)
# finally, send a SIGKILL (unmaskable termination)
processes_to_stop = unstopped_processes
for process in processes_to_stop:
for proc_info in processes_to_stop:
if self.verbose:
sys.stdout.write("Sending SIGKILL to process %d.\n" % process.pid)
sys.stdout.write("Sending SIGKILL to %s (PID %d).\n" %
(proc_info.name, proc_info.pid))
try:
process.kill()
proc_info.process.kill()
except OSError as o:
# ignore these (usually ESRCH because the child
# finally exited)
......@@ -192,16 +198,41 @@ class BoB:
Returns True if everything is okay, or False if a fatal error
has been detected and the program should exit.
"""
process = self.processes.pop(pid)
self.dead_processes[process.pid] = process
proc_info = self.processes.pop(pid)
self.dead_processes[proc_info.pid] = proc_info
if self.verbose:
sys.stdout.write("Process %d died.\n" % pid)
if self.cc_process and (pid == self.cc_process.pid):
sys.stdout.write("Process %s (PID %d) died.\n" %
(proc_info.name, proc_info.pid))
if proc_info.name == "msgq":
if self.verbose:
sys.stdout.write("The msgq process died, shutting down.\n")
return False
else:
return True
self.runnable = False
def recv_and_process_cc_msg(self):
"""Receive and process the next message on the c-channel,
if any."""
routing, data = self.cc_session.group_recvmsg(False)
print("routing", routing)
print("data", data)
def restart_processes(self):
"""Restart any dead processes."""
# XXX: this needs a back-off algorithm
still_dead = {}
for proc_info in self.dead_processes.values():
if self.verbose:
sys.stdout.write("Resurrecting dead %s process...\n" %
proc_info.name)
try:
proc_info.respawn()
self.processes[proc_info.pid] = proc_info
if self.verbose:
sys.stdout.write("Resurrected %s (PID %d)\n" %
(proc_info.name, proc_info.pid))
except:
still_dead[proc_info.pid] = proc_info
# remember any processes that refuse to be resurrected
self.dead_processes = still_dead
if __name__ == "__main__":
def reaper(signal_number, stack_frame):
......@@ -214,13 +245,9 @@ if __name__ == "__main__":
if o.errno == errno.ECHILD: break
raise
if pid == 0: break
if not boss_of_bind.reap(pid, exit_status):
signal.signal(signal.SIGCHLD, signal.SIG_DFL)
boss_of_bind.shutdown()
sys.exit(0)
if boss_of_bind:
boss_of_bind.reap(pid, exit_status)
def get_signame(signal_number):
"""Return the symbolic name for a signal."""
for sig in dir(signal):
......@@ -232,14 +259,11 @@ if __name__ == "__main__":
# XXX: perhaps register atexit() function and invoke that instead
def fatal_signal(signal_number, stack_frame):
"""We need to exit (SIGINT or SIGTERM received)."""
global boss_of_bind
global options
if options.verbose:
sys.stdout.write("Received %s.\n" % get_signame(signal_number))
signal.signal(signal.SIGCHLD, signal.SIG_DFL)
if boss_of_bind:
boss_of_bind.shutdown()
sys.exit(0)
boss_of_bind.runnable = False
def check_port(option, opt_str, value, parser):
"""Function to ensure that the port we are passed is actually
......@@ -265,6 +289,10 @@ if __name__ == "__main__":
# http://code.google.com/p/procname/
# http://github.com/lericson/procname/
# Create wakeup pipe for signal handlers
wakeup_pipe = os.pipe()
signal.set_wakeup_fd(wakeup_pipe[1])
# Set signal handlers for catching child termination, as well
# as our own demise.
signal.signal(signal.SIGCHLD, reaper)
......@@ -279,6 +307,36 @@ if __name__ == "__main__":
sys.stderr.write("Error on startup: %s\n" % startup_result)
sys.exit(1)
while True:
time.sleep(1)
# In our main loop, we check for dead processes or messages
# on the c-channel.
event_poller = select.poll()
wakeup_fd = wakeup_pipe[0]
event_poller.register(wakeup_fd, select.POLLIN)
cc_fd = boss_of_bind.cc_session._socket.fileno()
event_poller.register(cc_fd, select.POLLIN)
while boss_of_bind.runnable:
# XXX: get time for next restart for poll
# poll() can raise EINTR when a signal arrives,
# even if they are resumable, so we have to catch
# the exception
try:
events = event_poller.poll()
except select.error as err:
if err.args[0] == errno.EINTR:
events = []
else:
sys.stderr.write("Error with poll(); %s\n" % err)
break
for (fd, event) in events:
if fd == cc_fd:
boss_of_bind.recv_and_process_cc_msg()
elif fd == wakeup_fd:
os.read(wakeup_fd, 32)
boss_of_bind.restart_processes()
# shutdown
signal.signal(signal.SIGCHLD, signal.SIG_DFL)
boss_of_bind.shutdown()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment