Commit 9dc63b3e authored by Shane Kerr's avatar Shane Kerr
Browse files

Added back-off when processes terminate too quickly.



git-svn-id: svn://bind10.isc.org/svn/bind10/trunk@1233 e5f2f494-b856-4b98-b285-d166d9295462
parent 8306380f
......@@ -8,7 +8,6 @@
- Force-stop a component
- Mechanism to wait for child to start before continuing
- Way to ask a child to die politely
- Back-off mechanism for restarting failed processes
- Start statistics daemon
- Statistics interaction (?)
- Use .spec file to define commands
......
......@@ -3,6 +3,7 @@
import sys; sys.path.append ('@@PYTHONPATH@@')
import os
import time
import random
"""\
This file implements the Boss of Bind (BoB, or bob) program.
......@@ -51,7 +52,7 @@ import isc.cc
import isc
# This is the version that gets displayed to the user.
__version__ = "v20100308"
__version__ = "v20100309"
# Nothing at all to do with the 1990-12-10 article here:
# http://www.subgenius.com/subg-digest/v2/0056.html
......@@ -70,7 +71,10 @@ a simple set of rules:
* If a process has been running for >=10 seconds, we restart it
right away.
* If a process was running for <10 seconds, we wait until 10 seconds
after it was started."""
after it was started.
To keep programs from getting into lockstep, the restart delay is drawn
from a normal distribution rather than being exactly 10 seconds."""
def __init__(self, restart_frequency=10.0):
self.restart_frequency = restart_frequency
......@@ -82,7 +86,9 @@ a simple set of rules:
if when is None:
when = time.time()
self.run_start_time = when
self.restart_time = when + self.restart_frequency
sigma = self.restart_frequency * 0.05
self.restart_time = when + random.normalvariate(self.restart_frequency,
sigma)
def set_run_stop_time(self, when=None):
if when is None:
......@@ -121,15 +127,15 @@ class ProcessInfo:
close_fds=True,
env=spawn_env,)
self.pid = self.process.pid
self.restart_schedule.set_run_start_time()
def __init__(self, name, args, env={}, dev_null_stdout=False):
self.name = name
self.args = args
self.env = env
self.dev_null_stdout = dev_null_stdout
self.restart_schedule = RestartSchedule()
self._spawn()
self.last_spawn_time = time.time()
# self.respawn
def respawn(self):
self._spawn()
......@@ -358,6 +364,7 @@ class BoB:
if pid == 0: break
if pid in self.processes:
proc_info = self.processes.pop(pid)
proc_info.restart_schedule.set_run_stop_time()
self.dead_processes[proc_info.pid] = proc_info
if self.verbose:
sys.stdout.write("Process %s (PID %d) died.\n" %
......@@ -427,26 +434,39 @@ class BoB:
def restart_processes(self):
"""Restart any dead processes."""
# XXX: this needs a back-off algorithm
next_restart = None
# if we're shutting down, then don't restart
if not self.runnable:
return
return next_restart
# otherwise look through each dead process and try to restart
still_dead = {}
now = time.time()
for proc_info in self.dead_processes.values():
if self.verbose:
sys.stdout.write("Resurrecting dead %s process...\n" %
proc_info.name)
try:
proc_info.respawn()
self.processes[proc_info.pid] = proc_info
if self.verbose:
sys.stdout.write("Resurrected %s (PID %d)\n" %
(proc_info.name, proc_info.pid))
except:
restart_time = proc_info.restart_schedule.get_restart_time(now)
if restart_time > now:
# if self.verbose:
# sys.stdout.write("Dead %s process waiting %.1f seconds "\
# "for resurrection\n" %
# (proc_info.name, (restart_time-now)))
if (next_restart is None) or (next_restart > restart_time):
next_restart = restart_time
still_dead[proc_info.pid] = proc_info
else:
if self.verbose:
sys.stdout.write("Resurrecting dead %s process...\n" %
proc_info.name)
try:
proc_info.respawn()
self.processes[proc_info.pid] = proc_info
if self.verbose:
sys.stdout.write("Resurrected %s (PID %d)\n" %
(proc_info.name, proc_info.pid))
except:
still_dead[proc_info.pid] = proc_info
# remember any processes that refuse to be resurrected
self.dead_processes = still_dead
# return the time when the next process is ready to be restarted
return next_restart
def reaper(signal_number, stack_frame):
"""A child process has died (SIGCHLD received)."""
......@@ -525,15 +545,18 @@ def main():
while boss_of_bind.runnable:
# clean up any processes that exited
boss_of_bind.reap_children()
boss_of_bind.restart_processes()
# XXX: get time for next restart for timeout
next_restart = boss_of_bind.restart_processes()
if next_restart is None:
wait_time = None
else:
wait_time = max(next_restart - time.time(), 0)
# select() can raise EINTR when a signal arrives,
# even if they are resumable, so we have to catch
# the exception
try:
(rlist, wlist, xlist) = select.select([wakeup_fd, ccs_fd], [], [])
(rlist, wlist, xlist) = select.select([wakeup_fd, ccs_fd], [], [],
wait_time)
except select.error as err:
if err.args[0] == errno.EINTR:
(rlist, wlist, xlist) = ([], [], [])
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment