bind10_src.py.in 48.2 KB
Newer Older
1
2
#!@PYTHON@

Naoki Kambe's avatar
Naoki Kambe committed
3
# Copyright (C) 2010,2011  Internet Systems Consortium.
4
5
6
7
8
9
10
11
12
13
14
15
16
17
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND INTERNET SYSTEMS CONSORTIUM
# DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL
# INTERNET SYSTEMS CONSORTIUM BE LIABLE FOR ANY SPECIAL, DIRECT,
# INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING
# FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
# NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
# WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

18
"""
This file implements the Boss of Bind (BoB, or bob) program.

Its purpose is to start up the BIND 10 system, and then manage the
processes, by starting and stopping processes, plus restarting
processes that exit.

To start the system, it first runs the c-channel program (msgq), then
connects to that. It then runs the configuration manager, and reads
its own configuration. Then it proceeds to starting other modules.

The Python subprocess module is used for starting processes, but
because this is not efficient for managing groups of processes,
SIGCHLD signals are caught and processed using the signal module.

Most of the logic is contained in the BoB class. However, since Python
requires that signal processing happen in the main thread, we do
signal handling outside of that class, in the code running for
__main__.
"""

39
40
41
import sys; sys.path.append ('@@PYTHONPATH@@')
import os

Jelte Jansen's avatar
Jelte Jansen committed
42
43
44
# If B10_FROM_SOURCE is set in the environment, we use data files
# from a directory relative to that, otherwise we use the ones
# installed on the system
45
46
if "B10_FROM_SOURCE" not in os.environ:
    # Installed layout.  The @...@ markers are substituted at build time;
    # presumably the substituted @datadir@ can still contain literal
    # ${datarootdir} / ${prefix} references, which are expanded here.
    PREFIX = "@prefix@"
    DATAROOTDIR = "@datarootdir@"
    SPECFILE_LOCATION = "@datadir@/@PACKAGE@/bob.spec"
    SPECFILE_LOCATION = SPECFILE_LOCATION.replace("${datarootdir}",
                                                  DATAROOTDIR)
    SPECFILE_LOCATION = SPECFILE_LOCATION.replace("${prefix}", PREFIX)
else:
    # Running from a source tree: use the spec file that lives there.
    SPECFILE_LOCATION = (os.environ["B10_FROM_SOURCE"] +
                         "/src/bin/bind10/bob.spec")
    
52
53
54
55
56
import subprocess
import signal
import re
import errno
import time
57
import select
58
import random
Evan Hunt's avatar
Evan Hunt committed
59
import socket
60
from optparse import OptionParser, OptionValueError
61
62
63
import io
import pwd
import posix
64
import copy
65

66
from bind10_config import LIBEXECPATH
67
import bind10_config
68
import isc.cc
69
import isc.util.process
Michal Vaner's avatar
Michal Vaner committed
70
import isc.net.parse
71
import isc.log
72
from isc.log_messages.bind10_messages import *
73
74
import isc.bind10.component
import isc.bind10.special_component
75
import isc.bind10.socket_cache
76
import libutil_io_python
77
import tempfile
78

79
80
isc.log.init("b10-boss")
logger = isc.log.Logger("boss")
81
82
83

# Pending system-wide debug level definitions, the ones we
# use here are hardcoded for now
84
85
DBG_PROCESS = logger.DBGLVL_TRACE_BASIC
DBG_COMMANDS = logger.DBGLVL_TRACE_DETAIL
Michal Vaner's avatar
Michal Vaner committed
86

87
# Messages sent over the unix domain socket to indicate if it is followed by a real socket
88
89
CREATOR_SOCKET_OK = b"1\n"
CREATOR_SOCKET_UNAVAILABLE = b"0\n"
90

91
92
93
94
# RCodes of known exceptions for the get_token command
CREATOR_SOCKET_ERROR = 2
CREATOR_SHARE_ERROR = 3

Michal Vaner's avatar
Michal Vaner committed
95
# Assign this process some longer name
96
isc.util.process.rename(sys.argv[0])
97
98

# This is the version that gets displayed to the user.
99
100
# The VERSION string consists of the module name, the module version
# number, and the overall BIND 10 version number (set in configure.ac).
101
VERSION = "bind10 20110223 (BIND 10 @PACKAGE_VERSION@)"
102

103
# This is for boot_time of Boss
104
_BASETIME = time.gmtime()
105

106
107
class ProcessInfoError(Exception):
    """Exception class associated with ProcessInfo."""

108
109
110
class ProcessInfo:
    """Information about a process started (or to be started) by the boss.

    Attributes:
      name            -- name of the process
      args            -- argument list handed to subprocess.Popen
      env             -- additional environment variables for the child
      dev_null_stdout -- if true, the child's stdout goes to os.devnull
      dev_null_stderr -- if true, the child's stderr goes to os.devnull
      process         -- the subprocess.Popen object (None until spawned)
      pid             -- PID of the child (None until spawned)
    """

    # One write handle on os.devnull, shared by all instances and used to
    # discard the output of children that asked for it.
    dev_null = open(os.devnull, "w")

    def __init__(self, name, args, env=None, dev_null_stdout=False,
                 dev_null_stderr=False):
        """Record how the process is to be started; nothing is run yet.

        If env is omitted, each instance gets its own empty dict.  (A
        literal {} default would be a single object shared by every
        instance created without an explicit env.)  A dict passed by the
        caller is stored as is, not copied.
        """
        self.name = name
        self.args = args
        self.env = {} if env is None else env
        self.dev_null_stdout = dev_null_stdout
        self.dev_null_stderr = dev_null_stderr
        self.process = None
        self.pid = None

    def _preexec_work(self):
        """Function used before running a program that needs to run as a
        different user."""
        # Put the child into a separate process group so it doesn't get
        # SIGINT signals on Ctrl-C (the boss will shut everything down by
        # other means).
        os.setpgrp()

    def _spawn(self):
        """Start the process and remember its Popen object and PID."""
        spawn_stdout = self.dev_null if self.dev_null_stdout else None
        spawn_stderr = self.dev_null if self.dev_null_stderr else None

        # The child gets the environment of the boss plus the variables
        # given on construction (self.env).  A plain dict is used on
        # purpose: a copied os.environ object is not guaranteed to be
        # detached from the real process environment, while a dict is.
        spawn_env = dict(os.environ)
        spawn_env.update(self.env)
        # Make the BIND 10 helper programs findable.  PATH may be unset;
        # in that case do not add an empty PATH element.
        inherited_path = spawn_env.get('PATH')
        if inherited_path:
            spawn_env['PATH'] = LIBEXECPATH + ':' + inherited_path
        else:
            spawn_env['PATH'] = LIBEXECPATH

        self.process = subprocess.Popen(self.args,
                                        stdin=subprocess.PIPE,
                                        stdout=spawn_stdout,
                                        stderr=spawn_stderr,
                                        close_fds=True,
                                        env=spawn_env,
                                        preexec_fn=self._preexec_work)
        self.pid = self.process.pid

    # spawn() and respawn() are the same for now, but in the future they
    # may have different functionality
    def spawn(self):
        """Start the process for the first time."""
        self._spawn()

    def respawn(self):
        """Start the process again."""
        self._spawn()

163
164
class CChannelConnectError(Exception):
    """Raised when the boss cannot connect to the c-channel (msgq)."""

165
166
class ProcessStartError(Exception):
    """Raised when a process the boss started does not come up."""

167
168
class BoB:
    """Boss of BIND class."""
169
    
170
    def __init__(self, msgq_socket_file=None, data_path=None,
                 config_filename=None, clear_config=False, nocache=False,
                 verbose=False, nokill=False, setuid=None, username=None,
                 cmdctl_port=None, wait_time=10):
        """
            Set up the state of the Boss of BIND.  No process is started
            here; that is the job of startup().

            msgq_socket_file is the UNIX domain socket file used to talk
            to the msgq process.

            data_path, config_filename and clear_config are turned into
            command line options of the configuration manager by
            start_cfgmgr().

            nocache adds the -n switch when b10-auth is started.

            verbose adds -v to the processes started by the boss and
            switches the boss logger to full debug output.

            nokill is stored and consulted by shutdown() before signals
            are sent to the components.

            setuid and username are stored as self.uid and self.username.

            cmdctl_port is handed to b10-cmdctl as its --port option.

            wait_time is the number of seconds start_cfgmgr() waits for
            the configuration manager to report that it is running.
        """
        self.cc_session = None
        self.ccs = None
        self.curproc = None
        self.msgq_socket_file = msgq_socket_file
        self.nocache = nocache
        self.component_config = {}
        # Running components, keyed by PID (see register_process()).
        # The name "components" may become inappropriate if a single
        # component ever consists of several processes, or if several
        # instances of one component are supported; how that would be
        # represented here is an open question.
        self.components = {}
        # Components that died and wait for a restart.  The components
        # manage their own restart schedule.
        self.components_to_restart = []
        self.runnable = False
        self.uid = setuid
        self.username = username
        self.verbose = verbose
        self.nokill = nokill
        self.data_path = data_path
        self.config_filename = config_filename
        self.clear_config = clear_config
        self.cmdctl_port = cmdctl_port
        self.wait_time = wait_time
        specials = isc.bind10.special_component.get_specials()
        self._component_configurator = \
            isc.bind10.component.Configurator(self, specials)
        # The priorities make the core components start in this order:
        # socket creator first, then the message queue, then the
        # configuration manager.
        core_priorities = (('sockcreator', 200), ('msgq', 199),
                           ('cfgmgr', 198))
        self.__core_components = {
            core_name: {
                'kind': 'core',
                'special': core_name,
                'priority': priority
            }
            for core_name, priority in core_priorities
        }
        self.__started = False
        self.exitcode = 0

        # -v on the command line means full debug logging.
        if self.verbose:
            logger.set_severity("DEBUG", 99)
        # The original comment says this is set in init_socket_srv.
        self._socket_path = None
        self._socket_cache = None
        self._tmpdir = None
        self._srv_socket = None
        self._unix_sockets = {}
254

255
256
257
258
259
260
261
262
263
264
265
    def __propagate_component_config(self, config):
        comps = dict(config)
        # Fill in the core components, so they stay alive
        for comp in self.__core_components:
            if comp in comps:
                raise Exception(comp + " is core component managed by " +
                                "bind10 boss, do not set it")
            comps[comp] = self.__core_components[comp]
        # Update the configuration
        self._component_configurator.reconfigure(comps)

266
    def config_handler(self, new_config):
267
        # If this is initial update, don't do anything now, leave it to startup
268
269
        if not self.runnable:
            return
270
271
        logger.debug(DBG_COMMANDS, BIND10_RECEIVED_NEW_CONFIGURATION,
                     new_config)
272
273
274
275
276
277
        try:
            if 'components' in new_config:
                self.__propagate_component_config(new_config['components'])
            return isc.config.ccsession.create_answer(0)
        except Exception as e:
            return isc.config.ccsession.create_answer(1, str(e))
278

Shane Kerr's avatar
Shane Kerr committed
279
    def get_processes(self):
280
        pids = list(self.components.keys())
Shane Kerr's avatar
Shane Kerr committed
281
282
283
        pids.sort()
        process_list = [ ]
        for pid in pids:
284
            process_list.append([pid, self.components[pid].name()])
Shane Kerr's avatar
Shane Kerr committed
285
286
        return process_list

287
    def _get_stats_data(self):
        """
            Build the statistics of the boss: its boot time, formatted
            from _BASETIME as an ISO 8601 style UTC timestamp.
        """
        boot_time = time.strftime('%Y-%m-%dT%H:%M:%SZ', _BASETIME)
        return {"owner": "Boss", "data": {'boot_time': boot_time}}
293

Jelte Jansen's avatar
Jelte Jansen committed
294
    def command_handler(self, command, args):
        """
            Execute a command sent to the boss and return the answer.

            command is the command name; anything that is not a string
            is answered with "bad command".  args carries the command
            arguments and may be None for commands sent without any.

            Known commands are shutdown, getstats, sendstats, ping,
            show_processes, get_socket and drop_socket; any other name is
            answered with "Unknown command".  The return value is always
            an answer built by isc.config.ccsession.create_answer().
        """
        logger.debug(DBG_COMMANDS, BIND10_RECEIVED_COMMAND, command)
        create_answer = isc.config.ccsession.create_answer
        if not isinstance(command, str):
            return create_answer(1, "bad command")

        if command == "shutdown":
            # The main loop notices this flag and performs the shutdown.
            self.runnable = False
            return create_answer(0)

        if command == "getstats":
            return create_answer(0, self._get_stats_data())

        if command == "sendstats":
            # send statistics data to the stats daemon immediately
            stats_data = self._get_stats_data()
            valid = self.ccs.get_module_spec().validate_statistics(
                True, stats_data["data"])
            if not valid:
                logger.fatal(BIND10_INVALID_STATISTICS_DATA)
                return create_answer(
                    1, "specified statistics data is invalid")
            cmd = isc.config.ccsession.create_command('set', stats_data)
            seq = self.cc_session.group_sendmsg(cmd, 'Stats')
            # Consume the answer, in case it becomes a orphan message.
            try:
                self.cc_session.group_recvmsg(False, seq)
            except isc.cc.session.SessionTimeout:
                pass
            return create_answer(0)

        if command == "ping":
            return create_answer(0, "pong")

        if command == "show_processes":
            return create_answer(0, self.get_processes())

        if command == "get_socket":
            return self._get_socket(args)

        if command == "drop_socket":
            # args is None when the command came without arguments; treat
            # that like a missing token instead of failing on the lookup.
            if args is None or "token" not in args:
                return create_answer(1, "Missing token parameter")
            try:
                self._socket_cache.drop_socket(args["token"])
                return create_answer(0)
            except Exception as e:
                return create_answer(1, str(e))

        return create_answer(1, "Unknown command")
344

345
    def kill_started_components(self):
        """
            Kill every component registered so far and forget them all.

            Used as part of the error handling when something fails to
            start: each component is killed with kill(True) and the
            table of components is then reset to empty.
        """
        logger.info(BIND10_KILLING_ALL_PROCESSES)

        for component in self.components.values():
            logger.info(BIND10_KILL_PROCESS, component.name())
            component.kill(True)
        self.components = {}
357

Michal 'vorner' Vaner's avatar
Michal 'vorner' Vaner committed
358
    def _read_bind10_config(self):
        """
            Read the configuration of the BoB module itself, i.e. the
            list of components that should be running, and apply it.

            This is kept as a method of its own for historical reasons
            and because the tests sometimes replace it.
        """
        logger.info(BIND10_READING_BOSS_CONFIGURATION)

        full_config = self.ccs.get_full_config()
        self.__propagate_component_config(full_config['components'])
372
373
374
375

    def log_starting(self, process, port = None, address = None):
        """
            Log a "Starting xxx" message in the same form for every
            process.

            The process name is stored in self.curproc, where it is used
            to tell which process failed if the start goes wrong, and in
            the "Started" message on success.  The message variant is
            chosen by the optional information given: address (logged
            together with port), port alone, or neither.
        """
        self.curproc = process
        if address is not None:
            logger.info(BIND10_STARTING_PROCESS_PORT_ADDRESS,
                        self.curproc, address, port)
        elif port is not None:
            logger.info(BIND10_STARTING_PROCESS_PORT, self.curproc,
                        port)
        else:
            logger.info(BIND10_STARTING_PROCESS, self.curproc)
395

396
397
398
399
400
401
    def log_started(self, pid = None):
        """
            Log a debug message saying that the process named by
            self.curproc has been started, including its PID if one is
            given.  As with log_starting(), this keeps the format
            consistent.
        """
        if pid is not None:
            logger.debug(DBG_PROCESS, BIND10_STARTED_PROCESS_PID, self.curproc, pid)
            return
        logger.debug(DBG_PROCESS, BIND10_STARTED_PROCESS, self.curproc)
406

407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
    def process_running(self, msg, who):
        """
            Some processes return a message to the Boss after they have
            started to indicate that they are running.  The form of the
            message is a dictionary with contents {"running:", "<process>"}.
            This method checks the passed message and returns True if the
            "who" process is contained in the message (so is presumably
            running).  It returns False for all other conditions and will
            log an error if appropriate.
        """
        if msg is not None:
            try:
                if msg["running"] == who:
                    return True
                else:
                    logger.error(BIND10_STARTUP_UNEXPECTED_MESSAGE, msg)
            except:
                logger.error(BIND10_STARTUP_UNRECOGNISED_MESSAGE, msg)
        
        return False

428
    # The next few methods start the individual processes of BIND-10.  They
    # are called via start_all_processes().  If any fail, an exception is
    # raised which is caught by the caller of start_all_processes(); this kills
    # processes started up to that point before terminating the program.

433
    def start_msgq(self):
        """
            Start b10-msgq, connect to the command channel and subscribe
            to the "Boss" group.

            Returns the ProcessInfo of the message queue.  Raises
            CChannelConnectError if no connection could be made within
            5 seconds.
        """
        self.log_starting("b10-msgq")
        # stdout of msgq is always discarded, stderr unless verbose.
        msgq = ProcessInfo("b10-msgq", ["b10-msgq"], self.c_channel_env,
                           True, not self.verbose)
        msgq.spawn()
        self.log_started(msgq.pid)

        # msgq needs a moment before it accepts connections, so keep
        # retrying for a while.
        first_attempt = time.time()
        while self.cc_session is None:
            waited = time.time() - first_attempt
            if waited > 5:
                raise CChannelConnectError("Unable to connect to c-channel after 5 seconds")

            try:
                self.cc_session = isc.cc.Session(self.msgq_socket_file)
            except isc.cc.session.SessionError:
                # not there yet, wait a short while and try again
                time.sleep(0.1)

        # The only messages expected on this channel are the ones
        # relating to process startup.
        self.cc_session.group_subscribe("Boss")

        return msgq

    def start_cfgmgr(self):
        """
            Start b10-cfgmgr and wait until it reports that it is running.

            The data path, config file name and clear-config settings of
            the boss are passed on as command line options when set.
            Returns the ProcessInfo of the configuration manager; raises
            ProcessStartError if it has not announced itself by the time
            self.wait_time is used up.
        """
        self.log_starting("b10-cfgmgr")
        cfgmgr_args = ["b10-cfgmgr"]
        if self.data_path is not None:
            cfgmgr_args.append("--data-path=" + self.data_path)
        if self.config_filename is not None:
            cfgmgr_args.append("--config-filename=" + self.config_filename)
        if self.clear_config:
            cfgmgr_args.append("--clear-config")
        cfgmgr = ProcessInfo("b10-cfgmgr", cfgmgr_args, self.c_channel_env)
        cfgmgr.spawn()
        self.log_started(cfgmgr.pid)

        # Nothing else can be initialized without the configuration
        # manager, so poll for its "running" message, once per second.
        seconds_left = self.wait_time
        msg, _ = self.cc_session.group_recvmsg()
        while seconds_left > 0 and not self.process_running(msg, "ConfigManager"):
            logger.debug(DBG_PROCESS, BIND10_WAIT_CFGMGR)
            time.sleep(1)
            seconds_left -= 1
            msg, _ = self.cc_session.group_recvmsg()

        if not self.process_running(msg, "ConfigManager"):
            raise ProcessStartError("Configuration manager process has not started")

        return cfgmgr

494
495
496
497
498
499
    def start_ccsession(self, c_channel_env):
        """
            Create and start the module CC session of the boss (self.ccs).

            c_channel_env is not used; it is only accepted so that all
            start_xxx methods can be called the same way.

            The CC session is not a process, hence log_starting() and
            log_started() are not used for the log messages here.
        """
        logger.info(BIND10_STARTING_CC)
        session = isc.config.ModuleCCSession(
            SPECFILE_LOCATION,
            self.config_handler,
            self.command_handler,
            socket_file = self.msgq_socket_file)
        self.ccs = session
        self.ccs.start()
        logger.debug(DBG_PROCESS, BIND10_STARTED_CC)
511
512
513
514
515
516
517
518
519
520
521
522
523

    # A couple of utility methods for starting processes...

    def start_process(self, name, args, c_channel_env, port=None, address=None):
        """
            Start the process described by name and args, with
            c_channel_env as its additional environment, and log the
            "starting" and "started" messages around it.

            port and address only appear in the log message.  Returns
            the ProcessInfo of the new process.
        """
        self.log_starting(name, port, address)
        started = ProcessInfo(name, args, c_channel_env)
        started.spawn()
        self.log_started(started.pid)
        return started
527

Michal 'vorner' Vaner's avatar
Michal 'vorner' Vaner committed
528
    def register_process(self, pid, component):
529
530
        """
        Put another process into boss to watch over it.  When the process
Michal 'vorner' Vaner's avatar
Michal 'vorner' Vaner committed
531
        dies, the component.failed() is called with the exit code.
532

533
534
        It is expected the info is a isc.bind10.component.BaseComponent
        subclass (or anything having the same interface).
535
        """
536
        self.components[pid] = component
537
538

    def start_simple(self, name):
        """
            Start a process that needs no special arguments.

            Most of the BIND-10 processes are started with the command:

                <process-name> [-v]

            ... where -v is appended if verbose is enabled.  The process
            is started through start_process() and its result returned.
        """
        command = [name] + (['-v'] if self.verbose else [])
        return self.start_process(name, command, self.c_channel_env)
556

557
558
559
560
561
562
    # The next few methods start up the rest of the BIND-10 processes.
    # Although many of these methods are little more than a call to
    # start_simple, they are retained (a) for testing reasons and (b) as a place
    # where modifications can be made if the process start-up sequence changes
    # for a given process.

563
    def start_auth(self):
        """
            Start the Authoritative server (b10-auth).

            -n is passed if nocache is set and -v if verbose is set.  A
            warning is logged when a uid is configured and the boss has
            already completed its startup.
        """
        if self.uid is not None and self.__started:
            logger.warn(BIND10_START_AS_NON_ROOT_AUTH)
        command = ['b10-auth']
        if self.nocache:
            command.append('-n')
        if self.verbose:
            command.append('-v')

        return self.start_process("b10-auth", command, self.c_channel_env)
577

578
    def start_resolver(self):
        """
            Start the Resolver (b10-resolver).

            The arguments are provisional; as with the auth daemon they
            should eventually be read from the configuration.  A warning
            is logged when a uid is configured and the boss has already
            completed its startup.
        """
        if self.uid is not None and self.__started:
            logger.warn(BIND10_START_AS_NON_ROOT_RESOLVER)
        self.curproc = "b10-resolver"
        # XXX: this must be read from the configuration manager in the future
        command = ['b10-resolver']
        if self.verbose:
            command.append('-v')

        return self.start_process("b10-resolver", command, self.c_channel_env)
Likun Zhang's avatar
Likun Zhang committed
594

595
596
597
598
599
600
601
602
603
604
605
    def start_cmdctl(self):
        """
            Start the command control process (b10-cmdctl).

            --port is passed if a cmdctl port was configured and -v if
            verbose is set.  The port is also given to start_process()
            for its log message.
        """
        command = ["b10-cmdctl"]
        port = self.cmdctl_port
        if port is not None:
            command.append("--port=" + str(port))
        if self.verbose:
            command.append("-v")
        return self.start_process("b10-cmdctl", command, self.c_channel_env,
                                  self.cmdctl_port)
606

607
    def start_all_components(self):
        """
            Bring up the core components, the CC session and then
            whatever the boss configuration asks for.  Exceptions are
            not handled here; that is left to the caller.
        """
        # The real core first (sockcreator, msgq, cfgmgr).
        core = self.__core_components
        self._component_configurator.startup(core)

        # The connection to msgq is not a process, so the configurator
        # does not handle it.
        self.start_ccsession(self.c_channel_env)

        # The parameters of Bob can only be read once the CC session is
        # running.  Note that the logging configuration may override the
        # "-v" switch set on the command line.
        self._read_bind10_config()

        # TODO: Return the dropping of privileges
625

626
627
628
    def startup(self):
        """
            Start the BoB instance.

            First checks that no msgq is already listening on the socket
            file, then starts all components.  If any of them fails to
            start, the components started so far are killed.

            Returns None if successful, otherwise a string describing the
            problem.
        """
        # Environment handed to the child processes so that they find
        # the message queue socket.
        c_channel_env = {}
        if self.msgq_socket_file is not None:
            c_channel_env["BIND10_MSGQ_SOCKET_FILE"] = self.msgq_socket_file
        logger.debug(DBG_PROCESS, BIND10_CHECK_MSGQ_ALREADY_RUNNING)
        # Try to connect to the c-channel daemon, to see if it is already
        # running.  Being able to connect is the error case here.
        try:
            self.cc_session = isc.cc.Session(self.msgq_socket_file)
            logger.fatal(BIND10_MSGQ_ALREADY_RUNNING)
            return "b10-msgq already running, or socket file not cleaned , cannot start"
        except isc.cc.session.SessionError:
            # this is the case we want, where the msgq is not running
            pass

        # Start all components.  If any one fails to start, kill all started
        # components and exit with an error indication.
        try:
            self.c_channel_env = c_channel_env
            self.start_all_components()
        except Exception as e:
            self.kill_started_components()
            # curproc is still None if the failure happened before any
            # log_starting() call; concatenating None would raise a
            # TypeError here and hide the real error.
            if self.curproc is None:
                failed = "unknown process"
            else:
                failed = str(self.curproc)
            return "Unable to start " + failed + ": " + str(e)

        # Started successfully
        self.runnable = True
        self.__started = True
        return None

662
    def stop_process(self, process, recipient, pid):
        """
        Ask a process to stop by sending it the 'shutdown' command.

        process is the name used in the logs, recipient is the address
        on msgq the command is sent to, and pid is the PID of the
        process, included in the command so that one of several
        processes of the same name can tell whether it is meant.
        """
        logger.info(BIND10_STOP_PROCESS, process)
        shutdown_cmd = isc.config.ccsession.create_command('shutdown',
                                                           {'pid': pid})
        self.cc_session.group_sendmsg(shutdown_cmd, recipient, recipient)
673

674
675
676
677
    def component_shutdown(self, exitcode=0):
        """
        Stop the Boss instance from a components' request. The exitcode
        indicates the desired exit code.
678

679
680
        If we did not start yet, it raises an exception, which is meant
        to propagate through the component and configurator to the startup
681
        routine and abort the startup immediately. If it is started up already,
682
        we just mark it so we terminate soon.
683

684
685
686
687
688
689
690
        It does set the exit code in both cases.
        """
        self.exitcode = exitcode
        if not self.__started:
            raise Exception("Component failed during startup");
        else:
            self.runnable = False
691
692
693

    def shutdown(self):
        """
        Stop the BoB instance.

        The steps, in order:
        * announce over the command channel that this module is stopping;
        * ask the component configurator to shut the components down,
          ignoring any failure of that request;
        * wait a second and collect the children that have exited;
        * unless self.nokill is set, send SIGTERM to every component that
          is still registered, then keep sending SIGKILL (reaping in
          between) until no component is left.

        The completion message is logged only on the path where sending
        signals is allowed.
        """
        logger.info(BIND10_SHUTDOWN)

        # Tell the rest of the system that this module is stopping.
        # Everything is stopped shortly anyway, so it is not strictly
        # needed; it is done because the boss is also 'just' a module.
        self.ccs.send_stopping()

        # First attempt: the BIND 10 request to stop. This is best effort
        # on purpose, whatever goes wrong here must not prevent the
        # signal-based fallback below.
        try:
            self._component_configurator.shutdown()
        except:
            pass

        # XXX: some delay probably useful... how much is uncertain.
        # It was raised from 0.5 to 1, but sometimes that is still not
        # enough.
        time.sleep(1)
        self.reap_children()

        if self.nokill:
            # We are prevented from sending TERM and KILL to the modules.
            return

        # Second attempt: SIGTERM for whoever is still around. Work on a
        # snapshot of the registered components.
        for component in list(self.components.values()):
            logger.info(BIND10_SEND_SIGTERM, component.name(),
                        component.pid())
            try:
                component.kill()
            except OSError:
                # usually ESRCH, because the child finally exited
                pass

        # Last resort: SIGKILL (unmaskable termination) until everybody
        # dies.
        while self.components:
            # XXX: some delay probably useful... how much is uncertain
            time.sleep(0.1)
            self.reap_children()
            for component in list(self.components.values()):
                logger.info(BIND10_SEND_SIGKILL, component.name(),
                            component.pid())
                try:
                    component.kill(True)
                except OSError:
                    # usually ESRCH, because the child finally exited
                    pass
        logger.info(BIND10_SHUTDOWN_COMPLETE)
741

Shane Kerr's avatar
Shane Kerr committed
742
743
744
    def _get_process_exit_status(self):
        return os.waitpid(-1, os.WNOHANG)

745
746
747
    def reap_children(self):
        """
        Collect every child process that has exited and note it for later
        handling.

        For a child that is one of our components, the component is
        removed from self.components. If the boss is still runnable and
        the component considers itself running, the component is told it
        failed; when it does not restart right away, it is queued in
        self.components_to_restart. An exited child we do not know is only
        logged.

        Any OSError other than ECHILD raised while polling is propagated.
        """
        while True:
            try:
                pid, exit_status = self._get_process_exit_status()
            except OSError as err:
                if err.errno != errno.ECHILD:
                    # XXX: should be impossible to get any other error here
                    raise
                # There are no child processes at all.
                return
            if pid == 0:
                # There are children, but none of them has exited.
                return

            if pid not in self.components:
                logger.info(BIND10_UNKNOWN_CHILD_PROCESS_ENDED, pid)
                continue

            # One of the components we know about.
            component = self.components.pop(pid)
            logger.info(BIND10_PROCESS_ENDED, component.name(), pid,
                        exit_status)
            if not (component.running() and self.runnable):
                # It does not matter: either the component does not
                # consider itself running or we are shutting down.
                continue
            # failed() returns False when the component wants to be
            # restarted, but not just yet.
            if not component.failed(exit_status):
                self.components_to_restart.append(component)
773
774

    def restart_processes(self):
        """
        Try to restart the components waiting in the restart queue.

        Every component in self.components_to_restart is offered a restart
        with the current time. Those that decline stay in the queue.

        Returns:
        * 0 if the server is shutting down (nothing is restarted then);
        * None if no component is left waiting;
        * otherwise the earliest restart time among the components still
          waiting.

        The values returned can be safely passed into select() as the
        timeout value.
        """
        if not self.runnable:
            return 0

        now = time.time()
        # The first time we need to look at the queue again, if at all.
        next_restart_time = None
        still_dead = []
        for component in self.components_to_restart:
            if component.restart(now):
                continue
            still_dead.append(component)
            restart_at = component.get_restart_time()
            if next_restart_time is None or restart_at < next_restart_time:
                next_restart_time = restart_at
        self.components_to_restart = still_dead

        return next_restart_time
802

803
804
805
806
807
    def _get_socket(self, args):
        """
        Implementation of the get_socket CC command.

        The parameters are taken from args ('address', 'port', 'protocol',
        'share_mode' and 'share_name'), the socket cache is asked for a
        token, and an answer is built to be sent back.

        Returns an answer created by isc.config.ccsession.create_answer:
        * code 0 with the token and the path of the socket on success;
        * code 1 with a message if a parameter is missing or anything
          else goes wrong (including an invalid parameter value);
        * CREATOR_SOCKET_ERROR or CREATOR_SHARE_ERROR with a message when
          the cache reports a socket or a sharing error.
        """
        create_answer = isc.config.ccsession.create_answer
        try:
            # Only the extraction of the parameters is guarded against
            # KeyError, so a KeyError from the cache is not reported as a
            # missing parameter.
            try:
                addr = isc.net.parse.addr_parse(args['address'])
                port = isc.net.parse.port_parse(args['port'])
                protocol = args['protocol']
                if protocol not in ('UDP', 'TCP'):
                    raise ValueError("Protocol must be either UDP or TCP")
                share_mode = args['share_mode']
                if share_mode not in ('ANY', 'SAMEAPP', 'NO'):
                    raise ValueError(
                        "Share mode must be one of ANY, SAMEAPP or NO")
                share_name = args['share_name']
            except KeyError as missing:
                return create_answer(1, "Missing parameter " + str(missing))

            # FIXME: This call contains blocking IPC. It is expected to be
            # short, but if it turns out to be problem, we'll need to do
            # something about it.
            token = self._socket_cache.get_token(protocol, addr, port,
                                                 share_mode, share_name)
            result = {
                'token': token,
                'path': self._socket_path
            }
            return create_answer(0, result)
        except isc.bind10.socket_cache.SocketError as e:
            return create_answer(CREATOR_SOCKET_ERROR, str(e))
        except isc.bind10.socket_cache.ShareError as e:
            return create_answer(CREATOR_SHARE_ERROR, str(e))
        except Exception as e:
            return create_answer(1, str(e))
843

844
845
846
847
    def socket_request_handler(self, token, unix_socket):
        """
        Serve one token that arrived over a unix domain socket.

        The socket identified by the token is looked up in the
        _socket_cache and sent back over unix_socket, preceded by
        CREATOR_SOCKET_OK. If anything fails, the failure is logged and
        CREATOR_SOCKET_UNAVAILABLE is sent instead.

        Parameters:
        token -- the token as received, in bytes.
        unix_socket -- the connection the token came from.
        """
        try:
            # The cache works with str tokens, we received bytes.
            token = str(token, 'ASCII')
            requester = unix_socket.fileno()
            fd = self._socket_cache.get_socket(token, requester)
            # FIXME: The two sends below are blocking in their nature. An
            # OS-level buffer is likely to be large enough to hold all
            # these data, but if it wasn't and the remote application got
            # stuck, we would have a problem. If such problems appear, we
            # should do something about it.
            unix_socket.sendall(CREATOR_SOCKET_OK)
            libutil_io_python.send_fd(unix_socket.fileno(), fd)
        except Exception as e:
            logger.info(BIND10_NO_SOCKET, token, e)
            unix_socket.sendall(CREATOR_SOCKET_UNAVAILABLE)
863
864
865
866
867
868
869

    def socket_consumer_dead(self, unix_socket):
        """
        Handle the closing of a unix_socket.

        Once the connection is gone, all sockets sent over it are to be
        considered closed, so the _socket_cache is told to drop the
        application identified by the file descriptor of unix_socket.
        """
        consumer = unix_socket.fileno()
        logger.info(BIND10_LOST_SOCKET_CONSUMER, consumer)
        try:
            self._socket_cache.drop_application(consumer)
        except ValueError:
            # The application holds no sockets. That is harmless and can
            # happen in real life: for example, it requests a socket,
            # get_socket doesn't find it and the application dies. It
            # should be rare, though.
            pass
879

880
    def set_creator(self, creator):
        """
        Register a socket creator with the boss.

        The socket creator is not used directly, but through a cache,
        which is created by this method and stored in _socket_cache.

        Raises ValueError if a creator has been registered before.
        """
        already_set = self._socket_cache is not None
        if already_set:
            raise ValueError("A creator was inserted previously")
        self._socket_cache = isc.bind10.socket_cache.Cache(creator)

892
893
894
895
896
897
898
899
900
901
902
903
904
    def init_socket_srv(self):
        """
        Create the unix-domain socket used to hand out sockets and start
        listening on it.

        The socket object ends up in _srv_socket, its path in _socket_path
        and the directory holding it in _tmpdir.

        This method should be called after switching user, or the switched
        applications won't be able to access the socket.
        """
        self._srv_socket = socket.socket(socket.AF_UNIX)
        # A fresh temporary directory is a safe and unique place: we do
        # not need to find a location ourselves or bother users with it.
        # On some platforms it also secures the socket, as the directory
        # is private.
        self._tmpdir = tempfile.mkdtemp(prefix='sockcreator-')
        # The socket lives under a fixed name inside that directory.
        self._socket_path = os.path.join(self._tmpdir, "sockcreator")
        server = self._srv_socket
        server.bind(self._socket_path)
        server.listen(5)

    def remove_socket_srv(self):
        """
        Close the listening socket and remove both its file and the
        directory it lives in, as we created all of them.

        Nothing is done if _srv_socket is not set (eg. it was not yet
        initialized).
        """
        if self._srv_socket is None:
            return
        self._srv_socket.close()
        # The directory can be removed only once the socket file is gone.
        os.remove(self._socket_path)
        os.rmdir(self._tmpdir)

925
926
927
928
929
    def _srv_accept(self):
        """
        Accept a socket from the unix domain socket server and put it to the
        others we care about.
        """
930
        (socket, conn) = self._srv_socket.accept()
931
        self._unix_sockets[socket.fileno()] = (socket, b'')
932

933
934
935
936
937
938
    def _socket_data(self, socket_fileno):
        """
        This is called when a socket identified by the socket_fileno needs
        attention. We try to read data from there. If it is closed, we remove
        it.
        """
939
940
        (sock, previous) = self._unix_sockets[socket_fileno]
        while True:
941
942
943
944
945
946
947
948
949
            try:
                data = sock.recv(1, socket.MSG_DONTWAIT)
            except socket.error as se:
                # These two might be different on some systems
                if se.errno == errno.EAGAIN or se.errno == errno.EWOULDBLOCK:
                    # No more data now. Oh, well, just store what we have.
                    self._unix_sockets[socket_fileno] = (sock, previous)
                    return
                else:
950
                    data = b'' # Pretend it got closed
951
952
953
954
955
            if len(data) == 0: # The socket got to it's end
                del self._unix_sockets[socket_fileno]
                self.socket_consumer_dead(sock)
                sock.close()
                return
956
957
            else:
                if data == b"\n":
958
959
960
                    # Handle this token and clear it
                    self.socket_request_handler(previous, sock)
                    previous = b''
961
962
                else:
                    previous += data
963

964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
    def run(self, wakeup_fd):
        """
        The main loop, waiting for sockets, commands and dead processes.
        It keeps going for as long as self.runnable is true.

        Each round reaps the exited children, restarts what can be
        restarted and then waits in select() on the wakeup descriptor,
        the command channel, the socket server and the connected unix
        sockets, at most until the next restart is due.

        Parameters:
        wakeup_fd -- the read end of the pipe where the CHLD signal
            handler writes.
        """
        ccs_fd = self.ccs.get_socket().fileno()
        while self.runnable:
            # clean up any processes that exited
            self.reap_children()
            next_restart = self.restart_processes()
            wait_time = None
            if next_restart is not None:
                wait_time = max(next_restart - time.time(), 0)

            # select() can raise EINTR when a signal arrives, even if
            # signals are resumable, so the exception has to be caught.
            try:
                watched = [wakeup_fd, ccs_fd, self._srv_socket.fileno()]
                watched += list(self._unix_sockets.keys())
                rlist, wlist, xlist = select.select(watched, [], [],
                                                    wait_time)
            except select.error as err:
                if err.args[0] != errno.EINTR:
                    logger.fatal(BIND10_SELECT_ERROR, err)
                    break
                # Interrupted: nothing is ready, go for another round.
                rlist, wlist, xlist = [], [], []

            for fd in rlist + xlist:
                if fd == ccs_fd:
                    try:
                        self.ccs.check_command()
                    except isc.cc.session.ProtocolError:
                        logger.fatal(BIND10_MSGQ_DISAPPEARED)
                        self.runnable = False
                        break
                elif fd == wakeup_fd:
                    # Only drain the pipe; the children are reaped at the
                    # top of the next round.
                    os.read(wakeup_fd, 32)
                elif fd == self._srv_socket.fileno():
                    self._srv_accept()
                elif fd in self._unix_sockets:
                    self._socket_data(fd)
1012

1013
1014
1015
1016
# Global variables, needed for signal handlers: a handler is only given the
# signal number and the stack frame, so fatal_signal() has to reach the
# running boss through module-level state.
options = None         # assigned in main() from parse_args()
boss_of_bind = None    # fatal_signal() clears the runnable flag on it

Shane Kerr's avatar
Shane Kerr committed
1017
1018
1019
1020
1021
1022
def reaper(signal_number, stack_frame):
    """A child process has died (SIGCHLD received).

    Nothing is done here on purpose: the Python signal handling has been
    set up to write down a pipe, and that is what wakes up our select()
    bit.
    """
    return None
1023

Shane Kerr's avatar
Shane Kerr committed
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
def get_signame(signal_number):
    """Return the symbolic name for a signal.

    The names of the signal module are searched in the order dir() lists
    them. Only names made of "SIG" directly followed by an alphanumeric
    character are considered, which leaves out SIG_DFL and the like. If no
    name has the given value, "Unknown signal <number>" is returned.
    """
    matching = (name for name in dir(signal)
                if name.startswith("SIG") and name[3].isalnum()
                and getattr(signal, name) == signal_number)
    found = next(matching, None)
    if found is None:
        return "Unknown signal %d" % signal_number
    return found

# XXX: perhaps register atexit() function and invoke that instead
def fatal_signal(signal_number, stack_frame):
    """We need to exit (SIGINT or SIGTERM received).

    The signal is logged by name, SIGCHLD is put back to its default
    handling and the global boss is marked as no longer runnable. The
    shutdown itself is not performed here.
    """
    signame = get_signame(signal_number)
    logger.info(BIND10_RECEIVED_SIGNAL, signame)
    signal.signal(signal.SIGCHLD, signal.SIG_DFL)
    # boss_of_bind is the module-level instance; it is only read here, so
    # no global declaration is needed.
    boss_of_bind.runnable = False

Michal Vaner's avatar
Michal Vaner committed
1041
1042
def process_rename(option, opt_str, value, parser):
    """Option parser callback that renames the process when requested.

    Only the value of the option is used; it becomes the new name of the
    process. The other parameters are accepted and ignored.
    """
    new_name = value
    isc.util.process.rename(new_name)
Michal Vaner's avatar
Michal Vaner committed
1044

1045
def parse_args(args=sys.argv[1:], Parser=OptionParser):
    """
    Parse the command line arguments and return the options object
    produced by the parser.

    Parameters:
    args -- the list of arguments to parse (by default the arguments of
        the process, as they were when this function was defined).
    Parser -- the parser class to instantiate (OptionParser by default).

    A --cmdctl-port value that is not a valid port is reported through
    the error method of the parser. If any positional argument is left
    over, the help is printed and the process exits with status 1.
    """
    # One entry per option, in the order they are registered: the option
    # strings and the keyword arguments for add_option().
    option_table = (
        (("-m", "--msgq-socket-file"),
         dict(dest="msgq_socket_file", type="string", default=None,
              help="UNIX domain socket file the b10-msgq daemon will use")),
        (("-n", "--no-cache"),
         dict(action="store_true", dest="nocache", default=False,
              help="disable hot-spot cache in authoritative DNS server")),
        (("-i", "--no-kill"),
         dict(action="store_true", dest="nokill", default=False,
              help="do not send SIGTERM and SIGKILL signals to modules during shutdown")),
        (("-u", "--user"),
         dict(dest="user", type="string", default=None,
              help="Change user after startup (must run as root)")),
        (("-v", "--verbose"),
         dict(dest="verbose", action="store_true",
              help="display more about what is going on")),
        (("--pretty-name",),
         dict(type="string", action="callback", callback=process_rename,
              help="Set the process name (displayed in ps, top, ...)")),
        (("-c", "--config-file"),
         dict(action="store", dest="config_file", default=None,
              help="Configuration database filename")),
        (("--clear-config",),
         dict(action="store_true", dest="clear_config", default=False,
              help="Create backup of the configuration file and start with a clean configuration")),
        (("-p", "--data-path"),
         dict(dest="data_path",
              help="Directory to search for configuration files",
              default=None)),
        (("--cmdctl-port",),
         dict(dest="cmdctl_port", type="int", default=None,
              help="Port of command control")),
        (("--pid-file",),
         dict(dest="pid_file", type="string", default=None,
              help="file to dump the PID of the BIND 10 process")),
        (("-w", "--wait"),
         dict(dest="wait_time", type="int", default=10,
              help="Time (in seconds) to wait for config manager to start up")),
    )

    parser = Parser(version=VERSION)
    for flags, settings in option_table:
        parser.add_option(*flags, **settings)

    (options, leftover) = parser.parse_args(args)

    # The parser only checks that the port is an integer.
    if options.cmdctl_port is not None:
        try:
            isc.net.parse.port_parse(options.cmdctl_port)
        except ValueError as e:
            parser.error(e)

    # No positional arguments are accepted.
    if leftover:
        parser.print_help()
        sys.exit(1)

    return options

1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
def dump_pid(pid_file):
    """
    Dump the PID of the current process to the specified file.

    If the given file is None this function does nothing. If the file
    already exists, the existing content will be removed. If a system
    error happens in creating or writing to the file, the corresponding
    exception will be propagated to the caller.

    Parameters:
    pid_file -- the name of the file to write, or None.
    """
    if pid_file is None:
        return
    # The with statement closes the file even when the write fails, so a
    # failed dump does not leak the descriptor.
    with open(pid_file, "w") as f:
        f.write('%d\n' % os.getpid())

def unlink_pid_file(pid_file):
    """
    Remove the given file, which is basically expected to be the PID file
    created by dump_pid().

    The specified file may or may not exist; if it doesn't this function
    does nothing. Nothing is done for None either. Other system level
    errors in removing the file will be propagated as the corresponding
    exception.

    Parameters:
    pid_file -- the name of the file to remove, or None.
    """
    if pid_file is None:
        return
    try:
        os.unlink(pid_file)
    except OSError as error:
        # Error numbers are plain integers and have to be compared by
        # value; an identity test gives the right answer only when the
        # interpreter happens to share the integer objects.
        if error.errno != errno.ENOENT:
            raise

1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
def remove_lock_files():
    """
    Remove various lock files which were created by code such as in the
    logger. This function should be called after BIND 10 shutdown.

    The directory searched is, in order of preference, the value of the
    B10_FROM_SOURCE_LOCALSTATEDIR environment variable, the value of
    B10_FROM_BUILD, or bind10_config.DATA_PATH.

    A lock file that does not exist is silently skipped, as there is
    nothing to clean up then. Other system level errors in removing a
    file are propagated as the corresponding exception.
    """
    lockfiles = ["logger_lockfile"]

    # The configured path is looked up only when no environment variable
    # overrides it.
    if "B10_FROM_SOURCE_LOCALSTATEDIR" in os.environ:
        lpath = os.environ["B10_FROM_SOURCE_LOCALSTATEDIR"]
    elif "B10_FROM_BUILD" in os.environ:
        lpath = os.environ["B10_FROM_BUILD"]
    else:
        lpath = bind10_config.DATA_PATH

    for f in lockfiles:
        try:
            os.unlink(lpath + '/' + f)
        except OSError as error:
            # A missing lock file must not turn a clean shutdown into a
            # failure.
            if error.errno != errno.ENOENT:
                raise
1144

Shane Kerr's avatar
Shane Kerr committed
1145
1146
1147
def main():
    global options
    global boss_of_bind
1148
1149
1150
    # Enforce line buffering on stdout, even when not a TTY
    sys.stdout = io.TextIOWrapper(sys.stdout.detach(), line_buffering=True)

1151
    options = parse_args()
1152
1153