bind10_src.py.in 44.3 KB
Newer Older
1
2
#!@PYTHON@

Naoki Kambe's avatar
Naoki Kambe committed
3
# Copyright (C) 2010,2011  Internet Systems Consortium.
4
5
6
7
8
9
10
11
12
13
14
15
16
17
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND INTERNET SYSTEMS CONSORTIUM
# DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL
# INTERNET SYSTEMS CONSORTIUM BE LIABLE FOR ANY SPECIAL, DIRECT,
# INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING
# FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
# NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
# WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

18
"""
19
20
This file implements the Boss of Bind (BoB, or bob) program.

Jeremy C. Reed's avatar
Jeremy C. Reed committed
21
Its purpose is to start up the BIND 10 system, and then manage the
Shane Kerr's avatar
Shane Kerr committed
22
23
processes, by starting and stopping processes, plus restarting
processes that exit.
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38

To start the system, it first runs the c-channel program (msgq), then
connects to that. It then runs the configuration manager, and reads
its own configuration. Then it proceeds to starting other modules.

The Python subprocess module is used for starting processes, but
because this is not efficient for managing groups of processes,
SIGCHLD signals are caught and processed using the signal module.

Most of the logic is contained in the BoB class. However, since Python
requires that signal processing happen in the main thread, we do
signal handling outside of that class, in the code running for
__main__.
"""

39
40
41
import sys; sys.path.append ('@@PYTHONPATH@@')
import os

Jelte Jansen's avatar
Jelte Jansen committed
42
43
44
# If B10_FROM_SOURCE is set in the environment, we use data files
# from a directory relative to that, otherwise we use the ones
# installed on the system
45
46
# Decide where the data files live.  When B10_FROM_SOURCE is set in the
# environment the spec file is taken from that source tree and the libexec
# directory is not added to the children's PATH; otherwise the installed
# locations (filled in by configure) are used.
ADD_LIBEXEC_PATH = "B10_FROM_SOURCE" not in os.environ
if ADD_LIBEXEC_PATH:
    PREFIX = "@prefix@"
    DATAROOTDIR = "@datarootdir@"
    # The substituted path may still contain ${datarootdir} / ${prefix}
    # placeholders; expand them in that order.
    SPECFILE_LOCATION = "@datadir@/@PACKAGE@/bob.spec"
    SPECFILE_LOCATION = SPECFILE_LOCATION.replace("${datarootdir}",
                                                  DATAROOTDIR)
    SPECFILE_LOCATION = SPECFILE_LOCATION.replace("${prefix}", PREFIX)
else:
    SPECFILE_LOCATION = (os.environ["B10_FROM_SOURCE"] +
                         "/src/bin/bind10/bob.spec")
Jelte Jansen's avatar
Jelte Jansen committed
53
    
54
55
56
57
58
import subprocess
import signal
import re
import errno
import time
59
import select
60
import random
Evan Hunt's avatar
Evan Hunt committed
61
import socket
62
from optparse import OptionParser, OptionValueError
63
64
65
import io
import pwd
import posix
66
import copy
67

68
import isc.cc
69
import isc.util.process
Michal Vaner's avatar
Michal Vaner committed
70
import isc.net.parse
71
import isc.log
72
from isc.log_messages.bind10_messages import *
73
74
import isc.bind10.component
import isc.bind10.special_component
75
import isc.bind10.socket_cache
76
import libutil_io_python
77

78
79
# Initialise logging under the name "b10-boss" and create the module-wide
# logger used by everything below.
isc.log.init("b10-boss")
logger = isc.log.Logger("boss")

# Pending system-wide debug level definitions, the ones we
# use here are hardcoded for now
DBG_PROCESS = logger.DBGLVL_TRACE_BASIC
DBG_COMMANDS = logger.DBGLVL_TRACE_DETAIL

# Messages sent over the unix domain socket to indicate if it is followed by a real socket
CREATOR_SOCKET_OK = "1\n"
CREATOR_SOCKET_UNAVAILABLE = "0\n"

# Assign this process some longer name
isc.util.process.rename(sys.argv[0])

# This is the version that gets displayed to the user.
# The VERSION string consists of the module name, the module version
# number, and the overall BIND 10 version number (set in configure.ac).
VERSION = "bind10 20110223 (BIND 10 @PACKAGE_VERSION@)"

# This is for boot_time of Boss: the UTC time at which this module was
# loaded, reported later through the statistics data.
_BASETIME = time.gmtime()
100

101
102
class ProcessInfoError(Exception):
    """Raised when a child process cannot be prepared for execution."""

103
104
105
class ProcessInfo:
    """Information about a process

    An instance records how a program is to be started (argument list,
    additional environment variables, output redirection and the user to
    run as).  After spawn() or respawn() it also holds the
    subprocess.Popen object (self.process) and the child's PID (self.pid).
    """

    # Shared sink used whenever a child's stdout or stderr is discarded.
    dev_null = open(os.devnull, "w")

    def __init__(self, name, args, env=None, dev_null_stdout=False,
                 dev_null_stderr=False, uid=None, username=None):
        """Remember the start-up parameters; nothing is started here.

        name -- name of the process
        args -- argument list passed to subprocess.Popen
        env -- additional environment variables for the child, or None
               for no additions
        dev_null_stdout -- if true, the child's stdout is discarded
        dev_null_stderr -- if true, the child's stderr is discarded
        uid -- user ID to switch to before the program runs (None means
               no change)
        username -- name matching uid, used only in error messages
        """
        self.name = name
        self.args = args
        # Create a fresh dictionary per instance: a literal {} default
        # would be a single object shared by every ProcessInfo built
        # without an explicit env.
        self.env = {} if env is None else env
        self.dev_null_stdout = dev_null_stdout
        self.dev_null_stderr = dev_null_stderr
        self.uid = uid
        self.username = username
        self.process = None
        self.pid = None

    def _preexec_work(self):
        """Function used before running a program that needs to run as a
        different user.

        Raises ProcessInfoError if changing the user is refused with
        EPERM; any other OSError is re-raised unchanged."""
        # First, put us into a separate process group so we don't get
        # SIGINT signals on Ctrl-C (the boss will shut everything down by
        # other means).
        os.setpgrp()
        # Second, set the user ID if one has been specified
        if self.uid is not None:
            try:
                posix.setuid(self.uid)
            except OSError as e:
                if e.errno == errno.EPERM:
                    # if we failed to change user due to permission report that
                    raise ProcessInfoError("Unable to change to user %s (uid %d)" % (self.username, self.uid))
                else:
                    # otherwise simply re-raise whatever error we found
                    raise

    def _spawn(self):
        """Start the program and record its Popen object and PID."""
        spawn_stdout = self.dev_null if self.dev_null_stdout else None
        spawn_stderr = self.dev_null if self.dev_null_stderr else None

        # Environment variables for the child process will be a copy of those
        # of the boss process with any additional specific variables given
        # on construction (self.env).  The copy is a plain dict so that the
        # changes below cannot write through to the boss's own environment.
        spawn_env = dict(os.environ)
        spawn_env.update(self.env)
        if ADD_LIBEXEC_PATH:
            # PATH may be absent from the environment altogether; in that
            # case the libexec directory becomes the whole search path.
            inherited_path = spawn_env.get('PATH')
            if inherited_path is None:
                spawn_env['PATH'] = "@@LIBEXECDIR@@"
            else:
                spawn_env['PATH'] = "@@LIBEXECDIR@@:" + inherited_path
        self.process = subprocess.Popen(self.args,
                                        stdin=subprocess.PIPE,
                                        stdout=spawn_stdout,
                                        stderr=spawn_stderr,
                                        close_fds=True,
                                        env=spawn_env,
                                        preexec_fn=self._preexec_work)
        self.pid = self.process.pid

    # spawn() and respawn() are the same for now, but in the future they
    # may have different functionality
    def spawn(self):
        """Start the process for the first time."""
        self._spawn()

    def respawn(self):
        """Start the process again (currently identical to spawn())."""
        self._spawn()

172
173
class CChannelConnectError(Exception):
    """Raised when the boss cannot connect to the command channel."""

174
175
class ProcessStartError(Exception):
    """Raised when a process the boss depends on fails to start."""

176
177
class BoB:
    """Boss of BIND class."""
178
    
179
180
    def __init__(self, msgq_socket_file=None, data_path=None,
                 config_filename=None, nocache=False, verbose=False,
                 setuid=None, username=None, cmdctl_port=None, wait_time=10):
        """
            Initialize the Boss of BIND. This is a singleton (only one can run).

            msgq_socket_file is the UNIX domain socket file that the msgq
            process listens on.

            data_path and config_filename are passed through to the
            configuration manager (if provided) and specify the config
            file to be used.

            nocache is stored and later turned into the '-n' switch of
            the authoritative server.

            If verbose is True the boss reports what it is doing: full
            debug logging is enabled here.

            setuid and username identify the user that started processes
            should run as.

            cmdctl_port is passed to cmdctl and specifies on which port it
            should listen.

            wait_time is the amount of time (in seconds) that the boss
            waits for selected processes to initialize before continuing
            with the initialization.  Currently this is only the
            configuration manager.
        """
        # Start-up parameters, stored as given.
        self.msgq_socket_file = msgq_socket_file
        self.data_path = data_path
        self.config_filename = config_filename
        self.nocache = nocache
        self.verbose = verbose
        self.uid = setuid
        self.username = username
        self.cmdctl_port = cmdctl_port
        self.wait_time = wait_time

        # Run-time state.
        self.cc_session = None
        self.ccs = None
        self.curproc = None
        self.runnable = False
        self.component_config = {}
        # Some time in future, it may happen that a single component has
        # multiple processes. If that happens, the name "components" may be
        # inappropriate. But as the code probably isn't completely ready
        # for it, we leave it at components for now.
        self.components = {}
        # Simply a list of components that died and need to wait for a
        # restart. Components manage their own restart schedule now.
        self.components_to_restart = []

        self._component_configurator = isc.bind10.component.Configurator(
            self, isc.bind10.special_component.get_specials())
        # The priorities here make them start in the correct order. First
        # the socket creator (which would drop root privileges by then),
        # then the message queue and after that the config manager (which
        # presumably needs the message queue to be up).
        self.__core_components = {
            name: {'kind': 'core', 'special': name, 'priority': priority}
            for name, priority in (('sockcreator', 200),
                                   ('msgq', 199),
                                   ('cfgmgr', 198))
        }
        self.__started = False
        self.exitcode = 0

        # If -v was set, enable full debug logging.
        if self.verbose:
            logger.set_severity("DEBUG", 99)
        self._socket_cache = None
        # TODO: To be filled in by #1428
        self._socket_path = None
253

254
255
256
257
258
259
260
261
262
263
264
    def __propagate_component_config(self, config):
        comps = dict(config)
        # Fill in the core components, so they stay alive
        for comp in self.__core_components:
            if comp in comps:
                raise Exception(comp + " is core component managed by " +
                                "bind10 boss, do not set it")
            comps[comp] = self.__core_components[comp]
        # Update the configuration
        self._component_configurator.reconfigure(comps)

265
    def config_handler(self, new_config):
266
        # If this is initial update, don't do anything now, leave it to startup
267
268
        if not self.runnable:
            return
269
270
        logger.debug(DBG_COMMANDS, BIND10_RECEIVED_NEW_CONFIGURATION,
                     new_config)
271
272
273
274
275
276
        try:
            if 'components' in new_config:
                self.__propagate_component_config(new_config['components'])
            return isc.config.ccsession.create_answer(0)
        except Exception as e:
            return isc.config.ccsession.create_answer(1, str(e))
277

Shane Kerr's avatar
Shane Kerr committed
278
    def get_processes(self):
279
        pids = list(self.components.keys())
Shane Kerr's avatar
Shane Kerr committed
280
281
282
        pids.sort()
        process_list = [ ]
        for pid in pids:
283
            process_list.append([pid, self.components[pid].name()])
Shane Kerr's avatar
Shane Kerr committed
284
285
        return process_list

286
    def _get_stats_data(self):
        """
            Build the statistics data of the boss: its owner name and the
            boot time (taken from _BASETIME) formatted as
            YYYY-MM-DDTHH:MM:SSZ.
        """
        boot_time = time.strftime('%Y-%m-%dT%H:%M:%SZ', _BASETIME)
        return {"owner": "Boss", "data": {'boot_time': boot_time}}
292

Jelte Jansen's avatar
Jelte Jansen committed
293
    def command_handler(self, command, args):
        """
            Command callback for the boss module.

            command is the command name and args its arguments (may be
            None).  The return value is always an answer created with
            isc.config.ccsession.create_answer(): code 0 on success, code
            1 with an explanation for a non-string command, an unknown
            command or a failed command.

            Known commands: shutdown, getstats, sendstats, ping,
            show_processes, get_socket and drop_socket.
        """
        logger.debug(DBG_COMMANDS, BIND10_RECEIVED_COMMAND, command)
        create_answer = isc.config.ccsession.create_answer
        if not isinstance(command, str):
            return create_answer(1, "bad command")

        if command == "shutdown":
            # The main loop is expected to notice this flag and terminate.
            self.runnable = False
            return create_answer(0)
        if command == "getstats":
            return create_answer(0, self._get_stats_data())
        if command == "sendstats":
            # send statistics data to the stats daemon immediately
            stats_data = self._get_stats_data()
            valid = self.ccs.get_module_spec().validate_statistics(
                True, stats_data["data"])
            if not valid:
                logger.fatal(BIND10_INVALID_STATISTICS_DATA)
                return create_answer(
                    1, "specified statistics data is invalid")
            cmd = isc.config.ccsession.create_command('set', stats_data)
            seq = self.cc_session.group_sendmsg(cmd, 'Stats')
            # Consume the answer, in case it becomes an orphan message.
            try:
                self.cc_session.group_recvmsg(False, seq)
            except isc.cc.session.SessionTimeout:
                pass
            return create_answer(0)
        if command == "ping":
            return create_answer(0, "pong")
        if command == "show_processes":
            return create_answer(0, self.get_processes())
        if command == "get_socket":
            return self._get_socket(args)
        if command == "drop_socket":
            # args is None when the command came without parameters; a
            # membership test on None would raise TypeError.
            if args is None or "token" not in args:
                return create_answer(1, "Missing token parameter")
            try:
                self._socket_cache.drop_socket(args["token"])
                return create_answer(0)
            except Exception as e:
                return create_answer(1, str(e))
        return create_answer(1, "Unknown command")
343

344
    def kill_started_components(self):
        """
            Kill every component registered so far and forget them all.

            Called as part of the exception handling when a process fails
            to start.
        """
        logger.info(BIND10_KILLING_ALL_PROCESSES)

        for pid in self.components:
            component = self.components[pid]
            logger.info(BIND10_KILL_PROCESS, component.name())
            component.kill(True)
        self.components = {}
356

Michal 'vorner' Vaner's avatar
Michal 'vorner' Vaner committed
357
    def _read_bind10_config(self):
        """
            Reads the parameters associated with the BoB module itself.

            This means the list of components we should start now: the
            'components' part of the full configuration is handed to the
            component configurator.

            This could easily be folded into the start-up sequence, but
            it stays a separate method for historical reasons and because
            the tests replace the method sometimes.
        """
        logger.info(BIND10_READING_BOSS_CONFIGURATION)

        component_config = self.ccs.get_full_config()['components']
        self.__propagate_component_config(component_config)
371
372
373
374

    def log_starting(self, process, port = None, address = None):
        """
            A convenience function to log a "Starting xxx" message.
            Putting this into a separate method ensures that the output
            form is consistent across all processes.

            The process name (passed as the first argument) is put into
            self.curproc, and is used to indicate which process failed to
            start if there is an error (and is used in the "Started"
            message on success).  The optional port and address
            information are added to the message (if present).
        """
        self.curproc = process
        if address is not None:
            logger.info(BIND10_STARTING_PROCESS_PORT_ADDRESS,
                        self.curproc, address, port)
        elif port is not None:
            logger.info(BIND10_STARTING_PROCESS_PORT, self.curproc,
                        port)
        else:
            logger.info(BIND10_STARTING_PROCESS, self.curproc)
394

395
396
397
398
399
400
    def log_started(self, pid = None):
        """
            A convenience function to log a 'Started xxxx (PID yyyy)'
            debug message for the process named in self.curproc.  As with
            log_starting(), this ensures a consistent format.  The PID is
            left out of the message when it is not given.
        """
        if pid is not None:
            logger.debug(DBG_PROCESS, BIND10_STARTED_PROCESS_PID,
                         self.curproc, pid)
        else:
            logger.debug(DBG_PROCESS, BIND10_STARTED_PROCESS, self.curproc)
405

406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
    def process_running(self, msg, who):
        """
            Some processes return a message to the Boss after they have
            started to indicate that they are running.  The form of the
            message is a dictionary with contents {"running:", "<process>"}.
            This method checks the passed message and returns True if the
            "who" process is contained in the message (so is presumably
            running).  It returns False for all other conditions and will
            log an error if appropriate.
        """
        if msg is not None:
            try:
                if msg["running"] == who:
                    return True
                else:
                    logger.error(BIND10_STARTUP_UNEXPECTED_MESSAGE, msg)
            except:
                logger.error(BIND10_STARTUP_UNRECOGNISED_MESSAGE, msg)
        
        return False

427
    # The next few methods start the individual processes of BIND-10.  They
428
429
    # are called via start_all_processes().  If any fail, an exception is
    # raised which is caught by the caller of start_all_processes(); this kills
430
431
    # processes started up to that point before terminating the program.

432
    def start_msgq(self):
        """
            Start the message queue and connect to the command channel.

            Returns the ProcessInfo of the started b10-msgq.  Raises
            CChannelConnectError if no connection could be made within
            about 5 seconds.
        """
        self.log_starting("b10-msgq")
        # stdout of msgq is always discarded, stderr unless we are verbose
        msgq_proc = ProcessInfo("b10-msgq", ["b10-msgq"], self.c_channel_env,
                                dev_null_stdout=True,
                                dev_null_stderr=not self.verbose,
                                uid=self.uid, username=self.username)
        msgq_proc.spawn()
        self.log_started(msgq_proc.pid)

        # Now connect to the c-channel, retrying until it accepts us
        first_attempt = time.time()
        while self.cc_session is None:
            # if we have been trying for "a while" give up
            if (time.time() - first_attempt) > 5:
                raise CChannelConnectError("Unable to connect to c-channel after 5 seconds")

            # try to connect, and if we can't wait a short while
            try:
                self.cc_session = isc.cc.Session(self.msgq_socket_file)
            except isc.cc.session.SessionError:
                time.sleep(0.1)

        # Subscribe to the message queue.  The only messages we expect to
        # receive on this channel are ones relating to process startup.
        self.cc_session.group_subscribe("Boss")

        return msgq_proc

    def start_cfgmgr(self):
        """
            Starts the configuration manager process and waits until it
            reports that it is running.

            Returns the ProcessInfo of the started b10-cfgmgr.  Raises
            ProcessStartError if the "running" message from
            "ConfigManager" has not been seen once the wait is over.
        """
        self.log_starting("b10-cfgmgr")
        cfgmgr_args = ["b10-cfgmgr"]
        if self.data_path is not None:
            cfgmgr_args.append("--data-path=" + self.data_path)
        if self.config_filename is not None:
            cfgmgr_args.append("--config-filename=" + self.config_filename)
        cfgmgr_proc = ProcessInfo("b10-cfgmgr", cfgmgr_args,
                                  self.c_channel_env, uid=self.uid,
                                  username=self.username)
        cfgmgr_proc.spawn()
        self.log_started(cfgmgr_proc.pid)

        # Wait for the configuration manager to start up as subsequent
        # initialization cannot proceed without it.  The time to wait can
        # be set on the command line.
        seconds_left = self.wait_time
        msg, _ = self.cc_session.group_recvmsg()
        while seconds_left > 0 and not self.process_running(msg, "ConfigManager"):
            logger.debug(DBG_PROCESS, BIND10_WAIT_CFGMGR)
            time.sleep(1)
            seconds_left -= 1
            msg, _ = self.cc_session.group_recvmsg()

        if not self.process_running(msg, "ConfigManager"):
            raise ProcessStartError("Configuration manager process has not started")

        return cfgmgr_proc

493
494
495
496
497
498
    def start_ccsession(self, c_channel_env):
        """
            Start the CC Session

            A ModuleCCSession is created from the boss spec file with
            config_handler and command_handler as callbacks, stored in
            self.ccs and started.

            The argument c_channel_env is unused but is supplied to keep the
            argument list the same for all start_xxx methods.

            With regards to logging, note that as the CC session is not a
            process, the log_starting/log_started methods are not used.
        """
        logger.info(BIND10_STARTING_CC)
        self.ccs = isc.config.ModuleCCSession(
            SPECFILE_LOCATION,
            self.config_handler,
            self.command_handler,
            socket_file = self.msgq_socket_file)
        self.ccs.start()
        logger.debug(DBG_PROCESS, BIND10_STARTED_CC)
510
511
512
513
514
515
516
517
518
519
520
521
522

    # A couple of utility methods for starting processes...

    def start_process(self, name, args, c_channel_env, port=None, address=None):
        """
            Given a set of command arguments, start the process and output
            appropriate log messages.  The ProcessInfo of the started
            process is returned.

            The port and address arguments are for log messages only.
        """
        self.log_starting(name, port, address)
        child = ProcessInfo(name, args, c_channel_env)
        child.spawn()
        self.log_started(child.pid)
        return child
526

Michal 'vorner' Vaner's avatar
Michal 'vorner' Vaner committed
527
    def register_process(self, pid, component):
528
529
        """
        Put another process into boss to watch over it.  When the process
Michal 'vorner' Vaner's avatar
Michal 'vorner' Vaner committed
530
        dies, the component.failed() is called with the exit code.
531

532
533
        It is expected the info is a isc.bind10.component.BaseComponent
        subclass (or anything having the same interface).
534
        """
535
        self.components[pid] = component
536
537

    def start_simple(self, name):
538
539
540
541
542
543
544
545
546
547
548
549
        """
            Most of the BIND-10 processes are started with the command:

                <process-name> [-v]

            ... where -v is appended if verbose is enabled.  This method
            generates the arguments from the name and starts the process.

            The port and address arguments are for log messages only.
        """
        # Set up the command arguments.
        args = [name]
550
        if self.verbose:
551
            args += ['-v']
552

553
        # ... and start the process
554
        return self.start_process(name, args, self.c_channel_env)
555

556
557
558
559
560
561
    # The next few methods start up the rest of the BIND-10 processes.
    # Although many of these methods are little more than a call to
    # start_simple, they are retained (a) for testing reasons and (b) as a place
    # where modifications can be made if the process start-up sequence changes
    # for a given process.

562
    def start_auth(self):
563
564
565
        """
            Start the Authoritative server
        """
566
567
        if self.uid is not None and self.__started:
            logger.warn(BIND10_START_AS_NON_ROOT_AUTH)
568
        authargs = ['b10-auth']
569
        if self.nocache:
570
            authargs += ['-n']
JINMEI Tatuya's avatar
JINMEI Tatuya committed
571
        if self.uid:
572
            authargs += ['-u', str(self.uid)]
573
        if self.verbose:
574
            authargs += ['-v']
575

576
        # ... and start
577
        return self.start_process("b10-auth", authargs, self.c_channel_env)
578

579
    def start_resolver(self):
580
581
582
583
584
        """
            Start the Resolver.  At present, all these arguments and switches
            are pure speculation.  As with the auth daemon, they should be
            read from the configuration database.
        """
585
586
        if self.uid is not None and self.__started:
            logger.warn(BIND10_START_AS_NON_ROOT_RESOLVER)
587
        self.curproc = "b10-resolver"
588
        # XXX: this must be read from the configuration manager in the future
589
        resargs = ['b10-resolver']
590
591
        if self.uid:
            resargs += ['-u', str(self.uid)]
592
        if self.verbose:
593
            resargs += ['-v']
594

595
        # ... and start
596
        return self.start_process("b10-resolver", resargs, self.c_channel_env)
Likun Zhang's avatar
Likun Zhang committed
597

598
    def __ld_path_hack(self):
        """
            Return a copy of the c-channel environment which, when
            ADD_LIBEXEC_PATH is set, has the library directory put in
            front of both DYLD_LIBRARY_PATH and LD_LIBRARY_PATH.
        """
        # XXX: a quick-hack workaround.  xfrin/out will implicitly use
        # dynamically loadable data source modules, which will be installed in
        # $(libdir).
        # On some OSes (including MacOS X and *BSDs) the main process (python)
        # cannot find the modules unless they are located in a common shared
        # object path or a path in the (DY)LD_LIBRARY_PATH.  We should seek
        # a cleaner solution, but for a short term workaround we specify the
        # path here, unconditionally, and without even bothering which
        # environment variable should be used.
        #
        # We reuse the ADD_LIBEXEC_PATH variable to see whether we need to
        # do this, as the conditions that make this workaround needed are
        # the same as for the libexec path addition
        # TODO: Once #1292 is finished, remove this method and the special
        # component, use it as normal component.
        env = dict(self.c_channel_env)
        if ADD_LIBEXEC_PATH:
            for variable in ('DYLD_LIBRARY_PATH', 'LD_LIBRARY_PATH'):
                inherited = os.getenv(variable)
                suffix = '' if inherited is None else ':' + inherited
                env[variable] = "@@LIBDIR@@" + suffix
        return env
624

625
626
627
628
629
630
631
632
633
634
635
    def start_cmdctl(self):
        """
            Starts the command control process
        """
        args = ["b10-cmdctl"]
        if self.cmdctl_port is not None:
            args.append("--port=" + str(self.cmdctl_port))
        if self.verbose:
            args.append("-v")
        return self.start_process("b10-cmdctl", args, self.c_channel_env,
                                  self.cmdctl_port)
636

637
    def start_xfrin(self):
638
639
640
641
        # Set up the command arguments.
        args = ['b10-xfrin']
        if self.verbose:
            args += ['-v']
642

643
        return self.start_process("b10-xfrin", args, self.__ld_path_hack())
Naoki Kambe's avatar
Naoki Kambe committed
644

645
    def start_xfrout(self):
646
647
648
649
        # Set up the command arguments.
        args = ['b10-xfrout']
        if self.verbose:
            args += ['-v']
650

651
        return self.start_process("b10-xfrout", args, self.__ld_path_hack())
652

653
    def start_all_components(self):
        """
            Starts up all the components.  Any exception generated during the
            starting of the components is handled by the caller.

            The three steps below depend on each other and must run in
            this order: core components, CC session, boss configuration.
        """
        # Start the real core (sockcreator, msgq, cfgmgr)
        self._component_configurator.startup(self.__core_components)

        # Connect to the msgq. This is not a process, so it's not handled
        # inside the configurator.
        self.start_ccsession(self.c_channel_env)

        # Extract the parameters associated with Bob.  This can only be
        # done after the CC Session is started.  Note that the logging
        # configuration may override the "-v" switch set on the command line.
        self._read_bind10_config()

        # TODO: Return the dropping of privileges
671

672
673
674
    def startup(self):
        """
            Start the BoB instance.

            Returns None if successful, otherwise a string describing the
            problem.
        """
        # Try to connect to the c-channel daemon, to see if it is already
        # running
        c_channel_env = {}
        if self.msgq_socket_file is not None:
            c_channel_env["BIND10_MSGQ_SOCKET_FILE"] = self.msgq_socket_file
        logger.debug(DBG_PROCESS, BIND10_CHECK_MSGQ_ALREADY_RUNNING)
        # A successful connection here means a msgq is already there,
        # which is an error for us.
        try:
            self.cc_session = isc.cc.Session(self.msgq_socket_file)
            logger.fatal(BIND10_MSGQ_ALREADY_RUNNING)
            return "b10-msgq already running, or socket file not cleaned , cannot start"
        except isc.cc.session.SessionError:
            # this is the case we want, where the msgq is not running
            pass

        # Start all components.  If any one fails to start, kill all started
        # components and exit with an error indication.
        try:
            self.c_channel_env = c_channel_env
            self.start_all_components()
        except Exception as e:
            self.kill_started_components()
            # self.curproc is still None when the failure happened before
            # any process start was logged; concatenating None would raise
            # TypeError here and hide the real error.
            if self.curproc is None:
                failed = "unknown component"
            else:
                failed = str(self.curproc)
            return "Unable to start " + failed + ": " + str(e)

        # Started successfully
        self.runnable = True
        self.__started = True
        return None

708
    def stop_process(self, process, recipient):
        """
        Stop the given process, friendly-like. The process is the name it has
        (in logs, etc), the recipient is the address on msgq.

        A 'shutdown' command is sent over the CC session; the recipient
        value is passed for both address arguments of group_sendmsg().
        Nothing is waited for and nothing is returned.
        """
        logger.info(BIND10_STOP_PROCESS, process)
        shutdown_command = {'command': ['shutdown']}
        self.cc_session.group_sendmsg(shutdown_command, recipient, recipient)
716

717
718
719
720
    def component_shutdown(self, exitcode=0):
        """
        Stop the Boss instance from a component's request. The exitcode
        indicates the desired exit code.

        If we did not start yet, it raises an exception, which is meant
        to propagate through the component and configurator to the startup
        routine and abort the startup immediately. If it is started up
        already, we just mark it so we terminate soon.

        It does set the exit code in both cases.
        """
        self.exitcode = exitcode
        if not self.__started:
            raise Exception("Component failed during startup")
        # Already running: clearing the flag is enough.
        self.runnable = False
734
735
736

    def shutdown(self):
        """Stop the BoB instance.

        Escalates in three steps: ask the component configurator to shut
        everything down, then call kill() on every component still
        registered in self.components, then keep calling kill(True) on
        the survivors until self.components is empty.
        """
        logger.info(BIND10_SHUTDOWN)
        # first try using the BIND 10 request to stop
        try:
            self._component_configurator.shutdown()
        except Exception:
            # Best effort only: whatever did not stop cleanly is dealt with
            # by the signals sent below.
            pass
        # XXX: some delay probably useful... how much is uncertain
        # I have changed the delay from 0.5 to 1, but sometime it's
        # still not enough.
        time.sleep(1)
        self.reap_children()
        # next try sending a SIGTERM
        # (iterate over a snapshot of the current components)
        for component in list(self.components.values()):
            logger.info(BIND10_SEND_SIGTERM, component.name(), component.pid())
            try:
                component.kill()
            except OSError:
                # ignore these (usually ESRCH because the child
                # finally exited)
                pass
        # finally, send SIGKILL (unmaskable termination) until everybody dies
        while self.components:
            # XXX: some delay probably useful... how much is uncertain
            time.sleep(0.1)
            self.reap_children()
            for component in list(self.components.values()):
                logger.info(BIND10_SEND_SIGKILL, component.name(),
                            component.pid())
                try:
                    component.kill(True)
                except OSError:
                    # ignore these (usually ESRCH because the child
                    # finally exited)
                    pass
        logger.info(BIND10_SHUTDOWN_COMPLETE)
774

Shane Kerr's avatar
Shane Kerr committed
775
776
777
    def _get_process_exit_status(self):
        return os.waitpid(-1, os.WNOHANG)

778
779
780
    def reap_children(self):
        """Check to see if any of our child processes have exited,
        and note this for later handling.

        Exit statuses are collected through _get_process_exit_status()
        until it reports pid 0 or fails with ECHILD.  A child that is a
        known component is removed from self.components; if it still
        considers itself running and the boss is runnable, it is told that
        it failed, and it is queued in self.components_to_restart when
        failed() returns False.
        """
        while True:
            try:
                (pid, exit_status) = self._get_process_exit_status()
            except OSError as o:
                if o.errno == errno.ECHILD:
                    break
                # XXX: should be impossible to get any other error here
                raise
            if pid == 0:
                break
            if pid not in self.components:
                logger.info(BIND10_UNKNOWN_CHILD_PROCESS_ENDED, pid)
                continue

            # One of the components we know about.  Get information on it.
            component = self.components.pop(pid)
            logger.info(BIND10_PROCESS_ENDED, component.name(), pid,
                        exit_status)
            if component.running() and self.runnable:
                # Tell it it failed. But only if it matters (we are
                # not shutting down and the component considers itself
                # to be running.
                component_restarted = component.failed(exit_status)
                # if the process wants to be restarted, but not just yet,
                # it returns False
                if not component_restarted:
                    self.components_to_restart.append(component)
806
807

    def restart_processes(self):
        """
            Restart any dead processes:

            * Returns the time when the next process is ready to be restarted.
            * If the server is shutting down, returns 0.
            * If there are no processes, returns None.

            The values returned can be safely passed into select() as the
            timeout value.

            Components whose restart() returns a false value stay queued in
            self.components_to_restart.
        """
        if not self.runnable:
            return 0
        still_dead = []
        # keep track of the first time we need to check this queue again,
        # if at all
        next_restart_time = None
        now = time.time()
        for component in self.components_to_restart:
            if component.restart(now):
                continue
            still_dead.append(component)
            # Ask the component once and reuse the answer.
            restart_time = component.get_restart_time()
            if next_restart_time is None or next_restart_time > restart_time:
                next_restart_time = restart_time
        self.components_to_restart = still_dead

        return next_restart_time
835

836
837
838
839
840
    def _get_socket(self, args):
        """
        Implementation of the get_socket CC command. It asks the cache
        to provide the token and sends the information back.

        args must provide the keys 'address', 'port', 'protocol',
        'share_mode' and 'share_name'.  The return value is always an
        answer built by isc.config.ccsession.create_answer(): code 0 with
        a dict holding 'token' and 'path' on success, code 1 with a
        message when a parameter is missing or anything else fails.
        """
        try:
            try:
                addr = isc.net.parse.addr_parse(args['address'])
                port = isc.net.parse.port_parse(args['port'])
                protocol = args['protocol']
                if protocol not in ('UDP', 'TCP'):
                    raise ValueError("Protocol must be either UDP or TCP")
                share_mode = args['share_mode']
                if share_mode not in ('ANY', 'SAMEAPP', 'NO'):
                    raise ValueError("Share mode must be one of ANY, SAMEAPP" +
                                     " or NO")
                share_name = args['share_name']
            except KeyError as ke:
                # Only a KeyError raised while unpacking the arguments is
                # reported as a missing parameter.
                return \
                    isc.config.ccsession.create_answer(1,
                                                       "Missing parameter " +
                                                       str(ke))

            # FIXME: This call contains blocking IPC. It is expected to be
            # short, but if it turns out to be problem, we'll need to do
            # something about it.
            token = self._socket_cache.get_token(protocol, addr, port,
                                                 share_mode, share_name)
            return isc.config.ccsession.create_answer(0, {
                'token': token,
                'path': self._socket_path
            })
        except Exception as e:
            return isc.config.ccsession.create_answer(1, str(e))
870

871
872
873
874
    def socket_request_handler(self, token, unix_socket):
        """
        This function handles a token that comes over a unix_domain socket.
        The function looks into the _socket_cache and sends the socket
        identified by the token back over the unix_socket.

        On success CREATOR_SOCKET_OK is written to unix_socket, followed by
        the file descriptor itself.  If any of these steps raises, the
        problem is logged and CREATOR_SOCKET_UNAVAILABLE is written instead.
        """
        try:
            fd = self._socket_cache.get_socket(token, unix_socket.fileno())
            # FIXME: These two calls are blocking in their nature. An OS-level
            # buffer is likely to be large enough to hold all these data, but
            # if it wasn't and the remote application got stuck, we would have
            # a problem. If there appear such problems, we should do something
            # about it.
            unix_socket.sendall(CREATOR_SOCKET_OK)
            libutil_io_python.send_fd(unix_socket.fileno(), fd)
        except Exception as e:
            logger.info(BIND10_NO_SOCKET, token, e)
            unix_socket.sendall(CREATOR_SOCKET_UNAVAILABLE)
889
890
891
892
893
894
895

    def socket_consumer_dead(self, unix_socket):
        """
        This function handles when a unix_socket closes. This means all
        sockets sent to it are to be considered closed. This function signals
        so to the _socket_cache.

        The application is identified to the cache by the file descriptor
        number of unix_socket.
        """
        logger.info(BIND10_LOST_SOCKET_CONSUMER, unix_socket.fileno())
        try:
            self._socket_cache.drop_application(unix_socket.fileno())
        except ValueError:
            # This means the application holds no sockets. It's harmless, as it
            # can happen in real life - for example, it requests a socket, but
            # get_socket doesn't find it, so the application dies. It should be
            # rare, though.
            pass
905

906
    def set_creator(self, creator):
        """
        Registers a socket creator into the boss. The socket creator is not
        used directly, but through a cache. The cache is created in this
        method.

        If called more than once, it raises a ValueError.
        """
        if self._socket_cache is not None:
            raise ValueError("A creator was inserted previously")
        self._socket_cache = isc.bind10.socket_cache.Cache(creator)

918
919
920
921
# Module-level state, needed because the signal handlers have no other way
# to reach it.  Both names start out unset.
boss_of_bind = None
options = None

Shane Kerr's avatar
Shane Kerr committed
922
923
924
925
926
927
def reaper(signal_number, stack_frame):
    """A child process has died (SIGCHLD received).

    Intentionally does nothing: the Python signal handler has been set up
    to write down a pipe, and that is what wakes up our select() bit.
    """
    return None
928

Shane Kerr's avatar
Shane Kerr committed
929
930
931
932
933
934
935
936
937
938
939
940
941
def get_signame(signal_number):
    """Return the symbolic name for a signal.

    The attributes of the signal module are scanned in dir() order.  Only
    names made of "SIG" directly followed by an alphanumeric character are
    considered, which leaves out the "SIG_..." constants.  When no name has
    a matching value, "Unknown signal <number>" is returned.
    """
    candidates = (name for name in dir(signal)
                  if name.startswith("SIG") and name[3].isalnum())
    for name in candidates:
        if getattr(signal, name) == signal_number:
            return name
    return "Unknown signal %d" % signal_number

# XXX: perhaps register atexit() function and invoke that instead
def fatal_signal(signal_number, stack_frame):
    """We need to exit (SIGINT or SIGTERM received).

    Logs the signal, puts SIGCHLD back to its default disposition and
    clears the runnable flag of the module-level boss_of_bind object.
    """
    # No "global" declarations are needed: boss_of_bind is only read here
    # (an attribute is set on it, the name itself is not rebound).
    logger.info(BIND10_RECEIVED_SIGNAL, get_signame(signal_number))
    signal.signal(signal.SIGCHLD, signal.SIG_DFL)
    boss_of_bind.runnable = False

Michal Vaner's avatar
Michal Vaner committed
946
947
def process_rename(option, opt_str, value, parser):
    """Function that renames the process if it is requested by an option.

    Only 'value' is used; it is handed to isc.util.process.rename().  The
    other parameters are accepted and ignored.
    """
    isc.util.process.rename(value)
Michal Vaner's avatar
Michal Vaner committed
949

950
def parse_args(args=sys.argv[1:], Parser=OptionParser):
    """
    Function for parsing command line arguments. Returns the
    options object from OptionParser.

    Parser is the parser class to instantiate.  A --cmdctl-port value is
    checked with isc.net.parse.port_parse() and reported through
    parser.error() when invalid.  If any positional arguments are left
    over, the help text is printed and the process exits with status 1.
    """
    parser = Parser(version=VERSION)
    parser.add_option("-m", "--msgq-socket-file", dest="msgq_socket_file",
                      type="string", default=None,
                      help="UNIX domain socket file the b10-msgq daemon "
                           "will use")
    parser.add_option("-n", "--no-cache", action="store_true", dest="nocache",
                      default=False,
                      help="disable hot-spot cache in authoritative DNS "
                           "server")
    parser.add_option("-u", "--user", dest="user", type="string", default=None,
                      help="Change user after startup (must run as root)")
    parser.add_option("-v", "--verbose", dest="verbose", action="store_true",
                      help="display more about what is going on")
    parser.add_option("--pretty-name", type="string", action="callback",
                      callback=process_rename,
                      help="Set the process name (displayed in ps, top, ...)")
    parser.add_option("-c", "--config-file", action="store",
                      dest="config_file", default=None,
                      help="Configuration database filename")
    parser.add_option("-p", "--data-path", dest="data_path",
                      help="Directory to search for configuration files",
                      default=None)
    parser.add_option("--cmdctl-port", dest="cmdctl_port", type="int",
                      default=None, help="Port of command control")
    parser.add_option("--pid-file", dest="pid_file", type="string",
                      default=None,
                      help="file to dump the PID of the BIND 10 process")
    parser.add_option("-w", "--wait", dest="wait_time", type="int",
                      default=10,
                      help="Time (in seconds) to wait for config manager "
                           "to start up")

    (options, args) = parser.parse_args(args)

    if options.cmdctl_port is not None:
        try:
            isc.net.parse.port_parse(options.cmdctl_port)
        except ValueError as e:
            parser.error(e)

    # Positional arguments are not accepted.
    if args:
        parser.print_help()
        sys.exit(1)

    return options

996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
def dump_pid(pid_file):
    """
    Dump the PID of the current process to the specified file.  If the given
    file is None this function does nothing.  If the file already exists,
    the existing content will be removed.  If a system error happens in
    creating or writing to the file, the corresponding exception will be
    propagated to the caller.
    """
    if pid_file is None:
        return
    # The "with" block closes the file even when the write fails.
    with open(pid_file, "w") as f:
        f.write('%d\n' % os.getpid())

def unlink_pid_file(pid_file):
    """
    Remove the given file, which is basically expected to be the PID file
    created by dump_pid().  The specified file may or may not exist; if it
    doesn't this function does nothing.  Other system level errors in removing
    the file will be propagated as the corresponding exception.  If the given
    file is None this function does nothing.
    """
    if pid_file is None:
        return
    try:
        os.unlink(pid_file)
    except OSError as error:
        # Compare by value: errno codes are plain integers, and identity
        # ("is") between equal integers is not guaranteed.
        if error.errno != errno.ENOENT:
            raise

1025

Shane Kerr's avatar
Shane Kerr committed
1026
1027
1028
def main():
    global options
    global boss_of_bind
1029
1030
1031
    # Enforce line buffering on stdout, even when not a TTY
    sys.stdout = io.TextIOWrapper(sys.stdout.detach(), line_buffering=True)

1032
    options = parse_args()
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059

    # Check user ID.
    setuid = None
    username = None
    if options.user:
        # Try getting information about the user, assuming UID passed.
        try:
            pw_ent = pwd.getpwuid(int(options.user))
            setuid = pw_ent.pw_uid
            username = pw_ent.pw_name
        except ValueError:
            pass
        except KeyError:
            pass

        # Next try getting information about the user, assuming user name 
        # passed.
        # If the information is both a valid user name and user number, we
        # prefer the name because we try it second. A minor point, hopefully.
        try:
            pw_ent = pwd.getpwnam(options.user)
            setuid = pw_ent.pw_uid
            username = pw_ent.pw_name
        except KeyError:
            pass

        if setuid is None:
1060
            logger.fatal(BIND10_INVALID_USER, options.user)
1061
            sys.exit(1)
1062
1063

    # Announce startup.
1064
    logger.info(BIND10_STARTING, VERSION)
1065

1066
1067
1068
1069
    # Create wakeup pipe for signal handlers
    wakeup_pipe = os.pipe()
    signal.set_wakeup_fd(wakeup_pipe[1])

1070
1071
    # Set signal handlers for catching child termination, as well
    # as our own demise.
1072
1073
1074
1075
1076
    signal.signal(signal.SIGCHLD, reaper)
    signal.siginterrupt(signal.SIGCHLD, False)
    signal.signal(signal.SIGINT, fatal_signal)
    signal.signal(signal.SIGTERM, fatal_signal)