init.py.in 54.5 KB
Newer Older
1
2
#!@PYTHON@

Naoki Kambe's avatar
Naoki Kambe committed
3
# Copyright (C) 2010,2011  Internet Systems Consortium.
4
5
6
7
8
9
10
11
12
13
14
15
16
17
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND INTERNET SYSTEMS CONSORTIUM
# DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL
# INTERNET SYSTEMS CONSORTIUM BE LIABLE FOR ANY SPECIAL, DIRECT,
# INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING
# FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
# NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
# WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

18
"""
19
This file implements the b10-init program.
20

Jeremy C. Reed's avatar
Jeremy C. Reed committed
21
Its purpose is to start up the BIND 10 system, and then manage the
Shane Kerr's avatar
Shane Kerr committed
22
23
processes, by starting and stopping processes, plus restarting
processes that exit.
24
25
26
27
28
29
30
31
32

To start the system, it first runs the c-channel program (msgq), then
connects to that. It then runs the configuration manager, and reads
its own configuration. Then it proceeds to starting other modules.

The Python subprocess module is used for starting processes, but
because this is not efficient for managing groups of processes,
SIGCHLD signals are caught and processed using the signal module.

33
Most of the logic is contained in the Init class. However, since Python
34
35
36
37
38
requires that signal processing happen in the main thread, we do
signal handling outside of that class, in the code running for
__main__.
"""

39
40
import sys; sys.path.append ('@@PYTHONPATH@@')
import os
41
from isc.util.address_formatter import AddressFormatter
42

Jelte Jansen's avatar
Jelte Jansen committed
43
44
45
# If B10_FROM_SOURCE is set in the environment, we use data files
# from a directory relative to that, otherwise we use the ones
# installed on the system
46
# Pick the module specification file: the installed copy by default, or
# the one inside the source tree when B10_FROM_SOURCE is set.
if "B10_FROM_SOURCE" not in os.environ:
    PREFIX = "@prefix@"
    DATAROOTDIR = "@datarootdir@"
    # The substituted data directory may itself still contain the
    # ${datarootdir} and ${prefix} placeholders; expand them in that order.
    SPECFILE_LOCATION = ("@datadir@/@PACKAGE@/init.spec"
                         .replace("${datarootdir}", DATAROOTDIR)
                         .replace("${prefix}", PREFIX))
else:
    SPECFILE_LOCATION = (os.environ["B10_FROM_SOURCE"] +
                         "/src/bin/bind10/init.spec")
55

56
57
58
59
60
import subprocess
import signal
import re
import errno
import time
61
import select
62
import random
Evan Hunt's avatar
Evan Hunt committed
63
import socket
64
from optparse import OptionParser, OptionValueError
65
66
67
import io
import pwd
import posix
68
import copy
69

70
from bind10_config import LIBEXECPATH
71
import bind10_config
72
import isc.cc
73
import isc.util.process
Michal Vaner's avatar
Michal Vaner committed
74
import isc.net.parse
75
import isc.log
76
import isc.config
77
from isc.log_messages.init_messages import *
78
79
import isc.bind10.component
import isc.bind10.special_component
80
import isc.bind10.socket_cache
81
import isc.util.traceback_handler
82
import libutil_io_python
83
import tempfile
84

85
86
isc.log.init("b10-init", buffer=True)
logger = isc.log.Logger("init")
87
88
89

# Pending system-wide debug level definitions, the ones we
# use here are hardcoded for now
90
91
DBG_PROCESS = logger.DBGLVL_TRACE_BASIC
DBG_COMMANDS = logger.DBGLVL_TRACE_DETAIL
Michal Vaner's avatar
Michal Vaner committed
92

93
94
# Status lines sent over the unix domain socket ahead of a socket
# transfer: they tell the peer whether a real socket follows or not.
CREATOR_SOCKET_OK = b"1\n"
CREATOR_SOCKET_UNAVAILABLE = b"0\n"

# Result codes reported for the known failure kinds of the get_token
# command.
CREATOR_SOCKET_ERROR = 2
CREATOR_SHARE_ERROR = 3

Michal Vaner's avatar
Michal Vaner committed
102
# Assign this process some longer name
103
isc.util.process.rename()
104
105

# This is the version that gets displayed to the user.
106
107
# The VERSION string consists of the module name, the module version
# number, and the overall BIND 10 version number (set in configure.ac).
108
VERSION = "bind10 20110223 (BIND 10 @PACKAGE_VERSION@)"
109

Jelte Jansen's avatar
Jelte Jansen committed
110
# This is for boot_time of Init
111
_BASETIME = time.gmtime()
112

113
114
115
116
117
118
119
120
121
122
123
124
125
# Detailed error message commonly used on startup failure, possibly due to
# permission issue regarding log lock file.  We dump verbose message because
# it may not be clear exactly what to do if it simply says
# "failed to open <filename>: permission denied"
# Verbose hint shown on startup failure that may be caused by permission
# problems on the log lock file.
NOTE_ON_LOCK_FILE = """\
TIP: if this is about permission error for a lock file, check if the directory
of the file is writable for the user of the bind10 process; often you need
to start bind10 as a super user.  Also, if you specify the -u option to
change the user and group, the directory must be writable for the group,
and the created lock file must be writable for that user. Finally, make sure
the lock file is not left in the directory before restarting.
"""

126
127
class ProcessInfoError(Exception):
    """Error related to the information kept about a process."""

128
129
130
131
132
133
134
135
136
137
class ChangeUserError(Exception):
    '''Raised when changing the user or group (setuid/setgid) fails.

    The exception is meant to travel up through the component management
    modules to the top level, where it helps produce a useful fatal error
    message.
    '''

138
139
140
class ProcessInfo:
    """Information about a child process and the means to (re)start it.

    Attributes:
        name: name given to the process by the caller.
        args: argument list handed to subprocess.Popen.
        env: extra environment variables for the child; they are laid over
            a copy of the environment of b10-init when the child is spawned.
        dev_null_stdout, dev_null_stderr: when true, the corresponding
            stream of the child is sent to the null device.
        process: the subprocess.Popen object, None until first spawned.
        pid: process ID of the child, None until first spawned.
    """

    # Write handle on the null device shared by all instances; it is opened
    # once, when the class is defined, and used as the target for discarded
    # child output.
    dev_null = open(os.devnull, "w")

    def __init__(self, name, args, env=None, dev_null_stdout=False,
                 dev_null_stderr=False):
        self.name = name
        self.args = args
        # Use a fresh dict per instance: a literal {} default would be one
        # single object shared by every ProcessInfo created without env.
        self.env = {} if env is None else env
        self.dev_null_stdout = dev_null_stdout
        self.dev_null_stderr = dev_null_stderr
        self.process = None
        self.pid = None

    def _preexec_work(self):
        """Run in the child between fork and exec (Popen's preexec_fn)."""
        # Put the child into a separate process group so it doesn't get
        # SIGINT signals on Ctrl-C (b10-init will shut everything down by
        # other means).
        os.setpgrp()

    def _spawn(self):
        """Start the process and record its Popen object and PID."""
        spawn_stdout = self.dev_null if self.dev_null_stdout else None
        spawn_stderr = self.dev_null if self.dev_null_stderr else None

        # Environment variables for the child process are a copy of those
        # of the b10-init process plus the specific variables given on
        # construction (self.env).  The copy must be a plain dict: copying
        # os.environ with the copy module gives another os._Environ object,
        # and assigning to that still calls putenv() for this process.
        spawn_env = dict(os.environ)
        spawn_env.update(self.env)
        # Search the libexec directory first.  If there is no PATH to
        # extend, use the libexec directory alone; a trailing ':' would be
        # an empty PATH entry.
        inherited_path = spawn_env.get('PATH')
        if inherited_path:
            spawn_env['PATH'] = LIBEXECPATH + ':' + inherited_path
        else:
            spawn_env['PATH'] = LIBEXECPATH

        self.process = subprocess.Popen(self.args,
                                        stdin=subprocess.PIPE,
                                        stdout=spawn_stdout,
                                        stderr=spawn_stderr,
                                        close_fds=True,
                                        env=spawn_env,
                                        preexec_fn=self._preexec_work)
        self.pid = self.process.pid

    # spawn() and respawn() are the same for now, but in the future they
    # may have different functionality
    def spawn(self):
        """Start the process for the first time."""
        self._spawn()

    def respawn(self):
        """Start the process again."""
        self._spawn()

193
194
class CChannelConnectError(Exception):
    """Raised when the connection to the c-channel cannot be established."""

195
196
class ProcessStartError(Exception):
    """Raised when a process does not complete its startup."""

197
class Init:
Jelte Jansen's avatar
Jelte Jansen committed
198
    """Init of BIND class."""
199

200
    def __init__(self, msgq_socket_file=None, data_path=None,
                 config_filename=None, clear_config=False,
                 verbose=False, nokill=False, setuid=None, setgid=None,
                 username=None, cmdctl_port=None, wait_time=10):
        """
            Set up the state of Init.  This is a singleton (only one can
            run).  No process is started here.

            msgq_socket_file is the UNIX domain socket file that the msgq
            process listens on.  If verbose is True, b10-init reports what
            it is doing (full debug logging is switched on).

            data_path, config_filename and clear_config are passed through
            to the config manager (if provided) and select the config file
            to be used.

            cmdctl_port is passed to cmdctl and specifies the port it
            should listen on.

            setuid and setgid are the user and group IDs that change_user()
            switches to; None means no change.

            wait_time is the amount of time (in seconds) that Init waits
            for selected processes to initialize before continuing with the
            initialization.  Currently this is only the configuration
            manager.
        """
        # Values given by the caller.
        self.msgq_socket_file = msgq_socket_file
        self.data_path = data_path
        self.config_filename = config_filename
        self.clear_config = clear_config
        self.verbose = verbose
        self.nokill = nokill
        self.__uid = setuid
        self.__gid = setgid
        self.username = username
        self.cmdctl_port = cmdctl_port
        self.wait_time = wait_time

        # Sessions on the command channel and the name of the process
        # currently being started.
        self.cc_session = None
        self.ccs = None
        self.curproc = None
        self.component_config = {}

        # Running components, keyed by PID.  Some time in the future a
        # single component may have multiple processes (like a
        # pipeline-like component); the name "components" may then be
        # inappropriate, but the code is probably not ready for that, so it
        # stays for now.  Multiple instances of one component should be
        # supported as well; whether that means one component with several
        # identical processes or several components with the same
        # configuration (what is done now, but might change) is an open
        # question.
        self.components = {}
        # Simple list of components that died and wait for a restart.
        # Components manage their own restart schedule now.
        self.components_to_restart = []
        self.runnable = False
        # Number of seconds start_msgq() keeps trying to connect.
        self.msgq_timeout = 5

        # _run_under_unittests is only meant to be used when testing. It
        # bypasses execution of some code to help with testing.
        self._run_under_unittests = False

        self._component_configurator = isc.bind10.component.Configurator(
            self, isc.bind10.special_component.get_specials())
        # The priorities make the core start in the correct order: first
        # the socket creator (which would drop root privileges by then),
        # then the message queue and after that the config manager.
        self.__core_components = {
            name: {'kind': 'core', 'special': name, 'priority': priority}
            for name, priority in (('sockcreator', 200),
                                   ('msgq', 199),
                                   ('cfgmgr', 198))
        }
        self.__started = False
        self.exitcode = 0

        # If -v was set, enable full debug logging.
        if self.verbose:
            logger.set_severity("DEBUG", 99)

        # This is set in init_socket_srv
        self._socket_path = None
        self._socket_cache = None
        self._tmpdir = None
        self._srv_socket = None
        self._unix_sockets = {}
292

293
294
295
296
297
298
    def __propagate_component_config(self, config):
        comps = dict(config)
        # Fill in the core components, so they stay alive
        for comp in self.__core_components:
            if comp in comps:
                raise Exception(comp + " is core component managed by " +
299
                                "b10-init, do not set it")
300
301
302
303
            comps[comp] = self.__core_components[comp]
        # Update the configuration
        self._component_configurator.reconfigure(comps)

304
305
306
307
308
309
310
311
    def change_user(self):
        '''Change the user and group to those specified on construction.

        This method is expected to be called by a component on initial
        startup when the system is ready to switch the user and group
        (i.e., once all components that need the privilege of the original
        user have started).
        '''
312
313
314
315
316
317
318
319
320
321
322
323
324
        try:
            if self.__gid is not None:
                logger.info(BIND10_SETGID, self.__gid)
                posix.setgid(self.__gid)
        except Exception as ex:
            raise ChangeUserError('failed to change group: ' + str(ex))

        try:
            if self.__uid is not None:
                posix.setuid(self.__uid)
                # We use one-shot logger after setuid here.  This will
                # detect any permission issue regarding logging due to the
                # result of setuid at the earliest opportunity.
325
                isc.log.Logger("b10-init").info(BIND10_SETUID, self.__uid)
326
327
        except Exception as ex:
            raise ChangeUserError('failed to change user: ' + str(ex))
328

329
    def config_handler(self, new_config):
330
        # If this is initial update, don't do anything now, leave it to startup
331
332
        if not self.runnable:
            return
333
334
        logger.debug(DBG_COMMANDS, BIND10_RECEIVED_NEW_CONFIGURATION,
                     new_config)
335
336
337
338
339
        try:
            if 'components' in new_config:
                self.__propagate_component_config(new_config['components'])
            return isc.config.ccsession.create_answer(0)
        except Exception as e:
Mukund Sivaraman's avatar
Mukund Sivaraman committed
340
            logger.error(BIND10_RECONFIGURE_ERROR, e)
341
            return isc.config.ccsession.create_answer(1, str(e))
342

Shane Kerr's avatar
Shane Kerr committed
343
    def get_processes(self):
344
        pids = list(self.components.keys())
Shane Kerr's avatar
Shane Kerr committed
345
346
347
        pids.sort()
        process_list = [ ]
        for pid in pids:
348
349
            process_list.append([pid, self.components[pid].name(),
                                 self.components[pid].address()])
Shane Kerr's avatar
Shane Kerr committed
350
351
        return process_list

352
    def _get_stats_data(self):
        """
            Return the statistics of Init: a dictionary whose only item,
            'boot_time', is the module load time formatted as
            YYYY-MM-DDTHH:MM:SSZ.
        """
        boot_time = time.strftime('%Y-%m-%dT%H:%M:%SZ', _BASETIME)
        return {'boot_time': boot_time}
356

Jelte Jansen's avatar
Jelte Jansen committed
357
    def command_handler(self, command, args):
        """
            Execute a command received over the command channel and
            return the answer for it.

            Known commands are "shutdown", "getstats", "ping",
            "show_processes", "get_socket" and "drop_socket" (which needs
            a "token" item in args).  A command that is not a string is
            answered with "bad command", any other unknown one with
            "Unknown command"; both use result code 1.
        """
        logger.debug(DBG_COMMANDS, BIND10_RECEIVED_COMMAND, command)
        create_answer = isc.config.ccsession.create_answer

        if not isinstance(command, str):
            return create_answer(1, "bad command")

        if command == "shutdown":
            self.runnable = False
            return create_answer(0)
        if command == "getstats":
            return create_answer(0, self._get_stats_data())
        if command == "ping":
            return create_answer(0, "pong")
        if command == "show_processes":
            return create_answer(0, self.get_processes())
        if command == "get_socket":
            return self._get_socket(args)
        if command == "drop_socket":
            # args is not a mapping when the command comes without any
            # arguments (e.g. None); report that as a missing token
            # instead of failing on the membership test.
            if not isinstance(args, dict) or "token" not in args:
                return create_answer(1, "Missing token parameter")
            try:
                self._socket_cache.drop_socket(args["token"])
                return create_answer(0)
            except Exception as e:
                return create_answer(1, str(e))

        return create_answer(1, "Unknown command")
391

392
    def kill_started_components(self):
        """
            Kill whatever has been started so far and forget about it.

            This is called as part of the exception handling when a
            process fails to start.  The children are killed and the
            record of components is then emptied.
        """
        logger.info(BIND10_KILLING_ALL_PROCESSES)
        self.__kill_children(True)
        self.components = dict()
401

Michal 'vorner' Vaner's avatar
Michal 'vorner' Vaner committed
402
    def _read_bind10_config(self):
        """
            Read the parameters of the Init module itself, that is the
            list of components to be started now, and apply it.

            This could easily be merged into the code starting all the
            processes, but it stays separate for historical reasons and
            because the tests sometimes replace the method.
        """
        logger.info(BIND10_READING_INIT_CONFIGURATION)

        full_config = self.ccs.get_full_config()
        component_config = full_config['components']
        self.__propagate_component_config(component_config)
416
417
418
419

    def log_starting(self, process, port = None, address = None):
        """
            Log a "Starting xxx" message in a form that is consistent
            across all processes.

            The process name is also stored in self.curproc; it is used to
            tell which process failed to start if there is an error, and
            in the "Started" message on success.  The optional port and
            address are added to the message when present (the port alone
            if there is no address).
        """
        self.curproc = process
        if address is not None:
            logger.info(BIND10_STARTING_PROCESS_PORT_ADDRESS,
                        self.curproc, AddressFormatter((address, port)))
        elif port is not None:
            logger.info(BIND10_STARTING_PROCESS_PORT, self.curproc,
                        port)
        else:
            logger.info(BIND10_STARTING_PROCESS, self.curproc)
439

440
441
442
443
444
445
    def log_started(self, pid = None):
        """
            Log a 'Started xxxx' debug message for the process named in
            self.curproc, with its PID when one is given.  As with
            log_starting(), this keeps the format consistent.
        """
        if pid is not None:
            logger.debug(DBG_PROCESS, BIND10_STARTED_PROCESS_PID, self.curproc,
                         pid)
        else:
            logger.debug(DBG_PROCESS, BIND10_STARTED_PROCESS, self.curproc)
451

452
453
    def process_running(self, msg, who):
        """
Jelte Jansen's avatar
Jelte Jansen committed
454
            Some processes return a message to the Init after they have
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
            started to indicate that they are running.  The form of the
            message is a dictionary with contents {"running:", "<process>"}.
            This method checks the passed message and returns True if the
            "who" process is contained in the message (so is presumably
            running).  It returns False for all other conditions and will
            log an error if appropriate.
        """
        if msg is not None:
            try:
                if msg["running"] == who:
                    return True
                else:
                    logger.error(BIND10_STARTUP_UNEXPECTED_MESSAGE, msg)
            except:
                logger.error(BIND10_STARTUP_UNRECOGNISED_MESSAGE, msg)
470

471
472
        return False

473
    # The next few methods start the individual processes of BIND-10.  They
474
475
    # are called via start_all_processes().  If any fail, an exception is
    # raised which is caught by the caller of start_all_processes(); this kills
476
477
    # processes started up to that point before terminating the program.

478
479
    def _make_process_info(self, name, args, env,
                           dev_null_stdout=False, dev_null_stderr=False):
        """
            Create the ProcessInfo object for a process.  Having this in a
            method of its own makes it possible to override the creation
            during testing.
        """
        process_info = ProcessInfo(name, args, env, dev_null_stdout,
                                   dev_null_stderr)
        return process_info

486
    def start_msgq(self):
487
488
489
490
        """
            Start the message queue and connect to the command channel.
        """
        self.log_starting("b10-msgq")
491
492
        msgq_proc = self._make_process_info("b10-msgq", ["b10-msgq"],
                                            self.c_channel_env,
493
                                            not self.verbose, not self.verbose)
494
495
        msgq_proc.spawn()
        self.log_started(msgq_proc.pid)
496

497
        # Now connect to the c-channel
498
499
        cc_connect_start = time.time()
        while self.cc_session is None:
500
            # if we are run under unittests, break
501
            if self._run_under_unittests:
502
503
                break

504
            # if we have been trying for "a while" give up
505
            if (time.time() - cc_connect_start) > self.msgq_timeout:
506
507
                if msgq_proc.process:
                    msgq_proc.process.kill()
JINMEI Tatuya's avatar
JINMEI Tatuya committed
508
                logger.error(BIND10_CONNECTING_TO_CC_FAIL)
509
510
                raise CChannelConnectError("Unable to connect to c-channel " +
                                           "after 5 seconds")
511

512
513
            # try to connect, and if we can't wait a short while
            try:
514
                self.cc_session = isc.cc.Session(self.msgq_socket_file)
515
            except isc.cc.session.SessionError:
516
517
                time.sleep(0.1)

518
519
        # Subscribe to the message queue.  The only messages we expect to
        # receive on this channel are once relating to process startup.
520
        if self.cc_session is not None:
Jelte Jansen's avatar
Jelte Jansen committed
521
            self.cc_session.group_subscribe("Init")
522

523
524
        return msgq_proc

525
526
    def wait_msgq(self):
        """
            Wait for the message queue to fully start.  It does so only
            after the config manager connects to it.  We know it is ready
            when it starts answering commands.

            No specific command is added for this purpose: an error
            response is as good as a positive one to know it is alive.

            Raises ProcessStartError if msgq does not answer in time.
        """
        # The sleep is 10 times shorter here (since the start should be
        # fast now), so there are 10 times more attempts.
        attempts_left = self.wait_time * 10
        answered = False
        while attempts_left > 0 and not answered:
            try:
                self.ccs.rpc_call('AreYouThere?', 'Msgq')
                # This is not expected to succeed.  If it does, it is a
                # programmer error.
                raise Exception("Non-existing RPC call succeeded")
            except isc.config.RPCRecipientMissing:
                # Not there yet
                time.sleep(0.1)
                attempts_left -= 1
            except isc.config.RPCError:
                # It doesn't like the RPC, so it's alive now
                answered = True

        if not answered: # Still not started
            raise ProcessStartError("Msgq didn't complete the second stage " +
                                    "of startup")

555
    def start_cfgmgr(self):
556
557
558
559
        """
            Starts the configuration manager process
        """
        self.log_starting("b10-cfgmgr")
560
        args = ["b10-cfgmgr"]
561
        if self.data_path is not None:
562
            args.append("--data-path=" + self.data_path)
563
        if self.config_filename is not None:
564
            args.append("--config-filename=" + self.config_filename)
565
        if self.clear_config:
566
            args.append("--clear-config")
567
568
        bind_cfgd = self._make_process_info("b10-cfgmgr", args,
                                            self.c_channel_env)
569
        bind_cfgd.spawn()
570
        self.log_started(bind_cfgd.pid)
571

572
573
574
575
576
        # Wait for the configuration manager to start up as
        # subsequent initialization cannot proceed without it.  The
        # time to wait can be set on the command line.
        time_remaining = self.wait_time
        msg, env = self.cc_session.group_recvmsg()
577
578
        while time_remaining > 0 and not self.process_running(msg,
                                                              "ConfigManager"):
579
580
581
            logger.debug(DBG_PROCESS, BIND10_WAIT_CFGMGR)
            time.sleep(1)
            time_remaining = time_remaining - 1
582
            msg, env = self.cc_session.group_recvmsg()
583

584
        if not self.process_running(msg, "ConfigManager"):
585
586
            raise ProcessStartError("Configuration manager process has not " +
                                    "started")
587

588
589
        return bind_cfgd

590
591
592
593
594
595
    def start_ccsession(self, c_channel_env):
        """
            Start the CC Session (the module session kept in self.ccs).

            The argument c_channel_env is not used; it is there to keep
            the argument list the same for all start_xxx methods.

            As the CC session is not a process, the
            log_starting/log_started methods are not used for logging.
        """
        logger.info(BIND10_STARTING_CC)

        # Only one session is monitored, and msgq expects all data sent to
        # us to be read or it will close its side of the socket.  So leave
        # the group on the other CC session first.
        startup_session = self.cc_session
        if startup_session is not None:
            startup_session.group_unsubscribe("Init")

        self.ccs = isc.config.ModuleCCSession(
            SPECFILE_LOCATION,
            self.config_handler,
            self.command_handler,
            socket_file = self.msgq_socket_file)
        self.ccs.start()
        logger.debug(DBG_PROCESS, BIND10_STARTED_CC)
614
615
616

    # A couple of utility methods for starting processes...

617
618
    def start_process(self, name, args, c_channel_env, port=None,
                      address=None):
619
620
621
622
623
624
625
626
        """
            Given a set of command arguments, start the process and output
            appropriate log messages.  If the start is successful, the process
            is added to the list of started processes.

            The port and address arguments are for log messages only.
        """
        self.log_starting(name, port, address)
627
        newproc = self._make_process_info(name, args, c_channel_env)
628
        newproc.spawn()
629
        self.log_started(newproc.pid)
630
        return newproc
631

Michal 'vorner' Vaner's avatar
Michal 'vorner' Vaner committed
632
    def register_process(self, pid, component):
633
        """
634
        Put another process into b10-init to watch over it.  When the process
Michal 'vorner' Vaner's avatar
Michal 'vorner' Vaner committed
635
        dies, the component.failed() is called with the exit code.
636

637
638
        It is expected the info is a isc.bind10.component.BaseComponent
        subclass (or anything having the same interface).
639
        """
640
        self.components[pid] = component
641
642

    def start_simple(self, name):
643
644
645
646
647
648
649
650
651
652
653
654
        """
            Most of the BIND-10 processes are started with the command:

                <process-name> [-v]

            ... where -v is appended if verbose is enabled.  This method
            generates the arguments from the name and starts the process.

            The port and address arguments are for log messages only.
        """
        # Set up the command arguments.
        args = [name]
655
        if self.verbose:
656
            args += ['-v']
657

658
        # ... and start the process
659
        return self.start_process(name, args, self.c_channel_env)
660

661
662
    # The next few methods start up the rest of the BIND-10 processes.
    # Although many of these methods are little more than a call to
663
664
665
    # start_simple, they are retained (a) for testing reasons and (b) as a
    # place where modifications can be made if the process start-up sequence
    # changes for a given process.
666

667
    def start_auth(self):
668
669
670
        """
            Start the Authoritative server
        """
671
        authargs = ['b10-auth']
672
        if self.verbose:
673
            authargs += ['-v']
674

675
        # ... and start
676
        return self.start_process("b10-auth", authargs, self.c_channel_env)
677

678
    def start_resolver(self):
679
680
681
682
683
        """
            Start the Resolver.  At present, all these arguments and switches
            are pure speculation.  As with the auth daemon, they should be
            read from the configuration database.
        """
684
        self.curproc = "b10-resolver"
685
        # XXX: this must be read from the configuration manager in the future
686
        resargs = ['b10-resolver']
687
        if self.verbose:
688
            resargs += ['-v']
689

690
        # ... and start
691
        return self.start_process("b10-resolver", resargs, self.c_channel_env)
Likun Zhang's avatar
Likun Zhang committed
692

693
694
695
696
697
698
699
700
701
702
703
    def start_cmdctl(self):
        """
            Starts the command control process (b10-cmdctl).

            A "--port=N" option is added when self.cmdctl_port is set, and
            "-v" when verbose mode is on.  The port is also handed to
            start_process() as its fourth argument.
        """
        port = self.cmdctl_port
        command = ["b10-cmdctl"]
        if port is not None:
            command += ["--port=" + str(port)]
        if self.verbose:
            command += ["-v"]
        return self.start_process("b10-cmdctl", command, self.c_channel_env,
                                  port)
704

705
    def start_all_components(self):
        """
            Starts up all the components.

            No exception is caught here; anything raised while starting the
            components is left for the caller to handle.
        """
        # Bring up the real core first (sockcreator, msgq, cfgmgr).
        configurator = self._component_configurator
        configurator.startup(self.__core_components)

        # Connecting to the msgq is not a process, which is why it is done
        # here rather than inside the configurator.
        self.start_ccsession(self.c_channel_env)

        # The remaining components must not be started until msgq is
        # fully up.
        self.wait_msgq()

        # Extract the parameters associated with Init.  This can only be
        # done once the CC Session is running.  Note that the logging
        # configuration may override the "-v" switch set on the command line.
        self._read_bind10_config()

        # TODO: Return the dropping of privileges
727

728
729
    def startup(self):
        """
            Start the Init instance.

            First checks that no msgq is already listening on the configured
            socket, then starts all components.  If any component fails to
            start, the ones already started are killed again.

            Returns None if successful, otherwise a string describing the
            problem.
        """
        # Try to connect to the c-channel daemon, to see if it is already
        # running
        c_channel_env = {}
        if self.msgq_socket_file is not None:
            c_channel_env["BIND10_MSGQ_SOCKET_FILE"] = self.msgq_socket_file
        logger.debug(DBG_PROCESS, BIND10_CHECK_MSGQ_ALREADY_RUNNING)
        try:
            self.cc_session = isc.cc.Session(self.msgq_socket_file)
        except isc.cc.session.SessionError:
            # this is the case we want, where the msgq is not running
            pass
        else:
            # The connection succeeded, so something is already listening
            # on the socket: refuse to start.
            logger.fatal(BIND10_MSGQ_ALREADY_RUNNING)
            if self.msgq_socket_file is not None:
                socket_name = "socket file '" + self.msgq_socket_file + "'"
            else:
                socket_name = "default socket file"
            return "b10-msgq already running, or " + socket_name +\
                " not cleaned - cannot start"

        # Start all components.  If any one fails to start, kill all started
        # components and exit with an error indication.
        try:
            self.c_channel_env = c_channel_env
            self.start_all_components()
        except ChangeUserError as e:
            self.kill_started_components()
            return str(e) + '; ' + NOTE_ON_LOCK_FILE.replace('\n', ' ')
        except Exception as e:
            self.kill_started_components()
            # str() keeps this error path from failing itself if curproc
            # does not hold a string at the time of the failure.
            return "Unable to start " + str(self.curproc) + ": " + str(e)

        # Started successfully
        self.runnable = True
        self.__started = True
        return None

771
    def stop_process(self, process, recipient, pid):
        """
        Ask the given process to stop, friendly-like, by sending it a
        'shutdown' command over the command channel.

        The process is the name it has (in logs, etc), the recipient is the
        address on msgq. The pid is the pid of the process; it is carried in
        the command so that, when several processes share a name, the
        receiver can decide whether the request is meant for it.

        Any failure while sending is logged and then re-raised.
        """
        logger.info(BIND10_STOP_PROCESS, process)
        try:
            shutdown_cmd = isc.config.ccsession.create_command('shutdown',
                                                               {'pid': pid})
            self.cc_session.group_sendmsg(shutdown_cmd, recipient, recipient)
        except BaseException:
            # Catch-all on purpose: log whatever went wrong, then let it
            # propagate unchanged.
            logger.error(BIND10_COMPONENT_SHUTDOWN_ERROR, process)
            raise
787

788
789
    def component_shutdown(self, exitcode=0):
        """
        Stop the Init instance at a component's request. The exitcode
        indicates the desired exit code.

        If startup has already completed, the instance is only marked as no
        longer runnable, so that it terminates soon. If startup has not
        completed yet, an exception is raised instead; it is meant to
        propagate through the component and configurator to the startup
        routine and abort the startup immediately.

        The exit code is recorded in both cases.
        """
        self.exitcode = exitcode
        if self.__started:
            self.runnable = False
            return
        raise Exception("Component failed during startup")
805
806

    def shutdown(self):
        """Stop the Init instance.

        Announces that this module is stopping, asks the component
        configurator to shut everything down, waits briefly and reaps the
        children that exited. Unless self.nokill is set, the remaining
        children are then sent SIGTERM once, followed by SIGKILL repeatedly
        until no component is left.
        """
        logger.info(BIND10_SHUTDOWN)
        # Inform the rest of the system that this module is stopping. Since
        # everything will be stopped shortly, this is not really necessary,
        # but it is done to reflect that b10-init is also 'just' a module.
        # NOTE(review): the original comment said "if ccsession is still
        # there", but no such check is made before the call - confirm that
        # self.ccs is always set when shutdown() runs.
        self.ccs.send_stopping()

        # First attempt: the regular BIND 10 request to stop. This is best
        # effort, so every failure is deliberately ignored (same reach as a
        # bare except).
        try:
            self._component_configurator.shutdown()
        except BaseException:
            pass
        # XXX: some delay probably useful... how much is uncertain
        # I have changed the delay from 0.5 to 1, but sometime it's
        # still not enough.
        time.sleep(1)
        self.reap_children()

        # TERM and KILL signals are only sent when we are not prevented
        # from doing so.
        if self.nokill:
            return

        # next try sending a SIGTERM
        self.__kill_children(False)
        # finally, send SIGKILL (unmaskable termination) until everybody
        # dies
        while self.components:
            # XXX: some delay probably useful... how much is uncertain
            time.sleep(0.1)
            self.reap_children()
            self.__kill_children(True)
        logger.info(BIND10_SHUTDOWN_COMPLETE)
839

840
    def __kill_children(self, forceful):
        '''Terminate remaining subprocesses by sending a signal.

        The forceful parameter is passed on to Component.kill(); it also
        selects whether the log messages speak of SIGKILL (true) or
        SIGTERM (false). This is a dedicated subroutine of shutdown(), just
        to unify two similar cases.

        '''
        if forceful:
            logmsg, signame = BIND10_SEND_SIGKILL, "SIGKILL"
        else:
            logmsg, signame = BIND10_SEND_SIGTERM, "SIGTERM"
        # Walk over a snapshot of the values, because entries may be removed
        # from self.components while the loop is running.
        for victim in list(self.components.values()):
            logger.info(logmsg, victim.name(), victim.pid())
            try:
                victim.kill(forceful)
            except OSError as err:
                # If kill() failed due to EPERM, it doesn't make sense to
                # keep trying, so we just log the fact and forget that
                # component.  Other OSErrors are ignored (usually ESRCH
                # because the child finally exited).
                logger.info(BIND10_SEND_SIGNAL_FAIL, signame,
                            victim.name(), victim.pid(), err)
                if err.errno == errno.EPERM:
                    del self.components[victim.pid()]
865

Shane Kerr's avatar
Shane Kerr committed
866
867
868
    def _get_process_exit_status(self):
        """Poll, without blocking, for the exit status of any child.

        Returns the result of os.waitpid() unchanged.
        """
        any_child = -1
        return os.waitpid(any_child, os.WNOHANG)

869
    def reap_children(self):
        """Collect child processes that have exited, and note this for
        later handling.

        Keeps polling until no more exited children are reported. A known
        component is removed from self.components; if it still considers
        itself running and we are not shutting down, it is told that it
        failed, and queued in self.components_to_restart when it was not
        restarted right away.
        """
        while True:
            try:
                child_pid, status = self._get_process_exit_status()
            except OSError as err:
                # XXX: should be impossible to get any other error here
                if err.errno != errno.ECHILD:
                    raise
                return
            if child_pid == 0:
                return

            if child_pid not in self.components:
                logger.info(BIND10_UNKNOWN_CHILD_PROCESS_ENDED, child_pid)
                continue

            # One of the components we know about.  Get information on it.
            ended = self.components.pop(child_pid)
            logger.info(BIND10_PROCESS_ENDED, ended.name(), child_pid,
                        status)
            # Tell it it failed, but only if it matters (we are not
            # shutting down and the component considers itself to be
            # running).
            if not (ended.is_running() and self.runnable):
                continue
            # If the process wants to be restarted, but not just yet,
            # failed() returns False.
            if not ended.failed(status):
                self.components_to_restart.append(ended)
899
900

    def restart_processes(self):
        """
            Restart any dead processes:

            * Returns the time when the next process is ready to be restarted.
            * If the server is shutting down, returns 0.
            * If there are no processes, returns None.

            The values returned can be safely passed into select() as the
            timeout value.

        """
        if not self.runnable:
            return 0

        pending = []
        # Earliest time at which this queue needs to be checked again, if
        # at all.
        earliest = None
        now = time.time()
        for component in self.components_to_restart:
            # A component that was removed from the configurator since it
            # was scheduled to restart is ignored; the object is simply
            # dropped here.
            if not self._component_configurator.has_component(component):
                logger.info(BIND10_RESTART_COMPONENT_SKIPPED, component.name())
                continue
            if component.restart(now):
                continue
            pending.append(component)
            if earliest is not None and \
               not earliest > component.get_restart_time():
                continue
            earliest = component.get_restart_time()

        self.components_to_restart = pending
        return earliest
933

934
935
936
937
938
    def _get_socket(self, args):
        """
        Implementation of the get_socket CC command. It validates the
        request, asks the cache to provide the token and sends the
        information back.

        Always returns an answer built by create_answer(): code 0 with the
        token and socket path on success, CREATOR_SOCKET_ERROR or
        CREATOR_SHARE_ERROR for the corresponding cache errors, and 1 for a
        missing parameter or any other failure.
        """
        try:
            try:
                addr = isc.net.parse.addr_parse(args['address'])
                port = isc.net.parse.port_parse(args['port'])
                protocol = args['protocol']
                if protocol not in ('UDP', 'TCP'):
                    raise ValueError("Protocol must be either UDP or TCP")
                share_mode = args['share_mode']
                if share_mode not in ('ANY', 'SAMEAPP', 'NO'):
                    raise ValueError(
                        "Share mode must be one of ANY, SAMEAPP or NO")
                share_name = args['share_name']
            except KeyError as missing:
                # Only a KeyError raised while reading the request counts
                # as a missing parameter.
                problem = "Missing parameter " + str(missing)
                return isc.config.ccsession.create_answer(1, problem)

            # FIXME: This call contains blocking IPC. It is expected to be
            # short, but if it turns out to be problem, we'll need to do
            # something about it.
            token = self._socket_cache.get_token(protocol, addr, port,
                                                 share_mode, share_name)
            reply = {'token': token, 'path': self._socket_path}
            return isc.config.ccsession.create_answer(0, reply)
        except isc.bind10.socket_cache.SocketError as err:
            return isc.config.ccsession.create_answer(CREATOR_SOCKET_ERROR,
                                                      str(err))
        except isc.bind10.socket_cache.ShareError as err:
            return isc.config.ccsession.create_answer(CREATOR_SHARE_ERROR,
                                                      str(err))
        except Exception as err:
            return isc.config.ccsession.create_answer(1, str(err))
974

975
976
977
978
    def socket_request_handler(self, token, unix_socket):
        """
        Handle a token that arrived over a unix-domain socket.

        The socket identified by the token is looked up in the
        _socket_cache and passed back over unix_socket, preceded by
        CREATOR_SOCKET_OK. On any failure the problem is logged and
        CREATOR_SOCKET_UNAVAILABLE is sent instead.
        """
        try:
            # The token arrives as bytes; the cache is asked with a str.
            token = str(token, 'ASCII')
            requester = unix_socket.fileno()
            sock_fd = self._socket_cache.get_socket(token, requester)
            # FIXME: These two calls are blocking in their nature. An OS-level
            # buffer is likely to be large enough to hold all these data, but
            # if it wasn't and the remote application got stuck, we would have
            # a problem. If there appear such problems, we should do something
            # about it.
            unix_socket.sendall(CREATOR_SOCKET_OK)
            libutil_io_python.send_fd(unix_socket.fileno(), sock_fd)
        except Exception as err:
            logger.info(BIND10_NO_SOCKET, token, err)
            unix_socket.sendall(CREATOR_SOCKET_UNAVAILABLE)
994
995
996
997
998
999
1000

    def socket_consumer_dead(self, unix_socket):
        """
        Handle the closing of a unix_socket.

        Once the socket is closed, all sockets that were sent over it are
        to be considered closed as well, so the _socket_cache is told to
        drop the application identified by the socket's file descriptor.
        """
        logger.info(BIND10_LOST_SOCKET_CONSUMER, unix_socket.fileno())
        cache = self._socket_cache
        try:
            cache.drop_application(unix_socket.fileno())
        except ValueError:
            # The application holds no sockets. That is harmless and can
            # happen in real life - for example, it requests a socket, but
            # get_socket doesn't find it, so the application dies. It should
            # be rare, though.
            pass
1010

1011
    def set_creator(self, creator):
        """
        Registers a socket creator into the b10-init. The socket creator is
        not used directly, but through a cache, and that cache is created
        by this method.

        Raises a ValueError if a cache is already in place, i.e. when the
        method is called more than once.
        """
        already_set = self._socket_cache is not None
        if already_set:
            raise ValueError("A creator was inserted previously")
        self._socket_cache = isc.bind10.socket_cache.Cache(creator)

1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
    def init_socket_srv(self):
        """
        Creates and listens on a unix-domain socket to be able to send out
        the sockets.

        This method should be called after switching user, or the switched
        applications won't be able to access the socket.
        """
        self._srv_socket = socket.socket(socket.AF_UNIX)
        # We create a temporary directory somewhere safe and unique, to avoid
        # the need to find the place ourself or bother users. Also, this
        # secures the socket on some platforms, as it creates a private
        # directory.
1036
        self._tmpdir = tempfile.mkdtemp(prefix='sockcreator-')
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050