dispatch.c 88 KB
Newer Older
Michael Graff's avatar
Michael Graff committed
1
/*
Automatic Updater's avatar
Automatic Updater committed
2
 * Copyright (C) 2004-2009  Internet Systems Consortium, Inc. ("ISC")
Mark Andrews's avatar
Mark Andrews committed
3
 * Copyright (C) 1999-2003  Internet Software Consortium.
4
 *
Automatic Updater's avatar
Automatic Updater committed
5
 * Permission to use, copy, modify, and/or distribute this software for any
Michael Graff's avatar
Michael Graff committed
6 7
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
8
 *
Mark Andrews's avatar
Mark Andrews committed
9 10 11 12 13 14 15
 * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
 * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
 * PERFORMANCE OF THIS SOFTWARE.
Michael Graff's avatar
Michael Graff committed
16 17
 */

18
/* $Id: dispatch.c,v 1.169 2011/02/03 05:41:54 marka Exp $ */
19 20

/*! \file */
David Lawrence's avatar
David Lawrence committed
21

Michael Graff's avatar
Michael Graff committed
22 23 24
#include <config.h>

#include <stdlib.h>
25 26
#include <sys/types.h>
#include <unistd.h>
27
#include <stdlib.h>
Michael Graff's avatar
Michael Graff committed
28

29
#include <isc/entropy.h>
Michael Graff's avatar
Michael Graff committed
30
#include <isc/mem.h>
31
#include <isc/mutex.h>
32
#include <isc/portset.h>
33
#include <isc/print.h>
34
#include <isc/random.h>
35
#include <isc/stats.h>
36
#include <isc/string.h>
37
#include <isc/task.h>
38
#include <isc/time.h>
Michael Graff's avatar
Michael Graff committed
39
#include <isc/util.h>
Michael Graff's avatar
Michael Graff committed
40

41
#include <dns/acl.h>
Michael Graff's avatar
Michael Graff committed
42
#include <dns/dispatch.h>
43 44
#include <dns/events.h>
#include <dns/log.h>
45
#include <dns/message.h>
46
#include <dns/portlist.h>
47
#include <dns/stats.h>
48
#include <dns/tcpmsg.h>
49 50
#include <dns/types.h>

51 52
typedef ISC_LIST(dns_dispentry_t)	dns_displist_t;

53
typedef struct dispsocket		dispsocket_t;
54 55
typedef ISC_LIST(dispsocket_t)		dispsocketlist_t;

56 57 58
typedef struct dispportentry		dispportentry_t;
typedef ISC_LIST(dispportentry_t)	dispportlist_t;

59 60 61 62 63 64 65 66 67
/* ARC4 Random generator state */
typedef struct arc4ctx {
	isc_uint8_t	i;
	isc_uint8_t	j;
	isc_uint8_t	s[256];
	int		count;
	isc_entropy_t	*entropy;	/*%< entropy source for ARC4 */
	isc_mutex_t	*lock;
} arc4ctx_t;
68

69 70
typedef struct dns_qid {
	unsigned int	magic;
71 72
	unsigned int	qid_nbuckets;	/*%< hash table size */
	unsigned int	qid_increment;	/*%< id increment on collision */
73
	isc_mutex_t	lock;
74
	dns_displist_t	*qid_table;	/*%< the table itself */
75
	dispsocketlist_t *sock_table;	/*%< socket table */
76 77
} dns_qid_t;

78 79 80 81
struct dns_dispatchmgr {
	/* Unlocked. */
	unsigned int			magic;
	isc_mem_t		       *mctx;
82
	dns_acl_t		       *blackhole;
83
	dns_portlist_t		       *portlist;
84
	isc_stats_t		       *stats;
85
	isc_entropy_t		       *entropy; /*%< entropy source */
86 87 88 89 90

	/* Locked by "lock". */
	isc_mutex_t			lock;
	unsigned int			state;
	ISC_LIST(dns_dispatch_t)	list;
91

92 93 94 95
	/* Locked by arc4_lock. */
	isc_mutex_t			arc4_lock;
	arc4ctx_t			arc4ctx;    /*%< ARC4 context for QID */

96
	/* locked by buffer lock */
97
	dns_qid_t			*qid;
98
	isc_mutex_t			buffer_lock;
99 100 101
	unsigned int			buffers;    /*%< allocated buffers */
	unsigned int			buffersize; /*%< size of each buffer */
	unsigned int			maxbuffers; /*%< max buffers */
102 103 104

	/* Locked internally. */
	isc_mutex_t			pool_lock;
105 106 107 108
	isc_mempool_t		       *epool;	/*%< memory pool for events */
	isc_mempool_t		       *rpool;	/*%< memory pool for replies */
	isc_mempool_t		       *dpool;  /*%< dispatch allocations */
	isc_mempool_t		       *bpool;	/*%< memory pool for buffers */
109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129
	isc_mempool_t		       *spool;	/*%< memory pool for dispsocs */

	/*%
	 * Locked by qid->lock if qid exists; otherwise, can be used without
	 * being locked.
	 * Memory footprint considerations: this is a simple implementation of
	 * available ports, i.e., an ordered array of the actual port numbers.
	 * This will require about 256KB of memory in the worst case (128KB for
	 * each of IPv4 and IPv6).  We could reduce it by representing it as a
	 * more sophisticated way such as a list (or array) of ranges that are
	 * searched to identify a specific port.  Our decision here is the saved
	 * memory isn't worth the implementation complexity, considering the
	 * fact that the whole BIND9 process (which is mainly named) already
	 * requires a pretty large memory footprint.  We may, however, have to
	 * revisit the decision when we want to use it as a separate module for
	 * an environment where memory requirement is severer.
	 */
	in_port_t	*v4ports;	/*%< available ports for IPv4 */
	unsigned int	nv4ports;	/*%< # of available ports for IPv4 */
	in_port_t	*v6ports;	/*%< available ports for IPv4 */
	unsigned int	nv6ports;	/*%< # of available ports for IPv4 */
130 131 132 133 134 135
};

#define MGR_SHUTTINGDOWN		0x00000001U
#define MGR_IS_SHUTTINGDOWN(l)	(((l)->state & MGR_SHUTTINGDOWN) != 0)

#define IS_PRIVATE(d)	(((d)->attributes & DNS_DISPATCHATTR_PRIVATE) != 0)
Michael Graff's avatar
Michael Graff committed
136

Michael Graff's avatar
Michael Graff committed
137
struct dns_dispentry {
Michael Graff's avatar
Michael Graff committed
138
	unsigned int			magic;
139
	dns_dispatch_t		       *disp;
140
	dns_messageid_t			id;
141
	in_port_t			port;
Michael Graff's avatar
Michael Graff committed
142
	unsigned int			bucket;
Michael Graff's avatar
Michael Graff committed
143
	isc_sockaddr_t			host;
Michael Graff's avatar
Michael Graff committed
144 145 146
	isc_task_t		       *task;
	isc_taskaction_t		action;
	void			       *arg;
Michael Graff's avatar
Michael Graff committed
147
	isc_boolean_t			item_out;
148
	dispsocket_t			*dispsocket;
Michael Graff's avatar
Michael Graff committed
149
	ISC_LIST(dns_dispatchevent_t)	items;
Michael Graff's avatar
Michael Graff committed
150
	ISC_LINK(dns_dispentry_t)	link;
151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177
};

/*%
 * Maximum number of dispatch sockets that can be pooled for reuse.  The
 * appropriate value may vary, but experiments have shown a busy caching server
 * may need more than 1000 sockets concurrently opened.  The maximum allowable
 * number of dispatch sockets (per manager) will be set to the double of this
 * value.
 */
#ifndef DNS_DISPATCH_POOLSOCKS
#define DNS_DISPATCH_POOLSOCKS			2048
#endif

/*%
 * Quota to control the number of dispatch sockets.  If a dispatch has more
 * than the quota of sockets, new queries will purge oldest ones, so that
 * a massive number of outstanding queries won't prevent subsequent queries
 * (especially if the older ones take longer time and result in timeout).
 */
#ifndef DNS_DISPATCH_SOCKSQUOTA
#define DNS_DISPATCH_SOCKSQUOTA			3072
#endif

struct dispsocket {
	unsigned int			magic;
	isc_socket_t			*socket;
	dns_dispatch_t			*disp;
178
	isc_sockaddr_t			host;
179 180
	in_port_t			localport; /* XXX: should be removed later */
	dispportentry_t			*portentry;
181 182 183
	dns_dispentry_t			*resp;
	isc_task_t			*task;
	ISC_LINK(dispsocket_t)		link;
184 185
	unsigned int			bucket;
	ISC_LINK(dispsocket_t)		blink;
Michael Graff's avatar
Michael Graff committed
186
};
Michael Graff's avatar
Michael Graff committed
187

188 189 190 191 192 193 194 195 196 197 198 199 200 201 202
/*%
 * A port table entry.  We remember every port we first open in a table with a
 * reference counter so that we can 'reuse' the same port (with different
 * destination addresses) using the SO_REUSEADDR socket option.
 */
struct dispportentry {
	in_port_t			port;
	unsigned int			refs;
	ISC_LINK(struct dispportentry)	link;
};

#ifndef DNS_DISPATCH_PORTTABLESIZE
#define DNS_DISPATCH_PORTTABLESIZE	1024
#endif

203
#define INVALID_BUCKET		(0xffffdead)
Michael Graff's avatar
Michael Graff committed
204

205 206 207 208 209 210 211
/*%
 * Number of tasks for each dispatch that use separate sockets for different
 * transactions.  This must be a power of 2 as it will divide 32 bit numbers
 * to get an uniformly random tasks selection.  See get_dispsocket().
 */
#define MAX_INTERNAL_TASKS	64

Michael Graff's avatar
Michael Graff committed
212 213
struct dns_dispatch {
	/* Unlocked. */
214 215
	unsigned int		magic;		/*%< magic */
	dns_dispatchmgr_t      *mgr;		/*%< dispatch manager */
216 217 218 219 220 221 222
	int			ntasks;
	/*%
	 * internal task buckets.  We use multiple tasks to distribute various
	 * socket events well when using separate dispatch sockets.  We use the
	 * 1st task (task[0]) for internal control events.
	 */
	isc_task_t	       *task[MAX_INTERNAL_TASKS];
223 224
	isc_socket_t	       *socket;		/*%< isc socket attached to */
	isc_sockaddr_t		local;		/*%< local address */
225
	in_port_t		localport;	/*%< local UDP port */
226
	unsigned int		maxrequests;	/*%< max requests */
227
	isc_event_t	       *ctlevent;
Michael Graff's avatar
Michael Graff committed
228

229
	/*% Locked by mgr->lock. */
230 231 232
	ISC_LINK(dns_dispatch_t) link;

	/* Locked by "lock". */
233
	isc_mutex_t		lock;		/*%< locks all below */
234
	isc_sockettype_t	socktype;
235
	unsigned int		attributes;
236 237
	unsigned int		refcount;	/*%< number of users */
	dns_dispatchevent_t    *failsafe_ev;	/*%< failsafe cancel event */
Michael Graff's avatar
Michael Graff committed
238
	unsigned int		shutting_down : 1,
239 240
				shutdown_out : 1,
				connected : 1,
241
				tcpmsg_valid : 1,
242
				recv_pending : 1; /*%< is a recv() pending? */
Michael Graff's avatar
Michael Graff committed
243
	isc_result_t		shutdown_why;
244 245 246
	ISC_LIST(dispsocket_t)	activesockets;
	ISC_LIST(dispsocket_t)	inactivesockets;
	unsigned int		nsockets;
247 248 249
	unsigned int		requests;	/*%< how many requests we have */
	unsigned int		tcpbuffers;	/*%< allocated buffers */
	dns_tcpmsg_t		tcpmsg;		/*%< for tcp streams */
250
	dns_qid_t		*qid;
251
	arc4ctx_t		arc4ctx;	/*%< for QID/UDP port num */
252 253
	dispportlist_t		*port_table;	/*%< hold ports 'owned' by us */
	isc_mempool_t		*portpool;	/*%< port table entries  */
Michael Graff's avatar
Michael Graff committed
254 255
};

256 257 258
#define QID_MAGIC		ISC_MAGIC('Q', 'i', 'd', ' ')
#define VALID_QID(e)		ISC_MAGIC_VALID((e), QID_MAGIC)

259 260
#define RESPONSE_MAGIC		ISC_MAGIC('D', 'r', 's', 'p')
#define VALID_RESPONSE(e)	ISC_MAGIC_VALID((e), RESPONSE_MAGIC)
Michael Graff's avatar
Michael Graff committed
261

262 263 264
#define DISPSOCK_MAGIC		ISC_MAGIC('D', 's', 'o', 'c')
#define VALID_DISPSOCK(e)	ISC_MAGIC_VALID((e), DISPSOCK_MAGIC)

265 266
#define DISPATCH_MAGIC		ISC_MAGIC('D', 'i', 's', 'p')
#define VALID_DISPATCH(e)	ISC_MAGIC_VALID((e), DISPATCH_MAGIC)
Michael Graff's avatar
Michael Graff committed
267

268 269
#define DNS_DISPATCHMGR_MAGIC	ISC_MAGIC('D', 'M', 'g', 'r')
#define VALID_DISPATCHMGR(e)	ISC_MAGIC_VALID((e), DNS_DISPATCHMGR_MAGIC)
Michael Graff's avatar
Michael Graff committed
270

271 272
#define DNS_QID(disp) ((disp)->socktype == isc_sockettype_tcp) ? \
		       (disp)->qid : (disp)->mgr->qid
273 274 275 276 277 278 279 280 281 282 283 284 285
#define DISP_ARC4CTX(disp) ((disp)->socktype == isc_sockettype_udp) ? \
			(&(disp)->arc4ctx) : (&(disp)->mgr->arc4ctx)

/*%
 * Locking a query port buffer is a bit tricky.  We access the buffer without
 * locking until qid is created.  Technically, there is a possibility of race
 * between the creation of qid and access to the port buffer; in practice,
 * however, this should be safe because qid isn't created until the first
 * dispatch is created and there should be no contending situation until then.
 */
#define PORTBUFLOCK(mgr) if ((mgr)->qid != NULL) LOCK(&((mgr)->qid->lock))
#define PORTBUFUNLOCK(mgr) if ((mgr)->qid != NULL) UNLOCK((&(mgr)->qid->lock))

286
/*
287
 * Statics.
288
 */
289 290
static dns_dispentry_t *entry_search(dns_qid_t *, isc_sockaddr_t *,
				     dns_messageid_t, in_port_t, unsigned int);
291
static isc_boolean_t destroy_disp_ok(dns_dispatch_t *);
292
static void destroy_disp(isc_task_t *task, isc_event_t *event);
293 294 295 296 297
static void destroy_dispsocket(dns_dispatch_t *, dispsocket_t **);
static void deactivate_dispsocket(dns_dispatch_t *, dispsocket_t *);
static void udp_exrecv(isc_task_t *, isc_event_t *);
static void udp_shrecv(isc_task_t *, isc_event_t *);
static void udp_recv(isc_event_t *, dns_dispatch_t *, dispsocket_t *);
298
static void tcp_recv(isc_task_t *, isc_event_t *);
299 300 301
static isc_result_t startrecv(dns_dispatch_t *, dispsocket_t *);
static isc_uint32_t dns_hash(dns_qid_t *, isc_sockaddr_t *, dns_messageid_t,
			     in_port_t);
302
static void free_buffer(dns_dispatch_t *disp, void *buf, unsigned int len);
303
static void *allocate_udp_buffer(dns_dispatch_t *disp);
304 305
static inline void free_event(dns_dispatch_t *disp, dns_dispatchevent_t *ev);
static inline dns_dispatchevent_t *allocate_event(dns_dispatch_t *disp);
306
static void do_cancel(dns_dispatch_t *disp);
307 308
static dns_dispentry_t *linear_first(dns_qid_t *disp);
static dns_dispentry_t *linear_next(dns_qid_t *disp,
309
				    dns_dispentry_t *resp);
310
static void dispatch_free(dns_dispatch_t **dispp);
311 312 313 314
static isc_result_t get_udpsocket(dns_dispatchmgr_t *mgr,
				  dns_dispatch_t *disp,
				  isc_socketmgr_t *sockmgr,
				  isc_sockaddr_t *localaddr,
315
				  isc_socket_t **sockp);
316 317 318 319 320 321 322 323 324
static isc_result_t dispatch_createudp(dns_dispatchmgr_t *mgr,
				       isc_socketmgr_t *sockmgr,
				       isc_taskmgr_t *taskmgr,
				       isc_sockaddr_t *localaddr,
				       unsigned int maxrequests,
				       unsigned int attributes,
				       dns_dispatch_t **dispp);
static isc_boolean_t destroy_mgr_ok(dns_dispatchmgr_t *mgr);
static void destroy_mgr(dns_dispatchmgr_t **mgrp);
325
static isc_result_t qid_allocate(dns_dispatchmgr_t *mgr, unsigned int buckets,
326 327
				 unsigned int increment, dns_qid_t **qidp,
				 isc_boolean_t needaddrtable);
328
static void qid_destroy(isc_mem_t *mctx, dns_qid_t **qidp);
329
static isc_result_t open_socket(isc_socketmgr_t *mgr, isc_sockaddr_t *local,
330
				unsigned int options, isc_socket_t **sockp);
331 332
static isc_boolean_t portavailable(dns_dispatchmgr_t *mgr, isc_socket_t *sock,
				   isc_sockaddr_t *sockaddrp);
333 334

#define LVL(x) ISC_LOG_DEBUG(x)
Michael Graff's avatar
Michael Graff committed
335

336 337 338 339
static void
mgr_log(dns_dispatchmgr_t *mgr, int level, const char *fmt, ...)
     ISC_FORMAT_PRINTF(3, 4);

340
static void
341
mgr_log(dns_dispatchmgr_t *mgr, int level, const char *fmt, ...) {
342 343 344
	char msgbuf[2048];
	va_list ap;

345 346 347
	if (! isc_log_wouldlog(dns_lctx, level))
		return;

348 349 350 351
	va_start(ap, fmt);
	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
	va_end(ap);

352 353 354 355 356
	isc_log_write(dns_lctx,
		      DNS_LOGCATEGORY_DISPATCH, DNS_LOGMODULE_DISPATCH,
		      level, "dispatchmgr %p: %s", mgr, msgbuf);
}

357
static inline void
Mark Andrews's avatar
Mark Andrews committed
358
inc_stats(dns_dispatchmgr_t *mgr, isc_statscounter_t counter) {
359 360 361 362
	if (mgr->stats != NULL)
		isc_stats_increment(mgr->stats, counter);
}

363 364 365 366
static void
dispatch_log(dns_dispatch_t *disp, int level, const char *fmt, ...)
     ISC_FORMAT_PRINTF(3, 4);

367 368 369 370 371
static void
dispatch_log(dns_dispatch_t *disp, int level, const char *fmt, ...) {
	char msgbuf[2048];
	va_list ap;

Andreas Gustafsson's avatar
Andreas Gustafsson committed
372 373
	if (! isc_log_wouldlog(dns_lctx, level))
		return;
374

375 376 377 378 379 380 381
	va_start(ap, fmt);
	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
	va_end(ap);

	isc_log_write(dns_lctx,
		      DNS_LOGCATEGORY_DISPATCH, DNS_LOGMODULE_DISPATCH,
		      level, "dispatch %p: %s", disp, msgbuf);
382 383
}

384 385 386 387 388
static void
request_log(dns_dispatch_t *disp, dns_dispentry_t *resp,
	    int level, const char *fmt, ...)
     ISC_FORMAT_PRINTF(4, 5);

389 390
static void
request_log(dns_dispatch_t *disp, dns_dispentry_t *resp,
391
	    int level, const char *fmt, ...)
392 393 394 395 396
{
	char msgbuf[2048];
	char peerbuf[256];
	va_list ap;

397 398 399
	if (! isc_log_wouldlog(dns_lctx, level))
		return;

400 401 402 403 404
	va_start(ap, fmt);
	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
	va_end(ap);

	if (VALID_RESPONSE(resp)) {
Andreas Gustafsson's avatar
Andreas Gustafsson committed
405
		isc_sockaddr_format(&resp->host, peerbuf, sizeof(peerbuf));
406 407
		isc_log_write(dns_lctx, DNS_LOGCATEGORY_DISPATCH,
			      DNS_LOGMODULE_DISPATCH, level,
408
			      "dispatch %p response %p %s: %s", disp, resp,
409 410
			      peerbuf, msgbuf);
	} else {
411 412 413 414
		isc_log_write(dns_lctx, DNS_LOGCATEGORY_DISPATCH,
			      DNS_LOGMODULE_DISPATCH, level,
			      "dispatch %p req/resp %p: %s", disp, resp,
			      msgbuf);
415 416 417
	}
}

418 419
/*%
 * ARC4 random number generator derived from OpenBSD.
420
 * Only dispatch_random() and dispatch_uniformrandom() are expected
421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438
 * to be called from general dispatch routines; the rest of them are subroutines
 * for these two.
 *
 * The original copyright follows:
 * Copyright (c) 1996, David Mazieres <dm@uun.org>
 * Copyright (c) 2008, Damien Miller <djm@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
439
 */
440
#ifdef BIND9
441
static void
442 443 444
dispatch_initrandom(arc4ctx_t *actx, isc_entropy_t *entropy,
		    isc_mutex_t *lock)
{
445 446 447 448 449 450 451 452 453
	int n;
	for (n = 0; n < 256; n++)
		actx->s[n] = n;
	actx->i = 0;
	actx->j = 0;
	actx->count = 0;
	actx->entropy = entropy; /* don't have to attach */
	actx->lock = lock;
}
454

455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480
static void
dispatch_arc4addrandom(arc4ctx_t *actx, unsigned char *dat, int datlen) {
	int n;
	isc_uint8_t si;

	actx->i--;
	for (n = 0; n < 256; n++) {
		actx->i = (actx->i + 1);
		si = actx->s[actx->i];
		actx->j = (actx->j + si + dat[n % datlen]);
		actx->s[actx->i] = actx->s[actx->j];
		actx->s[actx->j] = si;
	}
	actx->j = actx->i;
}

static inline isc_uint8_t
dispatch_arc4get8(arc4ctx_t *actx) {
	isc_uint8_t si, sj;

	actx->i = (actx->i + 1);
	si = actx->s[actx->i];
	actx->j = (actx->j + si);
	sj = actx->s[actx->j];
	actx->s[actx->i] = sj;
	actx->s[actx->j] = si;
481

482
	return (actx->s[(si + sj) & 0xff]);
483 484
}

485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532
static inline isc_uint16_t
dispatch_arc4get16(arc4ctx_t *actx) {
	isc_uint16_t val;

	val = dispatch_arc4get8(actx) << 8;
	val |= dispatch_arc4get8(actx);

	return (val);
}

static void
dispatch_arc4stir(arc4ctx_t *actx) {
	int i;
	union {
		unsigned char rnd[128];
		isc_uint32_t rnd32[32];
	} rnd;
	isc_result_t result;

	if (actx->entropy != NULL) {
		/*
		 * We accept any quality of random data to avoid blocking.
		 */
		result = isc_entropy_getdata(actx->entropy, rnd.rnd,
					     sizeof(rnd), NULL, 0);
		RUNTIME_CHECK(result == ISC_R_SUCCESS);
	} else {
		for (i = 0; i < 32; i++)
			isc_random_get(&rnd.rnd32[i]);
	}
	dispatch_arc4addrandom(actx, rnd.rnd, sizeof(rnd.rnd));

	/*
	 * Discard early keystream, as per recommendations in:
	 * http://www.wisdom.weizmann.ac.il/~itsik/RC4/Papers/Rc4_ksa.ps
	 */
	for (i = 0; i < 256; i++)
		(void)dispatch_arc4get8(actx);

	/*
	 * Derived from OpenBSD's implementation.  The rationale is not clear,
	 * but should be conservative enough in safety, and reasonably large
	 * for efficiency.
	 */
	actx->count = 1600000;
}

static isc_uint16_t
533
dispatch_random(arc4ctx_t *actx) {
534 535 536 537 538 539 540 541 542 543 544 545
	isc_uint16_t result;

	if (actx->lock != NULL)
		LOCK(actx->lock);

	actx->count -= sizeof(isc_uint16_t);
	if (actx->count <= 0)
		dispatch_arc4stir(actx);
	result = dispatch_arc4get16(actx);

	if (actx->lock != NULL)
		UNLOCK(actx->lock);
Michael Graff's avatar
Michael Graff committed
546

547 548
	return (result);
}
549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577
#else
/*
 * For general purpose library, we don't have to be too strict about the
 * quality of random values.  Performance doesn't matter much, either.
 * So we simply use the isc_random module to keep the library as small as
 * possible.
 */

static void
dispatch_initrandom(arc4ctx_t *actx, isc_entropy_t *entropy,
		    isc_mutex_t *lock)
{
	UNUSED(actx);
	UNUSED(entropy);
	UNUSED(lock);

	return;
}

static isc_uint16_t
dispatch_random(arc4ctx_t *actx) {
	isc_uint32_t r;

	UNUSED(actx);

	isc_random_get(&r);
	return (r & 0xffff);
}
#endif	/* BIND9 */
578 579

static isc_uint16_t
580
dispatch_uniformrandom(arc4ctx_t *actx, isc_uint16_t upper_bound) {
581 582 583 584 585 586 587 588 589
	isc_uint16_t min, r;

	if (upper_bound < 2)
		return (0);

	/*
	 * Ensure the range of random numbers [min, 0xffff] be a multiple of
	 * upper_bound and contain at least a half of the 16 bit range.
	 */
Michael Graff's avatar
Michael Graff committed
590

591 592 593 594 595 596 597 598 599 600 601 602
	if (upper_bound > 0x8000)
		min = 1 + ~upper_bound; /* 0x8000 - upper_bound */
	else
		min = (isc_uint16_t)(0x10000 % (isc_uint32_t)upper_bound);

	/*
	 * This could theoretically loop forever but each retry has
	 * p > 0.5 (worst case, usually far better) of selecting a
	 * number inside the range we need, so it should rarely need
	 * to re-roll.
	 */
	for (;;) {
603
		r = dispatch_random(actx);
604 605 606 607 608
		if (r >= min)
			break;
	}

	return (r % upper_bound);
609
}
Michael Graff's avatar
Michael Graff committed
610

611 612 613 614
/*
 * Return a hash of the destination and message id.
 */
static isc_uint32_t
615 616 617
dns_hash(dns_qid_t *qid, isc_sockaddr_t *dest, dns_messageid_t id,
	 in_port_t port)
{
618 619 620
	unsigned int ret;

	ret = isc_sockaddr_hash(dest, ISC_TRUE);
621
	ret ^= (id << 16) | port;
622
	ret %= qid->qid_nbuckets;
623

624
	INSIST(ret < qid->qid_nbuckets);
625 626 627 628

	return (ret);
}

629 630 631
/*
 * Find the first entry in 'qid'.  Returns NULL if there are no entries.
 */
Michael Graff's avatar
Michael Graff committed
632
static dns_dispentry_t *
633
linear_first(dns_qid_t *qid) {
Michael Graff's avatar
Michael Graff committed
634 635 636 637 638
	dns_dispentry_t *ret;
	unsigned int bucket;

	bucket = 0;

639 640
	while (bucket < qid->qid_nbuckets) {
		ret = ISC_LIST_HEAD(qid->qid_table[bucket]);
Michael Graff's avatar
Michael Graff committed
641 642 643 644 645 646 647 648
		if (ret != NULL)
			return (ret);
		bucket++;
	}

	return (NULL);
}

649 650 651 652
/*
 * Find the next entry after 'resp' in 'qid'.  Return NULL if there are
 * no more entries.
 */
Michael Graff's avatar
Michael Graff committed
653
static dns_dispentry_t *
654
linear_next(dns_qid_t *qid, dns_dispentry_t *resp) {
Michael Graff's avatar
Michael Graff committed
655 656 657 658 659 660 661 662
	dns_dispentry_t *ret;
	unsigned int bucket;

	ret = ISC_LIST_NEXT(resp, link);
	if (ret != NULL)
		return (ret);

	bucket = resp->bucket;
663
	bucket++;
664 665
	while (bucket < qid->qid_nbuckets) {
		ret = ISC_LIST_HEAD(qid->qid_table[bucket]);
Michael Graff's avatar
Michael Graff committed
666 667 668 669 670 671 672
		if (ret != NULL)
			return (ret);
		bucket++;
	}

	return (NULL);
}
673

674 675 676 677 678 679 680 681 682
/*
 * The dispatch must be locked.
 */
static isc_boolean_t
destroy_disp_ok(dns_dispatch_t *disp)
{
	if (disp->refcount != 0)
		return (ISC_FALSE);

683
	if (disp->recv_pending != 0)
684 685
		return (ISC_FALSE);

686 687 688
	if (!ISC_LIST_EMPTY(disp->activesockets))
		return (ISC_FALSE);

689 690 691 692 693 694
	if (disp->shutting_down == 0)
		return (ISC_FALSE);

	return (ISC_TRUE);
}

695
/*
696 697
 * Called when refcount reaches 0 (and safe to destroy).
 *
698 699
 * The dispatcher must not be locked.
 * The manager must be locked.
700 701
 */
static void
702
destroy_disp(isc_task_t *task, isc_event_t *event) {
703
	dns_dispatch_t *disp;
704 705
	dns_dispatchmgr_t *mgr;
	isc_boolean_t killmgr;
706 707
	dispsocket_t *dispsocket;
	int i;
708

709 710 711 712 713
	INSIST(event->ev_type == DNS_EVENT_DISPATCHCONTROL);

	UNUSED(task);

	disp = event->ev_arg;
714 715
	mgr = disp->mgr;

716
	LOCK(&mgr->lock);
717
	ISC_LIST_UNLINK(mgr->list, disp, link);
Michael Graff's avatar
Michael Graff committed
718

719 720
	dispatch_log(disp, LVL(90),
		     "shutting down; detaching from sock %p, task %p",
721
		     disp->socket, disp->task[0]); /* XXXX */
722

723 724 725 726 727 728 729 730
	if (disp->socket != NULL)
		isc_socket_detach(&disp->socket);
	while ((dispsocket = ISC_LIST_HEAD(disp->inactivesockets)) != NULL) {
		ISC_LIST_UNLINK(disp->inactivesockets, dispsocket, link);
		destroy_dispsocket(disp, &dispsocket);
	}
	for (i = 0; i < disp->ntasks; i++)
		isc_task_detach(&disp->task[i]);
731
	isc_event_free(&event);
732

733
	dispatch_free(&disp);
734 735 736 737 738

	killmgr = destroy_mgr_ok(mgr);
	UNLOCK(&mgr->lock);
	if (killmgr)
		destroy_mgr(&mgr);
739 740
}

741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780
/*%
 * Manipulate port table per dispatch: find an entry for a given port number,
 * create a new entry, and decrement a given entry with possible clean-up.
 */
static dispportentry_t *
port_search(dns_dispatch_t *disp, in_port_t port) {
	dispportentry_t *portentry;

	REQUIRE(disp->port_table != NULL);

	portentry = ISC_LIST_HEAD(disp->port_table[port %
						   DNS_DISPATCH_PORTTABLESIZE]);
	while (portentry != NULL) {
		if (portentry->port == port)
			return (portentry);
		portentry = ISC_LIST_NEXT(portentry, link);
	}

	return (NULL);
}

static dispportentry_t *
new_portentry(dns_dispatch_t *disp, in_port_t port) {
	dispportentry_t *portentry;

	REQUIRE(disp->port_table != NULL);

	portentry = isc_mempool_get(disp->portpool);
	if (portentry == NULL)
		return (portentry);

	portentry->port = port;
	portentry->refs = 0;
	ISC_LINK_INIT(portentry, link);
	ISC_LIST_APPEND(disp->port_table[port % DNS_DISPATCH_PORTTABLESIZE],
			portentry, link);

	return (portentry);
}

781 782 783
/*%
 * The caller must not hold the qid->lock.
 */
784 785 786
static void
deref_portentry(dns_dispatch_t *disp, dispportentry_t **portentryp) {
	dispportentry_t *portentry = *portentryp;
Mark Andrews's avatar
Mark Andrews committed
787
	dns_qid_t *qid;
788 789 790 791

	REQUIRE(disp->port_table != NULL);
	REQUIRE(portentry != NULL && portentry->refs > 0);

792 793
	qid = DNS_QID(disp);
	LOCK(&qid->lock);
794 795 796 797 798 799 800 801 802
	portentry->refs--;
	if (portentry->refs == 0) {
		ISC_LIST_UNLINK(disp->port_table[portentry->port %
						 DNS_DISPATCH_PORTTABLESIZE],
				portentry, link);
		isc_mempool_put(disp->portpool, portentry);
	}

	*portentryp = NULL;
803
	UNLOCK(&qid->lock);
804 805
}

806 807 808 809 810 811 812 813 814 815 816 817 818 819 820
/*%
 * Find a dispsocket for socket address 'dest', and port number 'port'.
 * Return NULL if no such entry exists.
 */
static dispsocket_t *
socket_search(dns_qid_t *qid, isc_sockaddr_t *dest, in_port_t port,
	      unsigned int bucket)
{
	dispsocket_t *dispsock;

	REQUIRE(bucket < qid->qid_nbuckets);

	dispsock = ISC_LIST_HEAD(qid->sock_table[bucket]);

	while (dispsock != NULL) {
Automatic Updater's avatar
Automatic Updater committed
821
		if (dispsock->portentry != NULL &&
822 823
		    dispsock->portentry->port == port &&
		    isc_sockaddr_equal(dest, &dispsock->host))
824 825 826 827 828 829 830
			return (dispsock);
		dispsock = ISC_LIST_NEXT(dispsock, blink);
	}

	return (NULL);
}

831 832 833 834 835 836 837
/*%
 * Make a new socket for a single dispatch with a random port number.
 * The caller must hold the disp->lock and qid->lock.
 */
static isc_result_t
get_dispsocket(dns_dispatch_t *disp, isc_sockaddr_t *dest,
	       isc_socketmgr_t *sockmgr, dns_qid_t *qid,
838
	       dispsocket_t **dispsockp, in_port_t *portp)
839 840 841 842 843 844 845 846
{
	int i;
	isc_uint32_t r;
	dns_dispatchmgr_t *mgr = disp->mgr;
	isc_socket_t *sock = NULL;
	isc_result_t result = ISC_R_FAILURE;
	in_port_t port;
	isc_sockaddr_t localaddr;
847
	unsigned int bucket = 0;
848 849 850
	dispsocket_t *dispsock;
	unsigned int nports;
	in_port_t *ports;
851
	unsigned int bindoptions;
852
	dispportentry_t *portentry = NULL;
853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877

	if (isc_sockaddr_pf(&disp->local) == AF_INET) {
		nports = disp->mgr->nv4ports;
		ports = disp->mgr->v4ports;
	} else {
		nports = disp->mgr->nv6ports;
		ports = disp->mgr->v6ports;
	}
	if (nports == 0)
		return (ISC_R_ADDRNOTAVAIL);

	dispsock = ISC_LIST_HEAD(disp->inactivesockets);
	if (dispsock != NULL) {
		ISC_LIST_UNLINK(disp->inactivesockets, dispsock, link);
		sock = dispsock->socket;
		dispsock->socket = NULL;
	} else {
		dispsock = isc_mempool_get(mgr->spool);
		if (dispsock == NULL)
			return (ISC_R_NOMEMORY);

		disp->nsockets++;
		dispsock->socket = NULL;
		dispsock->disp = disp;
		dispsock->resp = NULL;
878
		dispsock->portentry = NULL;
879 880 881 882
		isc_random_get(&r);
		dispsock->task = NULL;
		isc_task_attach(disp->task[r % disp->ntasks], &dispsock->task);
		ISC_LINK_INIT(dispsock, link);
883
		ISC_LINK_INIT(dispsock, blink);
884 885 886 887 888 889 890 891 892 893
		dispsock->magic = DISPSOCK_MAGIC;
	}

	/*
	 * Pick up a random UDP port and open a new socket with it.  Avoid
	 * choosing ports that share the same destination because it will be
	 * very likely to fail in bind(2) or connect(2).
	 */
	localaddr = disp->local;
	for (i = 0; i < 64; i++) {
894
		port = ports[dispatch_uniformrandom(DISP_ARC4CTX(disp),
895 896 897
							nports)];
		isc_sockaddr_setport(&localaddr, port);

898 899
		bucket = dns_hash(qid, dest, 0, port);
		if (socket_search(qid, dest, port, bucket) != NULL)
900
			continue;
901
		bindoptions = 0;
902 903 904 905 906 907 908 909 910 911 912 913 914 915 916
		portentry = port_search(disp, port);
		if (portentry != NULL)
			bindoptions |= ISC_SOCKET_REUSEADDRESS;
		result = open_socket(sockmgr, &localaddr, bindoptions, &sock);
		if (result == ISC_R_SUCCESS) {
			if (portentry == NULL) {
				portentry = new_portentry(disp, port);
				if (portentry == NULL) {
					result = ISC_R_NOMEMORY;
					break;
				}
			}
			portentry->refs++;
			break;
		} else if (result != ISC_R_ADDRINUSE)
917 918 919 920 921
			break;
	}

	if (result == ISC_R_SUCCESS) {
		dispsock->socket = sock;
922
		dispsock->host = *dest;
923
		dispsock->portentry = portentry;
924 925
		dispsock->bucket = bucket;
		ISC_LIST_APPEND(qid->sock_table[bucket], dispsock, blink);
926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947
		*dispsockp = dispsock;
		*portp = port;
	} else {
		/*
		 * We could keep it in the inactive list, but since this should
		 * be an exceptional case and might be resource shortage, we'd
		 * rather destroy it.
		 */
		if (sock != NULL)
			isc_socket_detach(&sock);
		destroy_dispsocket(disp, &dispsock);
	}

	return (result);
}

/*%
 * Destroy a dedicated dispatch socket.
 */
static void
destroy_dispsocket(dns_dispatch_t *disp, dispsocket_t **dispsockp) {
	dispsocket_t *dispsock;
948
	dns_qid_t *qid;
949 950 951 952 953 954 955 956 957 958 959

	/*
	 * The dispatch must be locked.
	 */

	REQUIRE(dispsockp != NULL && *dispsockp != NULL);
	dispsock = *dispsockp;
	REQUIRE(!ISC_LINK_LINKED(dispsock, link));

	disp->nsockets--;
	dispsock->magic = 0;
960 961
	if (dispsock->portentry != NULL)
		deref_portentry(disp, &dispsock->portentry);
962 963
	if (dispsock->socket != NULL)
		isc_socket_detach(&dispsock->socket);
964 965 966 967 968 969 970
	if (ISC_LINK_LINKED(dispsock, blink)) {
		qid = DNS_QID(disp);
		LOCK(&qid->lock);
		ISC_LIST_UNLINK(qid->sock_table[dispsock->bucket], dispsock,
				blink);
		UNLOCK(&qid->lock);
	}
971 972 973 974 975 976 977 978 979 980 981 982 983
	if (dispsock->task != NULL)
		isc_task_detach(&dispsock->task);
	isc_mempool_put(disp->mgr->spool, dispsock);

	*dispsockp = NULL;
}

/*%
 * Deactivate a dedicated dispatch socket.  Move it to the inactive list for
 * future reuse unless the total number of sockets are exceeding the maximum.
 */
static void
deactivate_dispsocket(dns_dispatch_t *disp, dispsocket_t *dispsock) {
984
	isc_result_t result;
985
	dns_qid_t *qid;
986

987 988 989 990 991 992 993 994 995
	/*
	 * The dispatch must be locked.
	 */
	ISC_LIST_UNLINK(disp->activesockets, dispsock, link);
	if (dispsock->resp != NULL) {
		INSIST(dispsock->resp->dispsocket == dispsock);
		dispsock->resp->dispsocket = NULL;
	}

996 997 998
	INSIST(dispsock->portentry != NULL);
	deref_portentry(disp, &dispsock->portentry);

999
#ifdef BIND9
1000 1001 1002
	if (disp->nsockets > DNS_DISPATCH_POOLSOCKS)
		destroy_dispsocket(disp, &dispsock);
	else {
1003
		result = isc_socket_close(dispsock->socket);
1004 1005 1006 1007 1008 1009 1010

		qid = DNS_QID(disp);
		LOCK(&qid->lock);
		ISC_LIST_UNLINK(qid->sock_table[dispsock->bucket], dispsock,
				blink);
		UNLOCK(&qid->lock);

1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021
		if (result == ISC_R_SUCCESS)
			ISC_LIST_APPEND(disp->inactivesockets, dispsock, link);
		else {
			/*
			 * If the underlying system does not allow this
			 * optimization, destroy this temporary structure (and
			 * create a new one for a new transaction).
			 */
			INSIST(result == ISC_R_NOTIMPLEMENTED);
			destroy_dispsocket(disp, &dispsock);
		}
1022
	}
1023 1024 1025 1026 1027 1028 1029
#else
	/* This kind of optimization isn't necessary for normal use */
	UNUSED(qid);
	UNUSED(result);

	destroy_dispsocket(disp, &dispsock);
#endif
1030
}
1031

1032
/*
1033
 * Find an entry for query ID 'id', socket address 'dest', and port number
1034
 * 'port'.
1035 1036
 * Return NULL if no such entry exists.
 */
Michael Graff's avatar
Michael Graff committed
1037
static dns_dispentry_t *
1038 1039
entry_search(dns_qid_t *qid, isc_sockaddr_t *dest, dns_messageid_t id,
	     in_port_t port, unsigned int bucket)
1040
{
Michael Graff's avatar
Michael Graff committed
1041
	dns_dispentry_t *res;
1042

1043
	REQUIRE(bucket < qid->qid_nbuckets);
1044

1045
	res = ISC_LIST_HEAD(qid->qid_table[bucket]);
1046 1047

	while (res != NULL) {
1048
		if (res->id == id && isc_sockaddr_equal(dest, &res->host) &&
1049
		    res->port == port) {
1050
			return (res);
1051
		}
1052 1053 1054 1055 1056 1057
		res = ISC_LIST_NEXT(res, link);
	}

	return (NULL);
}

Michael Graff's avatar
Michael Graff committed
1058
static void
1059
free_buffer(dns_dispatch_t *disp, void *buf, unsigned int len) {
Michael Graff's avatar
Michael Graff committed
1060
	INSIST(buf != NULL && len != 0);
1061

Michael Graff's avatar
Michael Graff committed
1062

1063
	switch (disp->socktype) {
1064
	case isc_sockettype_tcp:
1065 1066
		INSIST(disp->tcpbuffers > 0);
		disp->tcpbuffers--;
1067
		isc_mem_put(disp->mgr->mctx, buf, len);
1068
		break;
1069
	case isc_sockettype_udp:
1070 1071 1072 1073 1074 1075
		LOCK(&disp->mgr->buffer_lock);
		INSIST(disp->mgr->buffers > 0);
		INSIST(len == disp->mgr->buffersize);
		disp->mgr->buffers--;
		isc_mempool_put(disp->mgr->bpool, buf);
		UNLOCK(&disp->mgr->buffer_lock);
1076 1077
		break;
	default:
Michael Graff's avatar
Michael Graff committed
1078
		INSIST(0);
1079 1080
		break;
	}
Michael Graff's avatar
Michael Graff committed
1081 1082 1083
}

static void *
1084
allocate_udp_buffer(dns_dispatch_t *disp) {
Michael Graff's avatar
Michael Graff committed
1085 1086
	void *temp;

1087 1088
	LOCK(&disp->mgr->buffer_lock);
	temp = isc_mempool_get(disp->mgr->bpool);
Michael Graff's avatar
Michael Graff committed
1089

1090
	if (temp != NULL)
1091 1092
		disp->mgr->buffers++;
	UNLOCK(&disp->mgr->buffer_lock);
Michael Graff's avatar
Michael Graff committed
1093

Michael Graff's avatar
Michael Graff committed
1094 1095 1096 1097
	return (temp);
}

static inline void
1098
free_event(dns_dispatch_t *disp, dns_dispatchevent_t *ev) {
Michael Graff's avatar
Michael Graff committed
1099 1100 1101
	if (disp->failsafe_ev == ev) {
		INSIST(disp->shutdown_out == 1);
		disp->shutdown_out = 0;
1102

Michael Graff's avatar
Michael Graff committed
1103 1104 1105
		return;
	}

1106
	isc_mempool_put(disp->mgr->epool, ev);
Michael Graff's avatar
Michael Graff committed
1107 1108 1109
}

static inline dns_dispatchevent_t *
1110
allocate_event(dns_dispatch_t *disp) {
Michael Graff's avatar
Michael Graff committed
1111 1112
	dns_dispatchevent_t *ev;

1113
	ev = isc_mempool_get(disp->mgr->epool);
1114 1115
	if (ev == NULL)
		return (NULL);
1116 1117
	ISC_EVENT_INIT(ev, sizeof(*ev), 0, NULL, 0,
		       NULL, NULL, NULL, NULL, NULL);
Michael Graff's avatar
Michael Graff committed
1118 1119 1120 1121

	return (ev);
}

1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141
static void