dispatch.c 97.5 KB
Newer Older
Michael Graff's avatar
Michael Graff committed
1
/*
2
 * Copyright (C) 2004-2009, 2011-2015  Internet Systems Consortium, Inc. ("ISC")
Mark Andrews's avatar
Mark Andrews committed
3
 * Copyright (C) 1999-2003  Internet Software Consortium.
4
 *
Automatic Updater's avatar
Automatic Updater committed
5
 * Permission to use, copy, modify, and/or distribute this software for any
Michael Graff's avatar
Michael Graff committed
6 7
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
8
 *
Mark Andrews's avatar
Mark Andrews committed
9 10 11 12 13 14 15
 * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
 * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
 * PERFORMANCE OF THIS SOFTWARE.
Michael Graff's avatar
Michael Graff committed
16 17
 */

18
/* $Id: dispatch.c,v 1.175 2011/11/29 01:03:47 marka Exp $ */
19 20

/*! \file */
David Lawrence's avatar
David Lawrence committed
21

Michael Graff's avatar
Michael Graff committed
22 23 24
#include <config.h>

#include <stdlib.h>
25 26
#include <sys/types.h>
#include <unistd.h>
27
#include <stdlib.h>
Michael Graff's avatar
Michael Graff committed
28

29
#include <isc/entropy.h>
Michael Graff's avatar
Michael Graff committed
30
#include <isc/mem.h>
31
#include <isc/mutex.h>
32
#include <isc/portset.h>
33
#include <isc/print.h>
34
#include <isc/random.h>
35
#include <isc/socket.h>
36
#include <isc/stats.h>
37
#include <isc/string.h>
38
#include <isc/task.h>
39
#include <isc/time.h>
Michael Graff's avatar
Michael Graff committed
40
#include <isc/util.h>
Michael Graff's avatar
Michael Graff committed
41

42
#include <dns/acl.h>
Michael Graff's avatar
Michael Graff committed
43
#include <dns/dispatch.h>
44 45
#include <dns/events.h>
#include <dns/log.h>
46
#include <dns/message.h>
47
#include <dns/portlist.h>
48
#include <dns/stats.h>
49
#include <dns/tcpmsg.h>
50 51
#include <dns/types.h>

52 53
typedef ISC_LIST(dns_dispentry_t)	dns_displist_t;

54
typedef struct dispsocket		dispsocket_t;
55 56
typedef ISC_LIST(dispsocket_t)		dispsocketlist_t;

57 58 59
typedef struct dispportentry		dispportentry_t;
typedef ISC_LIST(dispportentry_t)	dispportlist_t;

60 61
typedef struct dns_qid {
	unsigned int	magic;
62 63
	unsigned int	qid_nbuckets;	/*%< hash table size */
	unsigned int	qid_increment;	/*%< id increment on collision */
64
	isc_mutex_t	lock;
65
	dns_displist_t	*qid_table;	/*%< the table itself */
66
	dispsocketlist_t *sock_table;	/*%< socket table */
67 68
} dns_qid_t;

69 70 71 72
struct dns_dispatchmgr {
	/* Unlocked. */
	unsigned int			magic;
	isc_mem_t		       *mctx;
73
	dns_acl_t		       *blackhole;
74
	dns_portlist_t		       *portlist;
75
	isc_stats_t		       *stats;
76
	isc_entropy_t		       *entropy; /*%< entropy source */
77 78 79 80 81

	/* Locked by "lock". */
	isc_mutex_t			lock;
	unsigned int			state;
	ISC_LIST(dns_dispatch_t)	list;
82

83 84 85
	/* Locked by rng_lock. */
	isc_mutex_t			rng_lock;
	isc_rng_t		       *rngctx; /*%< RNG context for QID */
86

87
	/* locked by buffer_lock */
88
	dns_qid_t			*qid;
89
	isc_mutex_t			buffer_lock;
90 91 92
	unsigned int			buffers;    /*%< allocated buffers */
	unsigned int			buffersize; /*%< size of each buffer */
	unsigned int			maxbuffers; /*%< max buffers */
93 94

	/* Locked internally. */
95 96 97 98 99
	isc_mutex_t			depool_lock;
	isc_mempool_t		       *depool;	/*%< pool for dispatch events */
	isc_mutex_t			rpool_lock;
	isc_mempool_t		       *rpool;	/*%< pool for replies */
	isc_mutex_t			dpool_lock;
100
	isc_mempool_t		       *dpool;  /*%< dispatch allocations */
101 102 103 104
	isc_mutex_t			bpool_lock;
	isc_mempool_t		       *bpool;	/*%< pool for buffers */
	isc_mutex_t			spool_lock;
	isc_mempool_t		       *spool;	/*%< pool for dispsocks */
105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124

	/*%
	 * Locked by qid->lock if qid exists; otherwise, can be used without
	 * being locked.
	 * Memory footprint considerations: this is a simple implementation of
	 * available ports, i.e., an ordered array of the actual port numbers.
	 * This will require about 256KB of memory in the worst case (128KB for
	 * each of IPv4 and IPv6).  We could reduce it by representing it as a
	 * more sophisticated way such as a list (or array) of ranges that are
	 * searched to identify a specific port.  Our decision here is the saved
	 * memory isn't worth the implementation complexity, considering the
	 * fact that the whole BIND9 process (which is mainly named) already
	 * requires a pretty large memory footprint.  We may, however, have to
	 * revisit the decision when we want to use it as a separate module for
	 * an environment where memory requirement is severer.
	 */
	in_port_t	*v4ports;	/*%< available ports for IPv4 */
	unsigned int	nv4ports;	/*%< # of available ports for IPv4 */
	in_port_t	*v6ports;	/*%< available ports for IPv4 */
	unsigned int	nv6ports;	/*%< # of available ports for IPv4 */
125 126 127 128 129 130
};

/*% Flag bit kept in the 'state' member, and the test for it. */
#define MGR_SHUTTINGDOWN		0x00000001U
#define MGR_IS_SHUTTINGDOWN(l)	(((l)->state & MGR_SHUTTINGDOWN) != 0)

/*% True if 'd' has DNS_DISPATCHATTR_PRIVATE set in its attributes. */
#define IS_PRIVATE(d)	(((d)->attributes & DNS_DISPATCHATTR_PRIVATE) != 0)
Michael Graff's avatar
Michael Graff committed
131

Michael Graff's avatar
Michael Graff committed
132
struct dns_dispentry {
Michael Graff's avatar
Michael Graff committed
133
	unsigned int			magic;
134
	dns_dispatch_t		       *disp;
135
	dns_messageid_t			id;
136
	in_port_t			port;
Michael Graff's avatar
Michael Graff committed
137
	unsigned int			bucket;
Michael Graff's avatar
Michael Graff committed
138
	isc_sockaddr_t			host;
Michael Graff's avatar
Michael Graff committed
139 140 141
	isc_task_t		       *task;
	isc_taskaction_t		action;
	void			       *arg;
Michael Graff's avatar
Michael Graff committed
142
	isc_boolean_t			item_out;
143
	dispsocket_t			*dispsocket;
Michael Graff's avatar
Michael Graff committed
144
	ISC_LIST(dns_dispatchevent_t)	items;
Michael Graff's avatar
Michael Graff committed
145
	ISC_LINK(dns_dispentry_t)	link;
146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172
};

/*%
 * Maximum number of dispatch sockets that can be pooled for reuse.  The
 * appropriate value may vary, but experiments have shown that a busy caching
 * server may need more than 1000 sockets open concurrently.  The maximum
 * allowable number of dispatch sockets (per manager) will be set to twice
 * this value.
 */
#ifndef DNS_DISPATCH_POOLSOCKS
#define DNS_DISPATCH_POOLSOCKS			2048
#endif

/*%
 * Quota to control the number of dispatch sockets.  If a dispatch has more
 * sockets than the quota, new queries will purge the oldest ones, so that
 * a massive number of outstanding queries won't prevent subsequent queries
 * (especially if the older ones take a longer time and result in timeouts).
 */
#ifndef DNS_DISPATCH_SOCKSQUOTA
#define DNS_DISPATCH_SOCKSQUOTA			3072
#endif

struct dispsocket {
	unsigned int			magic;
	isc_socket_t			*socket;
	dns_dispatch_t			*disp;
173
	isc_sockaddr_t			host;
174 175
	in_port_t			localport; /* XXX: should be removed later */
	dispportentry_t			*portentry;
176 177 178
	dns_dispentry_t			*resp;
	isc_task_t			*task;
	ISC_LINK(dispsocket_t)		link;
179 180
	unsigned int			bucket;
	ISC_LINK(dispsocket_t)		blink;
Michael Graff's avatar
Michael Graff committed
181
};
Michael Graff's avatar
Michael Graff committed
182

183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
/*%
 * A port table entry.  We remember every port we first open in a table with a
 * reference counter so that we can 'reuse' the same port (with different
 * destination addresses) using the SO_REUSEADDR socket option.
 */
struct dispportentry {
	in_port_t			port;
	unsigned int			refs;
	ISC_LINK(struct dispportentry)	link;
};

/*%
 * Divisor used to map a port number to a port table bucket
 * (port % DNS_DISPATCH_PORTTABLESIZE).
 */
#ifndef DNS_DISPATCH_PORTTABLESIZE
#define DNS_DISPATCH_PORTTABLESIZE	1024
#endif

198
/*%
 * NOTE(review): presumably a marker for a 'bucket' member that does not
 * refer to a real bucket; its uses are outside this part of the file.
 */
#define INVALID_BUCKET		(0xffffdead)
Michael Graff's avatar
Michael Graff committed
199

200 201 202 203 204 205 206
/*%
 * Number of tasks for each dispatch that uses separate sockets for different
 * transactions.  This must be a power of 2 as it will divide 32 bit numbers
 * to get a uniformly random task selection.  See get_dispsocket().
 */
#define MAX_INTERNAL_TASKS	64

Michael Graff's avatar
Michael Graff committed
207 208
struct dns_dispatch {
	/* Unlocked. */
209 210
	unsigned int		magic;		/*%< magic */
	dns_dispatchmgr_t      *mgr;		/*%< dispatch manager */
211 212 213 214 215 216 217
	int			ntasks;
	/*%
	 * internal task buckets.  We use multiple tasks to distribute various
	 * socket events well when using separate dispatch sockets.  We use the
	 * 1st task (task[0]) for internal control events.
	 */
	isc_task_t	       *task[MAX_INTERNAL_TASKS];
218 219
	isc_socket_t	       *socket;		/*%< isc socket attached to */
	isc_sockaddr_t		local;		/*%< local address */
220
	in_port_t		localport;	/*%< local UDP port */
221
	isc_sockaddr_t		peer;		/*%< peer address (TCP) */
Evan Hunt's avatar
Evan Hunt committed
222
	isc_dscp_t		dscp;		/*%< "listen-on" DSCP value */
223
	unsigned int		maxrequests;	/*%< max requests */
224
	isc_event_t	       *ctlevent;
Michael Graff's avatar
Michael Graff committed
225

226 227 228
	isc_mutex_t		sepool_lock;
	isc_mempool_t	       *sepool;		/*%< pool for socket events */

229
	/*% Locked by mgr->lock. */
230 231 232
	ISC_LINK(dns_dispatch_t) link;

	/* Locked by "lock". */
233
	isc_mutex_t		lock;		/*%< locks all below */
234
	isc_sockettype_t	socktype;
235
	unsigned int		attributes;
236 237
	unsigned int		refcount;	/*%< number of users */
	dns_dispatchevent_t    *failsafe_ev;	/*%< failsafe cancel event */
Michael Graff's avatar
Michael Graff committed
238
	unsigned int		shutting_down : 1,
239 240
				shutdown_out : 1,
				connected : 1,
241
				tcpmsg_valid : 1,
242
				recv_pending : 1; /*%< is a recv() pending? */
Michael Graff's avatar
Michael Graff committed
243
	isc_result_t		shutdown_why;
244 245 246
	ISC_LIST(dispsocket_t)	activesockets;
	ISC_LIST(dispsocket_t)	inactivesockets;
	unsigned int		nsockets;
247 248 249
	unsigned int		requests;	/*%< how many requests we have */
	unsigned int		tcpbuffers;	/*%< allocated buffers */
	dns_tcpmsg_t		tcpmsg;		/*%< for tcp streams */
250
	dns_qid_t		*qid;
251
	isc_rng_t		*rngctx;	/*%< for QID/UDP port num */
252 253
	dispportlist_t		*port_table;	/*%< hold ports 'owned' by us */
	isc_mempool_t		*portpool;	/*%< port table entries  */
Michael Graff's avatar
Michael Graff committed
254 255
};

256 257 258
/*%
 * Magic numbers stored in the 'magic' member of each structure, and the
 * matching validity checks.
 */
#define QID_MAGIC		ISC_MAGIC('Q', 'i', 'd', ' ')
#define VALID_QID(e)		ISC_MAGIC_VALID((e), QID_MAGIC)

#define RESPONSE_MAGIC		ISC_MAGIC('D', 'r', 's', 'p')
#define VALID_RESPONSE(e)	ISC_MAGIC_VALID((e), RESPONSE_MAGIC)

#define DISPSOCK_MAGIC		ISC_MAGIC('D', 's', 'o', 'c')
#define VALID_DISPSOCK(e)	ISC_MAGIC_VALID((e), DISPSOCK_MAGIC)

#define DISPATCH_MAGIC		ISC_MAGIC('D', 'i', 's', 'p')
#define VALID_DISPATCH(e)	ISC_MAGIC_VALID((e), DISPATCH_MAGIC)

#define DNS_DISPATCHMGR_MAGIC	ISC_MAGIC('D', 'M', 'g', 'r')
#define VALID_DISPATCHMGR(e)	ISC_MAGIC_VALID((e), DNS_DISPATCHMGR_MAGIC)
Michael Graff's avatar
Michael Graff committed
270

271 272
/*%
 * Select the QID table for 'disp': a TCP dispatch uses its own table, any
 * other dispatch uses the manager's.  The whole expansion is parenthesized
 * so that the macro behaves as a single operand in any expression.
 */
#define DNS_QID(disp) (((disp)->socktype == isc_sockettype_tcp) ? \
		       (disp)->qid : (disp)->mgr->qid)
/*%
 * Select the RNG context for 'disp': a UDP dispatch uses its own context,
 * any other dispatch uses the manager's.
 */
#define DISP_RNGCTX(disp) (((disp)->socktype == isc_sockettype_udp) ? \
			((disp)->rngctx) : ((disp)->mgr->rngctx))
275 276 277 278 279 280 281 282 283 284 285

/*%
 * Locking a query port buffer is a bit tricky.  We access the buffer without
 * locking until qid is created.  Technically, there is a possibility of race
 * between the creation of qid and access to the port buffer; in practice,
 * however, this should be safe because qid isn't created until the first
 * dispatch is created and there should be no contending situation until then.
 *
 * Both macros expand to a single statement (do { } while (0)) so that they
 * can be used safely as the body of an if/else.
 */
#define PORTBUFLOCK(mgr) \
	do { \
		if ((mgr)->qid != NULL) \
			LOCK(&((mgr)->qid->lock)); \
	} while (0)
#define PORTBUFUNLOCK(mgr) \
	do { \
		if ((mgr)->qid != NULL) \
			UNLOCK((&(mgr)->qid->lock)); \
	} while (0)

286
/*
287
 * Statics.
288
 */
289 290
static dns_dispentry_t *entry_search(dns_qid_t *, isc_sockaddr_t *,
				     dns_messageid_t, in_port_t, unsigned int);
291
static isc_boolean_t destroy_disp_ok(dns_dispatch_t *);
292
static void destroy_disp(isc_task_t *task, isc_event_t *event);
293 294 295 296 297
static void destroy_dispsocket(dns_dispatch_t *, dispsocket_t **);
static void deactivate_dispsocket(dns_dispatch_t *, dispsocket_t *);
static void udp_exrecv(isc_task_t *, isc_event_t *);
static void udp_shrecv(isc_task_t *, isc_event_t *);
static void udp_recv(isc_event_t *, dns_dispatch_t *, dispsocket_t *);
298
static void tcp_recv(isc_task_t *, isc_event_t *);
299 300 301
static isc_result_t startrecv(dns_dispatch_t *, dispsocket_t *);
static isc_uint32_t dns_hash(dns_qid_t *, isc_sockaddr_t *, dns_messageid_t,
			     in_port_t);
302
static void free_buffer(dns_dispatch_t *disp, void *buf, unsigned int len);
303
static void *allocate_udp_buffer(dns_dispatch_t *disp);
304 305
static inline void free_devent(dns_dispatch_t *disp, dns_dispatchevent_t *ev);
static inline dns_dispatchevent_t *allocate_devent(dns_dispatch_t *disp);
306
static void do_cancel(dns_dispatch_t *disp);
307 308
static dns_dispentry_t *linear_first(dns_qid_t *disp);
static dns_dispentry_t *linear_next(dns_qid_t *disp,
309
				    dns_dispentry_t *resp);
310
static void dispatch_free(dns_dispatch_t **dispp);
311 312 313 314
static isc_result_t get_udpsocket(dns_dispatchmgr_t *mgr,
				  dns_dispatch_t *disp,
				  isc_socketmgr_t *sockmgr,
				  isc_sockaddr_t *localaddr,
315 316
				  isc_socket_t **sockp,
				  isc_socket_t *dup_socket);
317 318 319 320 321 322
static isc_result_t dispatch_createudp(dns_dispatchmgr_t *mgr,
				       isc_socketmgr_t *sockmgr,
				       isc_taskmgr_t *taskmgr,
				       isc_sockaddr_t *localaddr,
				       unsigned int maxrequests,
				       unsigned int attributes,
323 324
				       dns_dispatch_t **dispp,
				       isc_socket_t *dup_socket);
325 326
static isc_boolean_t destroy_mgr_ok(dns_dispatchmgr_t *mgr);
static void destroy_mgr(dns_dispatchmgr_t **mgrp);
327
static isc_result_t qid_allocate(dns_dispatchmgr_t *mgr, unsigned int buckets,
328 329
				 unsigned int increment, dns_qid_t **qidp,
				 isc_boolean_t needaddrtable);
330
static void qid_destroy(isc_mem_t *mctx, dns_qid_t **qidp);
331
static isc_result_t open_socket(isc_socketmgr_t *mgr, isc_sockaddr_t *local,
332 333
				unsigned int options, isc_socket_t **sockp,
				isc_socket_t *dup_socket);
334 335
static isc_boolean_t portavailable(dns_dispatchmgr_t *mgr, isc_socket_t *sock,
				   isc_sockaddr_t *sockaddrp);
336 337

/*% Shorthand for ISC_LOG_DEBUG(x), used when calling the log helpers below. */
#define LVL(x) ISC_LOG_DEBUG(x)
Michael Graff's avatar
Michael Graff committed
338

339 340 341 342
static void
mgr_log(dns_dispatchmgr_t *mgr, int level, const char *fmt, ...)
     ISC_FORMAT_PRINTF(3, 4);

/*%
 * Format a printf-style message and write it to the dispatch log category
 * at 'level', prefixed with the address of 'mgr'.  Nothing is formatted if
 * isc_log_wouldlog() says the level would not be logged.
 */
static void
mgr_log(dns_dispatchmgr_t *mgr, int level, const char *fmt, ...) {
	va_list args;
	char text[2048];

	if (isc_log_wouldlog(dns_lctx, level)) {
		va_start(args, fmt);
		vsnprintf(text, sizeof(text), fmt, args);
		va_end(args);

		isc_log_write(dns_lctx,
			      DNS_LOGCATEGORY_DISPATCH,
			      DNS_LOGMODULE_DISPATCH,
			      level, "dispatchmgr %p: %s", mgr, text);
	}
}

360
static inline void
Mark Andrews's avatar
Mark Andrews committed
361
inc_stats(dns_dispatchmgr_t *mgr, isc_statscounter_t counter) {
362 363 364 365
	if (mgr->stats != NULL)
		isc_stats_increment(mgr->stats, counter);
}

366 367 368 369 370 371
/*%
 * Decrement 'counter' in the manager's statistics set; does nothing when
 * the manager has no statistics set.
 */
static inline void
dec_stats(dns_dispatchmgr_t *mgr, isc_statscounter_t counter) {
	if (mgr->stats == NULL) {
		return;
	}
	isc_stats_decrement(mgr->stats, counter);
}

372 373 374 375
static void
dispatch_log(dns_dispatch_t *disp, int level, const char *fmt, ...)
     ISC_FORMAT_PRINTF(3, 4);

/*%
 * Format a printf-style message and write it to the dispatch log category
 * at 'level', prefixed with the address of 'disp'.  Nothing is formatted if
 * isc_log_wouldlog() says the level would not be logged.
 */
static void
dispatch_log(dns_dispatch_t *disp, int level, const char *fmt, ...) {
	va_list args;
	char text[2048];

	if (isc_log_wouldlog(dns_lctx, level)) {
		va_start(args, fmt);
		vsnprintf(text, sizeof(text), fmt, args);
		va_end(args);

		isc_log_write(dns_lctx,
			      DNS_LOGCATEGORY_DISPATCH,
			      DNS_LOGMODULE_DISPATCH,
			      level, "dispatch %p: %s", disp, text);
	}
}

393 394 395 396 397
static void
request_log(dns_dispatch_t *disp, dns_dispentry_t *resp,
	    int level, const char *fmt, ...)
     ISC_FORMAT_PRINTF(4, 5);

/*%
 * Format a printf-style message and write it to the dispatch log category
 * at 'level', prefixed with the addresses of 'disp' and 'resp'.  When 'resp'
 * passes VALID_RESPONSE(), its formatted 'host' address is included too.
 */
static void
request_log(dns_dispatch_t *disp, dns_dispentry_t *resp,
	    int level, const char *fmt, ...)
{
	va_list args;
	char text[2048];
	char peer[256];

	if (! isc_log_wouldlog(dns_lctx, level))
		return;

	va_start(args, fmt);
	vsnprintf(text, sizeof(text), fmt, args);
	va_end(args);

	if (!VALID_RESPONSE(resp)) {
		/* Not a valid response entry: do not touch resp->host. */
		isc_log_write(dns_lctx, DNS_LOGCATEGORY_DISPATCH,
			      DNS_LOGMODULE_DISPATCH, level,
			      "dispatch %p req/resp %p: %s", disp, resp,
			      text);
		return;
	}

	isc_sockaddr_format(&resp->host, peer, sizeof(peer));
	isc_log_write(dns_lctx, DNS_LOGCATEGORY_DISPATCH,
		      DNS_LOGMODULE_DISPATCH, level,
		      "dispatch %p response %p %s: %s", disp, resp,
		      peer, text);
}

427 428 429 430
/*
 * Return a hash of the destination and message id.
 */
static isc_uint32_t
431 432 433
dns_hash(dns_qid_t *qid, isc_sockaddr_t *dest, dns_messageid_t id,
	 in_port_t port)
{
434 435 436
	unsigned int ret;

	ret = isc_sockaddr_hash(dest, ISC_TRUE);
437
	ret ^= (id << 16) | port;
438
	ret %= qid->qid_nbuckets;
439

440
	INSIST(ret < qid->qid_nbuckets);
441 442 443 444

	return (ret);
}

445 446 447
/*
 * Find the first entry in 'qid'.  Returns NULL if there are no entries.
 */
Michael Graff's avatar
Michael Graff committed
448
static dns_dispentry_t *
449
linear_first(dns_qid_t *qid) {
Michael Graff's avatar
Michael Graff committed
450 451 452 453 454
	dns_dispentry_t *ret;
	unsigned int bucket;

	bucket = 0;

455 456
	while (bucket < qid->qid_nbuckets) {
		ret = ISC_LIST_HEAD(qid->qid_table[bucket]);
Michael Graff's avatar
Michael Graff committed
457 458 459 460 461 462 463 464
		if (ret != NULL)
			return (ret);
		bucket++;
	}

	return (NULL);
}

465 466 467 468
/*
 * Find the next entry after 'resp' in 'qid'.  Return NULL if there are
 * no more entries.
 */
Michael Graff's avatar
Michael Graff committed
469
static dns_dispentry_t *
470
linear_next(dns_qid_t *qid, dns_dispentry_t *resp) {
Michael Graff's avatar
Michael Graff committed
471 472 473 474 475 476 477 478
	dns_dispentry_t *ret;
	unsigned int bucket;

	ret = ISC_LIST_NEXT(resp, link);
	if (ret != NULL)
		return (ret);

	bucket = resp->bucket;
479
	bucket++;
480 481
	while (bucket < qid->qid_nbuckets) {
		ret = ISC_LIST_HEAD(qid->qid_table[bucket]);
Michael Graff's avatar
Michael Graff committed
482 483 484 485 486 487 488
		if (ret != NULL)
			return (ret);
		bucket++;
	}

	return (NULL);
}
489

490 491 492 493 494 495 496 497 498
/*
 * Report whether 'disp' can be destroyed: it must be shutting down, have
 * no references, no pending receive and no active sockets.
 *
 * The dispatch must be locked.
 */
static isc_boolean_t
destroy_disp_ok(dns_dispatch_t *disp)
{
	if (disp->refcount == 0 &&
	    disp->recv_pending == 0 &&
	    ISC_LIST_EMPTY(disp->activesockets) &&
	    disp->shutting_down != 0)
	{
		return (ISC_TRUE);
	}

	return (ISC_FALSE);
}

511
/*
512 513
 * Called when refcount reaches 0 (and safe to destroy).
 *
Evan Hunt's avatar
Evan Hunt committed
514 515
 * The dispatcher must be locked.
 * The manager must not be locked.
516 517
 */
static void
518
destroy_disp(isc_task_t *task, isc_event_t *event) {
519
	dns_dispatch_t *disp;
520 521
	dns_dispatchmgr_t *mgr;
	isc_boolean_t killmgr;
522 523
	dispsocket_t *dispsocket;
	int i;
524

525 526 527 528 529
	INSIST(event->ev_type == DNS_EVENT_DISPATCHCONTROL);

	UNUSED(task);

	disp = event->ev_arg;
530 531
	mgr = disp->mgr;

532
	LOCK(&mgr->lock);
533
	ISC_LIST_UNLINK(mgr->list, disp, link);
Michael Graff's avatar
Michael Graff committed
534

535 536
	dispatch_log(disp, LVL(90),
		     "shutting down; detaching from sock %p, task %p",
537
		     disp->socket, disp->task[0]); /* XXXX */
538

539 540
	if (disp->sepool != NULL) {
		isc_mempool_destroy(&disp->sepool);
541
		(void)isc_mutex_destroy(&disp->sepool_lock);
542 543
	}

544 545 546 547 548 549 550 551
	if (disp->socket != NULL)
		isc_socket_detach(&disp->socket);
	while ((dispsocket = ISC_LIST_HEAD(disp->inactivesockets)) != NULL) {
		ISC_LIST_UNLINK(disp->inactivesockets, dispsocket, link);
		destroy_dispsocket(disp, &dispsocket);
	}
	for (i = 0; i < disp->ntasks; i++)
		isc_task_detach(&disp->task[i]);
552
	isc_event_free(&event);
553

554
	dispatch_free(&disp);
555 556 557 558 559

	killmgr = destroy_mgr_ok(mgr);
	UNLOCK(&mgr->lock);
	if (killmgr)
		destroy_mgr(&mgr);
560 561
}

562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585
/*%
 * Manipulate port table per dispatch: find an entry for a given port number,
 * create a new entry, and decrement a given entry with possible clean-up.
 */

/*%
 * Look up 'port' in the port table of 'disp'.  Returns the matching entry,
 * or NULL if the port is not in the table.
 */
static dispportentry_t *
port_search(dns_dispatch_t *disp, in_port_t port) {
	dispportentry_t *entry;

	REQUIRE(disp->port_table != NULL);

	for (entry = ISC_LIST_HEAD(disp->port_table[port %
						    DNS_DISPATCH_PORTTABLESIZE]);
	     entry != NULL;
	     entry = ISC_LIST_NEXT(entry, link))
	{
		if (entry->port == port)
			return (entry);
	}

	return (NULL);
}

/*%
 * Allocate a port table entry for 'port' with a reference count of 1 and
 * append it to the port table of 'disp' while holding the qid lock.
 * Returns NULL if the entry cannot be allocated from the port pool.
 */
static dispportentry_t *
new_portentry(dns_dispatch_t *disp, in_port_t port) {
	dispportentry_t *entry;
	dns_qid_t *qid;

	REQUIRE(disp->port_table != NULL);

	entry = isc_mempool_get(disp->portpool);
	if (entry == NULL) {
		return (entry);
	}

	entry->port = port;
	entry->refs = 1;
	ISC_LINK_INIT(entry, link);

	qid = DNS_QID(disp);
	LOCK(&qid->lock);
	ISC_LIST_APPEND(disp->port_table[port % DNS_DISPATCH_PORTTABLESIZE],
			entry, link);
	UNLOCK(&qid->lock);

	return (entry);
}

606 607 608
/*%
 * The caller must not hold the qid->lock.
 */
609 610 611
static void
deref_portentry(dns_dispatch_t *disp, dispportentry_t **portentryp) {
	dispportentry_t *portentry = *portentryp;
Mark Andrews's avatar
Mark Andrews committed
612
	dns_qid_t *qid;
613 614 615 616

	REQUIRE(disp->port_table != NULL);
	REQUIRE(portentry != NULL && portentry->refs > 0);

617 618
	qid = DNS_QID(disp);
	LOCK(&qid->lock);
619
	portentry->refs--;
620

621
	if (portentry->refs == 0) {
622 623 624 625 626 627
		ISC_LIST_UNLINK(disp->port_table[portentry->port %
						 DNS_DISPATCH_PORTTABLESIZE],
				portentry, link);
		isc_mempool_put(disp->portpool, portentry);
	}

Evan Hunt's avatar
Evan Hunt committed
628 629 630 631
	/*
	 * Set '*portentryp' to NULL inside the lock so that
	 * dispsock->portentry does not change in socket_search.
	 */
632
	*portentryp = NULL;
Evan Hunt's avatar
Evan Hunt committed
633 634

	UNLOCK(&qid->lock);
635 636
}

637 638
/*%
 * Search bucket 'bucket' of the socket table in 'qid' for a dispsocket
 * whose port entry is for 'port' and whose destination equals 'dest'.
 * Dispsockets without a port entry never match.
 * Return NULL if no such entry exists.  Requires qid->lock to be held.
 */
static dispsocket_t *
socket_search(dns_qid_t *qid, isc_sockaddr_t *dest, in_port_t port,
	      unsigned int bucket)
{
	dispsocket_t *cur;

	REQUIRE(VALID_QID(qid));
	REQUIRE(bucket < qid->qid_nbuckets);

	for (cur = ISC_LIST_HEAD(qid->sock_table[bucket]);
	     cur != NULL;
	     cur = ISC_LIST_NEXT(cur, blink))
	{
		if (cur->portentry == NULL)
			continue;
		if (cur->portentry->port != port)
			continue;
		if (isc_sockaddr_equal(dest, &cur->host))
			return (cur);
	}

	return (NULL);
}

663 664
/*%
 * Make a new socket for a single dispatch with a random port number.
665
 * The caller must hold the disp->lock
666 667 668
 */
static isc_result_t
get_dispsocket(dns_dispatch_t *disp, isc_sockaddr_t *dest,
669 670
	       isc_socketmgr_t *sockmgr, dispsocket_t **dispsockp,
	       in_port_t *portp)
671 672 673 674 675 676 677 678
{
	int i;
	isc_uint32_t r;
	dns_dispatchmgr_t *mgr = disp->mgr;
	isc_socket_t *sock = NULL;
	isc_result_t result = ISC_R_FAILURE;
	in_port_t port;
	isc_sockaddr_t localaddr;
679
	unsigned int bucket = 0;
680 681 682
	dispsocket_t *dispsock;
	unsigned int nports;
	in_port_t *ports;
683
	unsigned int bindoptions;
684
	dispportentry_t *portentry = NULL;
685
	dns_qid_t *qid;
686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710

	if (isc_sockaddr_pf(&disp->local) == AF_INET) {
		nports = disp->mgr->nv4ports;
		ports = disp->mgr->v4ports;
	} else {
		nports = disp->mgr->nv6ports;
		ports = disp->mgr->v6ports;
	}
	if (nports == 0)
		return (ISC_R_ADDRNOTAVAIL);

	dispsock = ISC_LIST_HEAD(disp->inactivesockets);
	if (dispsock != NULL) {
		ISC_LIST_UNLINK(disp->inactivesockets, dispsock, link);
		sock = dispsock->socket;
		dispsock->socket = NULL;
	} else {
		dispsock = isc_mempool_get(mgr->spool);
		if (dispsock == NULL)
			return (ISC_R_NOMEMORY);

		disp->nsockets++;
		dispsock->socket = NULL;
		dispsock->disp = disp;
		dispsock->resp = NULL;
711
		dispsock->portentry = NULL;
712 713 714 715
		isc_random_get(&r);
		dispsock->task = NULL;
		isc_task_attach(disp->task[r % disp->ntasks], &dispsock->task);
		ISC_LINK_INIT(dispsock, link);
716
		ISC_LINK_INIT(dispsock, blink);
717 718 719 720 721 722 723 724 725
		dispsock->magic = DISPSOCK_MAGIC;
	}

	/*
	 * Pick up a random UDP port and open a new socket with it.  Avoid
	 * choosing ports that share the same destination because it will be
	 * very likely to fail in bind(2) or connect(2).
	 */
	localaddr = disp->local;
726 727
	qid = DNS_QID(disp);

728
	for (i = 0; i < 64; i++) {
729
		port = ports[isc_rng_uniformrandom(DISP_RNGCTX(disp), nports)];
730 731
		isc_sockaddr_setport(&localaddr, port);

732
		LOCK(&qid->lock);
733
		bucket = dns_hash(qid, dest, 0, port);
734 735
		if (socket_search(qid, dest, port, bucket) != NULL) {
			UNLOCK(&qid->lock);
736
			continue;
737 738
		}
		UNLOCK(&qid->lock);
739
		bindoptions = 0;
740
		portentry = port_search(disp, port);
741

742 743
		if (portentry != NULL)
			bindoptions |= ISC_SOCKET_REUSEADDRESS;
744
		result = open_socket(sockmgr, &localaddr, bindoptions, &sock,
Automatic Updater's avatar
Automatic Updater committed
745
				     NULL);
746 747 748 749 750 751 752
		if (result == ISC_R_SUCCESS) {
			if (portentry == NULL) {
				portentry = new_portentry(disp, port);
				if (portentry == NULL) {
					result = ISC_R_NOMEMORY;
					break;
				}
753 754 755 756
			} else {
				LOCK(&qid->lock);
				portentry->refs++;
				UNLOCK(&qid->lock);
757 758
			}
			break;
759 760 761 762 763 764
		} else if (result == ISC_R_NOPERM) {
			char buf[ISC_SOCKADDR_FORMATSIZE];
			isc_sockaddr_format(&localaddr, buf, sizeof(buf));
			dispatch_log(disp, ISC_LOG_WARNING,
				     "open_socket(%s) -> %s: continuing",
				     buf, isc_result_totext(result));
765
		} else if (result != ISC_R_ADDRINUSE)
766 767 768 769 770
			break;
	}

	if (result == ISC_R_SUCCESS) {
		dispsock->socket = sock;
771
		dispsock->host = *dest;
772
		dispsock->portentry = portentry;
773
		dispsock->bucket = bucket;
774
		LOCK(&qid->lock);
775
		ISC_LIST_APPEND(qid->sock_table[bucket], dispsock, blink);
776
		UNLOCK(&qid->lock);
777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798
		*dispsockp = dispsock;
		*portp = port;
	} else {
		/*
		 * We could keep it in the inactive list, but since this should
		 * be an exceptional case and might be resource shortage, we'd
		 * rather destroy it.
		 */
		if (sock != NULL)
			isc_socket_detach(&sock);
		destroy_dispsocket(disp, &dispsock);
	}

	return (result);
}

/*%
 * Destroy a dedicated dispatch socket: release its port entry, socket and
 * task, remove it from the socket table if it is still linked there, and
 * return it to the manager's pool.  '*dispsockp' is set to NULL.
 *
 * The dispsocket must already be off the active/inactive list.
 */
static void
destroy_dispsocket(dns_dispatch_t *disp, dispsocket_t **dispsockp) {
	dispsocket_t *victim;
	dns_qid_t *qid;

	/*
	 * The dispatch must be locked.
	 */

	REQUIRE(dispsockp != NULL && *dispsockp != NULL);
	victim = *dispsockp;
	REQUIRE(!ISC_LINK_LINKED(victim, link));

	disp->nsockets--;
	victim->magic = 0;

	if (victim->portentry != NULL) {
		deref_portentry(disp, &victim->portentry);
	}
	if (victim->socket != NULL) {
		isc_socket_detach(&victim->socket);
	}
	if (ISC_LINK_LINKED(victim, blink)) {
		qid = DNS_QID(disp);
		LOCK(&qid->lock);
		ISC_LIST_UNLINK(qid->sock_table[victim->bucket], victim,
				blink);
		UNLOCK(&qid->lock);
	}
	if (victim->task != NULL) {
		isc_task_detach(&victim->task);
	}
	isc_mempool_put(disp->mgr->spool, victim);

	*dispsockp = NULL;
}

/*%
 * Deactivate a dedicated dispatch socket.  Move it to the inactive list for
 * future reuse unless the total number of sockets exceeds
 * DNS_DISPATCH_POOLSOCKS or the underlying socket cannot be closed for
 * reuse; in those cases the dispsocket is destroyed instead.
 */
static void
deactivate_dispsocket(dns_dispatch_t *disp, dispsocket_t *dispsock) {
	isc_result_t closeresult;
	dns_qid_t *qid;

	/*
	 * The dispatch must be locked.
	 */
	ISC_LIST_UNLINK(disp->activesockets, dispsock, link);
	if (dispsock->resp != NULL) {
		INSIST(dispsock->resp->dispsocket == dispsock);
		dispsock->resp->dispsocket = NULL;
	}

	INSIST(dispsock->portentry != NULL);
	deref_portentry(disp, &dispsock->portentry);

	if (disp->nsockets > DNS_DISPATCH_POOLSOCKS) {
		/* The pool is already full. */
		destroy_dispsocket(disp, &dispsock);
		return;
	}

	closeresult = isc_socket_close(dispsock->socket);

	qid = DNS_QID(disp);
	LOCK(&qid->lock);
	ISC_LIST_UNLINK(qid->sock_table[dispsock->bucket], dispsock,
			blink);
	UNLOCK(&qid->lock);

	if (closeresult != ISC_R_SUCCESS) {
		/*
		 * If the underlying system does not allow this
		 * optimization, destroy this temporary structure (and
		 * create a new one for a new transaction).
		 */
		INSIST(closeresult == ISC_R_NOTIMPLEMENTED);
		destroy_dispsocket(disp, &dispsock);
		return;
	}

	ISC_LIST_APPEND(disp->inactivesockets, dispsock, link);
}
874

875
/*
876
 * Find an entry for query ID 'id', socket address 'dest', and port number
877
 * 'port'.
878 879
 * Return NULL if no such entry exists.
 */
Michael Graff's avatar
Michael Graff committed
880
static dns_dispentry_t *
881 882
entry_search(dns_qid_t *qid, isc_sockaddr_t *dest, dns_messageid_t id,
	     in_port_t port, unsigned int bucket)
883
{
Michael Graff's avatar
Michael Graff committed
884
	dns_dispentry_t *res;
885

Evan Hunt's avatar
Evan Hunt committed
886
	REQUIRE(VALID_QID(qid));
887
	REQUIRE(bucket < qid->qid_nbuckets);
888

889
	res = ISC_LIST_HEAD(qid->qid_table[bucket]);
890 891

	while (res != NULL) {
892
		if (res->id == id && isc_sockaddr_equal(dest, &res->host) &&
893
		    res->port == port) {
894
			return (res);
895
		}
896 897 898 899 900 901
		res = ISC_LIST_NEXT(res, link);
	}

	return (NULL);
}

Michael Graff's avatar
Michael Graff committed
902
static void
903
free_buffer(dns_dispatch_t *disp, void *buf, unsigned int len) {
904
	isc_mempool_t *bpool;
Michael Graff's avatar
Michael Graff committed
905
	INSIST(buf != NULL && len != 0);
906

Michael Graff's avatar
Michael Graff committed
907

908
	switch (disp->socktype) {
909
	case isc_sockettype_tcp:
910 911
		INSIST(disp->tcpbuffers > 0);
		disp->tcpbuffers--;
912
		isc_mem_put(disp->mgr->mctx, buf, len);
913
		break;
914
	case isc_sockettype_udp:
915 916 917 918
		LOCK(&disp->mgr->buffer_lock);
		INSIST(disp->mgr->buffers > 0);
		INSIST(len == disp->mgr->buffersize);
		disp->mgr->buffers--;
919
		bpool = disp->mgr->bpool;
920
		UNLOCK(&disp->mgr->buffer_lock);
921
		isc_mempool_put(bpool, buf);
922 923
		break;
	default:
Michael Graff's avatar
Michael Graff committed
924
		INSIST(0);
925 926
		break;
	}
Michael Graff's avatar
Michael Graff committed
927 928 929
}

static void *
930
allocate_udp_buffer(dns_dispatch_t *disp) {
931
	isc_mempool_t *bpool;
Michael Graff's avatar
Michael Graff committed
932 933
	void *temp;