/*
 * Copyright (C) 2004-2008  Internet Systems Consortium, Inc. ("ISC")
 * Copyright (C) 1998-2003  Internet Software Consortium.
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
 * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
 * PERFORMANCE OF THIS SOFTWARE.
 */

/* $Id: socket.c,v 1.300 2008/08/20 06:16:05 marka Exp $ */

/*! \file */

#include <config.h>
23

24
#include <sys/param.h>
Michael Graff's avatar
Michael Graff committed
25
#include <sys/types.h>
Michael Graff's avatar
Michael Graff committed
26
#include <sys/socket.h>
27
#include <sys/stat.h>
Michael Graff's avatar
Michael Graff committed
28
#include <sys/time.h>
Michael Graff's avatar
Michael Graff committed
29 30
#include <sys/uio.h>

31
#include <errno.h>
Andreas Gustafsson's avatar
Andreas Gustafsson committed
32
#include <fcntl.h>
33 34 35 36 37
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

38
#include <isc/buffer.h>
39
#include <isc/bufferlist.h>
40
#include <isc/condition.h>
41
#include <isc/formatcheck.h>
42
#include <isc/list.h>
Michael Graff's avatar
Michael Graff committed
43
#include <isc/log.h>
44
#include <isc/mem.h>
45
#include <isc/msgs.h>
46
#include <isc/mutex.h>
47
#include <isc/net.h>
48
#include <isc/once.h>
49
#include <isc/platform.h>
Michael Graff's avatar
Michael Graff committed
50
#include <isc/print.h>
51
#include <isc/region.h>
52
#include <isc/socket.h>
53
#include <isc/strerror.h>
54
#include <isc/task.h>
55
#include <isc/thread.h>
Michael Graff's avatar
Michael Graff committed
56
#include <isc/util.h>
57
#include <isc/xml.h>
Bob Halley's avatar
Bob Halley committed
58

59 60 61 62 63 64 65 66 67 68 69 70 71
#ifdef ISC_PLATFORM_HAVESYSUNH
#include <sys/un.h>
#endif
#ifdef ISC_PLATFORM_HAVEKQUEUE
#include <sys/event.h>
#endif
#ifdef ISC_PLATFORM_HAVEEPOLL
#include <sys/epoll.h>
#endif
#ifdef ISC_PLATFORM_HAVEDEVPOLL
#include <sys/devpoll.h>
#endif

72 73
#include "errno2result.h"

74
#ifndef ISC_PLATFORM_USETHREADS
75
#include "socket_p.h"
76
#endif /* ISC_PLATFORM_USETHREADS */
77

78 79 80 81
#if defined(SO_BSDCOMPAT) && defined(__linux__)
#include <sys/utsname.h>
#endif

82
/*%
 * Choose the most preferable multiplex method.
 *
 * The first available of kqueue, epoll and /dev/poll is used; select()
 * is the fallback when the platform offers none of them.
 */
#ifdef ISC_PLATFORM_HAVEKQUEUE
#define USE_KQUEUE
#elif defined (ISC_PLATFORM_HAVEEPOLL)
#define USE_EPOLL
#elif defined (ISC_PLATFORM_HAVEDEVPOLL)
#define USE_DEVPOLL
/*%
 * Per-descriptor record of the directions currently registered with
 * /dev/poll (maintained by watch_fd() and unwatch_fd()).
 */
typedef struct {
	unsigned int want_read : 1,
		want_write : 1;
} pollinfo_t;
#else
#define USE_SELECT
#endif	/* ISC_PLATFORM_HAVEKQUEUE */

/*%
 * Wait state for the non-threaded build.  The event-based methods only
 * carry an event count; select() carries the descriptor sets and their
 * bounds.
 */
#ifndef ISC_PLATFORM_USETHREADS
#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
struct isc_socketwait {
	int nevents;
};
#elif defined (USE_SELECT)
struct isc_socketwait {
	fd_set *readset;
	fd_set *writeset;
	int nfds;
	int maxfd;
};
#endif	/* USE_KQUEUE */
#endif /* !ISC_PLATFORM_USETHREADS */

/*%
 * Maximum number of allowable open sockets.  This is also the maximum
 * allowable socket file descriptor.
 *
 * Care should be taken before modifying this value for select():
 * The API standard doesn't ensure select() accept more than (the system default
 * of) FD_SETSIZE descriptors, and the default size should in fact be fine in
 * the vast majority of cases.  This constant should therefore be increased only
 * when absolutely necessary and possible, i.e., the server is exhausting all
 * available file descriptors (up to FD_SETSIZE) and the select() function
 * and FD_xxx macros support larger values than FD_SETSIZE (which may not
 * always be true, but we keep using some of them to ensure as much
 * portability as possible).  Note also that overall server performance
 * may be rather worsened with a larger value of this constant due to
 * inherent scalability problems of select().
 *
 * As a special note, this value shouldn't have to be touched if
 * this is a build for an authoritative only DNS server.
 */
#ifndef ISC_SOCKET_MAXSOCKETS
#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
#define ISC_SOCKET_MAXSOCKETS 4096
#elif defined(USE_SELECT)
#define ISC_SOCKET_MAXSOCKETS FD_SETSIZE
#endif	/* USE_KQUEUE... */
#endif	/* ISC_SOCKET_MAXSOCKETS */

#ifdef USE_SELECT
/*%
 * Mac OS X needs a special definition to support larger values in select()
 */
#if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE
#ifdef __APPLE__
#define _DARWIN_UNLIMITED_SELECT
#endif	/* __APPLE__ */
#endif	/* ISC_SOCKET_MAXSOCKETS > FD_SETSIZE */
#endif	/* USE_SELECT */

/*%
 * Size of per-FD lock buckets.
 *
 * FDLOCK_ID() maps a descriptor to the index of the lock that guards it;
 * without threads there is a single bucket and every descriptor maps to it.
 */
#ifdef ISC_PLATFORM_USETHREADS
#define FDLOCK_COUNT		1024
#define FDLOCK_ID(fd)		((fd) % FDLOCK_COUNT)
#else
#define FDLOCK_COUNT		1
#define FDLOCK_ID(fd)		0
#endif	/* ISC_PLATFORM_USETHREADS */

/*%
 * Maximum number of events communicated with the kernel.  There should normally
 * be no need for having a large number.
 */
#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
#ifndef ISC_SOCKET_MAXEVENTS
#define ISC_SOCKET_MAXEVENTS	64
#endif
#endif

/*%
 * Some systems define the socket length argument as an int, some as size_t,
 * some as socklen_t.  This is here so it can be easily changed if needed.
 */
#ifndef ISC_SOCKADDR_LEN_T
#define ISC_SOCKADDR_LEN_T unsigned int
#endif

/*%
 * Define what the possible "soft" errors can be.  These are non-fatal returns
 * of various network related functions, like recv() and so on.
 *
 * For some reason, BSDI (and perhaps others) will sometimes return <0
 * from recv() but will have errno==0.  This is broken, but we have to
 * work around it here.
 */
#define SOFT_ERROR(e)	((e) == EAGAIN || \
			 (e) == EWOULDBLOCK || \
			 (e) == EINTR || \
			 (e) == 0)

/*%
 * Expands to the (category, module, level) argument triple used by the
 * logging helpers in this file, for debug level 'x'.
 */
#define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)

/*!<
 * DLVL(90)  --  Function entry/exit and other tracing.
 * DLVL(70)  --  Socket "correctness" -- including returning of events, etc.
 * DLVL(60)  --  Socket data send/receive
 * DLVL(50)  --  Event tracing, including receiving/sending completion events.
 * DLVL(20)  --  Socket creation/destruction.
 */
#define TRACE_LEVEL		90
#define CORRECTNESS_LEVEL	70
#define IOEVENT_LEVEL		60
#define EVENT_LEVEL		50
#define CREATION_LEVEL		20

#define TRACE		DLVL(TRACE_LEVEL)
#define CORRECTNESS	DLVL(CORRECTNESS_LEVEL)
#define IOEVENT		DLVL(IOEVENT_LEVEL)
#define EVENT		DLVL(EVENT_LEVEL)
#define CREATION	DLVL(CREATION_LEVEL)

/*% Type of the internal readable/writable events embedded in each socket. */
typedef isc_event_t intev_t;

/*% Magic number identifying a valid isc_socket, and the matching check. */
#define SOCKET_MAGIC		ISC_MAGIC('I', 'O', 'i', 'o')
#define VALID_SOCKET(t)		ISC_MAGIC_VALID(t, SOCKET_MAGIC)

/*!
 * IPv6 control information.  If the socket is an IPv6 socket we want
 * to collect the destination address and interface so the client can
 * set them on outgoing packets.
 */
#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
#ifndef USE_CMSG
#define USE_CMSG	1
#endif
#endif

/*%
 * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
 * a setsockopt() like interface to request timestamps, and if the OS
 * doesn't do it for us, call gettimeofday() on every UDP receive?
 */
#ifdef SO_TIMESTAMP
#ifndef USE_CMSG
#define USE_CMSG	1
#endif
#endif

/*%
 * The size to raise the receive buffer to (from BIND 8).
 */
#define RCVBUFSIZE (32*1024)

/*%
 * The number of times a send operation is repeated if the result is EINTR.
 */
#define NRETRIES 10

struct isc_socket {
	/* Not locked. */
255 256 257 258
	unsigned int		magic;
	isc_socketmgr_t	       *manager;
	isc_mutex_t		lock;
	isc_sockettype_t	type;
Michael Graff's avatar
Michael Graff committed
259

260
	/* Locked by socket lock. */
261
	ISC_LINK(isc_socket_t)	link;
262 263
	unsigned int		references;
	int			fd;
264
	int			pf;
265 266 267
	char				name[16];
	void *				tag;

268
	ISC_LIST(isc_socketevent_t)		send_list;
269
	ISC_LIST(isc_socketevent_t)		recv_list;
270
	ISC_LIST(isc_socket_newconnev_t)	accept_list;
271 272 273 274 275 276 277
	isc_socket_connev_t		       *connect_ev;

	/*
	 * Internal events.  Posted when a descriptor is readable or
	 * writable.  These are statically allocated and never freed.
	 * They will be set to non-purgable before use.
	 */
278 279
	intev_t			readable_ev;
	intev_t			writable_ev;
280

281
	isc_sockaddr_t		peer_address;  /* remote address */
282

283 284 285 286 287 288 289
	unsigned int		pending_recv : 1,
				pending_send : 1,
				pending_accept : 1,
				listener : 1, /* listener socket */
				connected : 1,
				connecting : 1, /* connect pending */
				bound : 1; /* bound to local addr */
290

291
#ifdef ISC_NET_RECVOVERFLOW
292
	unsigned char		overflow; /* used for MSG_TRUNC fake */
293
#endif
294 295 296 297 298

	char			*recvcmsgbuf;
	ISC_SOCKADDR_LEN_T	recvcmsgbuflen;
	char			*sendcmsgbuf;
	ISC_SOCKADDR_LEN_T	sendcmsgbuflen;
299 300 301 302 303

	void			*fdwatcharg;
	isc_sockfdwatch_t	fdwatchcb;
	int			fdwatchflags;
	isc_task_t		*fdwatchtask;
304 305
};

306 307 308
#define SOCKET_MANAGER_MAGIC	ISC_MAGIC('I', 'O', 'm', 'g')
#define VALID_MANAGER(m)	ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)

309 310
struct isc_socketmgr {
	/* Not locked. */
311 312 313
	unsigned int		magic;
	isc_mem_t	       *mctx;
	isc_mutex_t		lock;
314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329
	isc_mutex_t		*fdlock;
#ifdef USE_KQUEUE
	int			kqueue_fd;
	int			nevents;
	struct kevent		*events;
#endif	/* USE_KQUEUE */
#ifdef USE_EPOLL
	int			epoll_fd;
	int			nevents;
	struct epoll_event	*events;
#endif	/* USE_EPOLL */
#ifdef USE_DEVPOLL
	int			devpoll_fd;
	int			nevents;
	struct pollfd		*events;
#endif	/* USE_DEVPOLL */
330 331 332
#ifdef USE_SELECT
	int			fd_bufsize;
#endif	/* USE_SELECT */
333 334 335 336 337 338 339 340 341 342 343 344
	unsigned int		maxsocks;
#ifdef ISC_PLATFORM_USETHREADS
	int			pipe_fds[2];
#endif

	/* Locked by fdlock. */
	isc_socket_t	       **fds;
	int			*fdstate;
#ifdef USE_DEVPOLL
	pollinfo_t		*fdpollinfo;
#endif

345
	/* Locked by manager lock. */
346
	ISC_LIST(isc_socket_t)	socklist;
347
#ifdef USE_SELECT
348 349 350 351
	fd_set			*read_fds;
	fd_set			*read_fds_copy;
	fd_set			*write_fds;
	fd_set			*write_fds_copy;
352
	int			maxfd;
353
#endif	/* USE_SELECT */
354 355 356
#ifdef ISC_PLATFORM_USETHREADS
	isc_thread_t		watcher;
	isc_condition_t		shutdown_ok;
357
#else /* ISC_PLATFORM_USETHREADS */
358
	unsigned int		refs;
359
#endif /* ISC_PLATFORM_USETHREADS */
360 361
};

362 363
#ifndef ISC_PLATFORM_USETHREADS
/*% File-scope socket manager pointer for the non-threaded build. */
static isc_socketmgr_t *socketmgr = NULL;
#endif /* ISC_PLATFORM_USETHREADS */

/*
 * Per-descriptor states stored in the manager's fdstate[] array.
 */
#define CLOSED			0	/* this one must be zero */
#define MANAGED			1
#define CLOSE_PENDING		2

/*
 * send() and recv() iovec counts
 */
#define MAXSCATTERGATHER_SEND	(ISC_SOCKET_MAXSCATTERGATHER)
#ifdef ISC_NET_RECVOVERFLOW
# define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER + 1)
#else
# define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER)
#endif

/*
 * Forward declarations of file-local helpers.
 */
static void send_recvdone_event(isc_socket_t *, isc_socketevent_t **);
static void send_senddone_event(isc_socket_t *, isc_socketevent_t **);
static void free_socket(isc_socket_t **);
static isc_result_t allocate_socket(isc_socketmgr_t *, isc_sockettype_t,
				    isc_socket_t **);
static void destroy(isc_socket_t **);
static void internal_accept(isc_task_t *, isc_event_t *);
static void internal_connect(isc_task_t *, isc_event_t *);
static void internal_recv(isc_task_t *, isc_event_t *);
static void internal_send(isc_task_t *, isc_event_t *);
static void internal_fdwatch_write(isc_task_t *, isc_event_t *);
static void internal_fdwatch_read(isc_task_t *, isc_event_t *);
static void process_cmsg(isc_socket_t *, struct msghdr *, isc_socketevent_t *);
static void build_msghdr_send(isc_socket_t *, isc_socketevent_t *,
			      struct msghdr *, struct iovec *, size_t *);
static void build_msghdr_recv(isc_socket_t *, isc_socketevent_t *,
			      struct msghdr *, struct iovec *, size_t *);
#ifdef ISC_PLATFORM_USETHREADS
static isc_boolean_t process_ctlfd(isc_socketmgr_t *manager);
#endif

/*
 * Messages passed to select_poke() / wakeup_socket().  All are negative;
 * ACCEPT aliases READ and CONNECT aliases WRITE.
 */
#define SELECT_POKE_SHUTDOWN		(-1)
#define SELECT_POKE_NOTHING		(-2)
#define SELECT_POKE_READ		(-3)
#define SELECT_POKE_ACCEPT		(-3) /*%< Same as _READ */
#define SELECT_POKE_WRITE		(-4)
#define SELECT_POKE_CONNECT		(-4) /*%< Same as _WRITE */
#define SELECT_POKE_CLOSE		(-5)

/*% True when no references to socket 's' remain. */
#define SOCK_DEAD(s)			((s)->references == 0)

static void
manager_log(isc_socketmgr_t *sockmgr,
	    isc_logcategory_t *category, isc_logmodule_t *module, int level,
	    const char *fmt, ...) ISC_FORMAT_PRINTF(5, 6);

/*
 * Log a printf-style message about socket manager 'sockmgr'.
 *
 * Nothing is formatted unless isc_log_wouldlog() reports that 'level'
 * would be logged.  The formatted text is bounded by a 2048-byte buffer
 * and is written prefixed with the manager's address.
 */
static void
manager_log(isc_socketmgr_t *sockmgr,
	    isc_logcategory_t *category, isc_logmodule_t *module, int level,
	    const char *fmt, ...)
{
	char msgbuf[2048];
	va_list ap;

	if (! isc_log_wouldlog(isc_lctx, level))
		return;

	va_start(ap, fmt);
	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
	va_end(ap);

	/* %p expects a void pointer. */
	isc_log_write(isc_lctx, category, module, level,
		      "sockmgr %p: %s", (void *)sockmgr, msgbuf);
}

static void
socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
	   isc_msgcat_t *msgcat, int msgset, int message,
	   const char *fmt, ...) ISC_FORMAT_PRINTF(9, 10);

/*
 * Log a printf-style message about socket 'sock'.
 *
 * Nothing is formatted unless isc_log_wouldlog() reports that 'level'
 * would be logged.  When 'address' is non-NULL its textual form is
 * included after the socket's address.  'msgcat', 'msgset' and 'message'
 * are handed to isc_log_iwrite() unchanged.  The formatted text is
 * bounded by a 2048-byte buffer.
 */
static void
socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
	   isc_msgcat_t *msgcat, int msgset, int message,
	   const char *fmt, ...)
{
	char msgbuf[2048];
	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
	va_list ap;

	if (! isc_log_wouldlog(isc_lctx, level))
		return;

	va_start(ap, fmt);
	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
	va_end(ap);

	/* %p expects a void pointer. */
	if (address == NULL) {
		isc_log_iwrite(isc_lctx, category, module, level,
			       msgcat, msgset, message,
			       "socket %p: %s", (void *)sock, msgbuf);
	} else {
		isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
		isc_log_iwrite(isc_lctx, category, module, level,
			       msgcat, msgset, message,
			       "socket %p %s: %s", (void *)sock, peerbuf,
			       msgbuf);
	}
}

/*
 * Start watching 'fd' with the multiplex method chosen at compile time.
 *
 * 'msg' selects the direction: SELECT_POKE_READ watches for readability.
 * In the kqueue, epoll and /dev/poll builds any other value watches for
 * writability; in the select() build only SELECT_POKE_WRITE does.
 *
 * Returns ISC_R_SUCCESS, or the result derived from errno when the
 * registration fails.  With epoll, an already-registered descriptor
 * (EEXIST) is not treated as an error.
 */
static inline isc_result_t
watch_fd(isc_socketmgr_t *manager, int fd, int msg) {
	isc_result_t result = ISC_R_SUCCESS;

#ifdef USE_KQUEUE
	struct kevent evchange;

	memset(&evchange, 0, sizeof(evchange));
	if (msg == SELECT_POKE_READ)
		evchange.filter = EVFILT_READ;
	else
		evchange.filter = EVFILT_WRITE;
	evchange.flags = EV_ADD;
	evchange.ident = fd;
	if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
		result = isc__errno2result(errno);

	return (result);
#elif defined(USE_EPOLL)
	struct epoll_event event;

	/*
	 * Clear the whole structure first: 'data' is a union wider than
	 * its 'fd' member, so setting data.fd alone would pass
	 * uninitialized bytes to the kernel.
	 */
	memset(&event, 0, sizeof(event));
	if (msg == SELECT_POKE_READ)
		event.events = EPOLLIN;
	else
		event.events = EPOLLOUT;
	event.data.fd = fd;
	if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_ADD, fd, &event) == -1 &&
	    errno != EEXIST) {
		result = isc__errno2result(errno);
	}

	return (result);
#elif defined(USE_DEVPOLL)
	struct pollfd pfd;
	int lockid = FDLOCK_ID(fd);

	memset(&pfd, 0, sizeof(pfd));
	if (msg == SELECT_POKE_READ)
		pfd.events = POLLIN;
	else
		pfd.events = POLLOUT;
	pfd.fd = fd;
	pfd.revents = 0;
	LOCK(&manager->fdlock[lockid]);
	if (write(manager->devpoll_fd, &pfd, sizeof(pfd)) == -1)
		result = isc__errno2result(errno);
	else {
		/* Remember the direction so unwatch_fd() can restore it. */
		if (msg == SELECT_POKE_READ)
			manager->fdpollinfo[fd].want_read = 1;
		else
			manager->fdpollinfo[fd].want_write = 1;
	}
	UNLOCK(&manager->fdlock[lockid]);

	return (result);
#elif defined(USE_SELECT)
	LOCK(&manager->lock);
	if (msg == SELECT_POKE_READ)
		FD_SET(fd, manager->read_fds);
	else if (msg == SELECT_POKE_WRITE)
		FD_SET(fd, manager->write_fds);
	UNLOCK(&manager->lock);

	return (result);
#endif
}

/*
 * Stop watching 'fd' with the multiplex method chosen at compile time.
 *
 * 'msg' selects the direction: SELECT_POKE_READ cancels the read watch.
 * In the kqueue, epoll and /dev/poll builds any other value cancels the
 * write watch; in the select() build only SELECT_POKE_WRITE does.
 *
 * Returns ISC_R_SUCCESS on success.  With epoll, a descriptor that is not
 * registered (ENOENT) is not treated as an error; any other epoll failure
 * is reported and yields ISC_R_UNEXPECTED.
 */
static inline isc_result_t
unwatch_fd(isc_socketmgr_t *manager, int fd, int msg) {
	isc_result_t result = ISC_R_SUCCESS;

#ifdef USE_KQUEUE
	struct kevent evchange;

	memset(&evchange, 0, sizeof(evchange));
	if (msg == SELECT_POKE_READ)
		evchange.filter = EVFILT_READ;
	else
		evchange.filter = EVFILT_WRITE;
	evchange.flags = EV_DELETE;
	evchange.ident = fd;
	if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
		result = isc__errno2result(errno);

	return (result);
#elif defined(USE_EPOLL)
	struct epoll_event event;

	/*
	 * Clear the whole structure first: 'data' is a union wider than
	 * its 'fd' member, so setting data.fd alone would pass
	 * uninitialized bytes to the kernel.
	 */
	memset(&event, 0, sizeof(event));
	if (msg == SELECT_POKE_READ)
		event.events = EPOLLIN;
	else
		event.events = EPOLLOUT;
	event.data.fd = fd;
	if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_DEL, fd, &event) == -1 &&
	    errno != ENOENT) {
		char strbuf[ISC_STRERRORSIZE];
		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "epoll_ctl(DEL), %d: %s", fd, strbuf);
		result = ISC_R_UNEXPECTED;
	}
	return (result);
#elif defined(USE_DEVPOLL)
	struct pollfd pfds[2];
	size_t writelen = sizeof(pfds[0]);
	int lockid = FDLOCK_ID(fd);

	memset(pfds, 0, sizeof(pfds));
	pfds[0].events = POLLREMOVE;
	pfds[0].fd = fd;

	/*
	 * Canceling read or write polling via /dev/poll is tricky.  Since it
	 * only provides a way of canceling per FD, we may need to re-poll the
	 * socket for the other operation.
	 */
	LOCK(&manager->fdlock[lockid]);
	if (msg == SELECT_POKE_READ &&
	    manager->fdpollinfo[fd].want_write == 1) {
		pfds[1].events = POLLOUT;
		pfds[1].fd = fd;
		writelen += sizeof(pfds[1]);
	}
	if (msg == SELECT_POKE_WRITE &&
	    manager->fdpollinfo[fd].want_read == 1) {
		pfds[1].events = POLLIN;
		pfds[1].fd = fd;
		writelen += sizeof(pfds[1]);
	}

	if (write(manager->devpoll_fd, pfds, writelen) == -1)
		result = isc__errno2result(errno);
	else {
		if (msg == SELECT_POKE_READ)
			manager->fdpollinfo[fd].want_read = 0;
		else
			manager->fdpollinfo[fd].want_write = 0;
	}
	UNLOCK(&manager->fdlock[lockid]);

	return (result);
#elif defined(USE_SELECT)
	LOCK(&manager->lock);
	if (msg == SELECT_POKE_READ)
		FD_CLR(fd, manager->read_fds);
	else if (msg == SELECT_POKE_WRITE)
		FD_CLR(fd, manager->write_fds);
	UNLOCK(&manager->lock);

	return (result);
#endif
}

static void
622
wakeup_socket(isc_socketmgr_t *manager, int fd, int msg) {
623 624
	isc_result_t result;
	int lockid = FDLOCK_ID(fd);
625 626

	/*
627 628 629
	 * This is a wakeup on a socket.  If the socket is not in the
	 * process of being closed, start watching it for either reads
	 * or writes.
630
	 */
Andreas Gustafsson's avatar
 
Andreas Gustafsson committed
631

632
	INSIST(fd >= 0 && fd < (int)manager->maxsocks);
633

634 635 636
	if (msg == SELECT_POKE_CLOSE) {
		/* No one should be updating fdstate, so no need to lock it */
		INSIST(manager->fdstate[fd] == CLOSE_PENDING);
Michael Graff's avatar
Michael Graff committed
637
		manager->fdstate[fd] = CLOSED;
638 639 640 641 642 643 644 645
		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
		(void)close(fd);
		return;
	}

	LOCK(&manager->fdlock[lockid]);
	if (manager->fdstate[fd] == CLOSE_PENDING) {
646 647 648 649 650 651 652 653 654 655 656 657
		UNLOCK(&manager->fdlock[lockid]);

		/*
		 * We accept (and ignore) any error from unwatch_fd() as we are
		 * closing the socket, hoping it doesn't leave dangling state in
		 * the kernel.
		 * Note that unwatch_fd() must be called after releasing the
		 * fdlock; otherwise it could cause deadlock due to a lock order
		 * reversal.
		 */
		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
658 659
		return;
	}
660 661
	if (manager->fdstate[fd] != MANAGED) {
		UNLOCK(&manager->fdlock[lockid]);
662
		return;
663 664
	}
	UNLOCK(&manager->fdlock[lockid]);
665 666

	/*
Mark Andrews's avatar
Mark Andrews committed
667
	 * Set requested bit.
668
	 */
669 670 671 672 673 674 675 676 677 678 679 680
	result = watch_fd(manager, fd, msg);
	if (result != ISC_R_SUCCESS) {
		/*
		 * XXXJT: what should we do?  Ignoring the failure of watching
		 * a socket will make the application dysfunctional, but there
		 * seems to be no reasonable recovery process.
		 */
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
			      "failed to start watching FD (%d): %s",
			      fd, isc_result_totext(result));
	}
681 682 683
}

#ifdef ISC_PLATFORM_USETHREADS
684
/*
Michael Graff's avatar
Michael Graff committed
685
 * Poke the select loop when there is something for us to do.
686 687
 * The write is required (by POSIX) to complete.  That is, we
 * will not get partial writes.
688 689
 */
static void
690
select_poke(isc_socketmgr_t *mgr, int fd, int msg) {
Michael Graff's avatar
Michael Graff committed
691
	int cc;
692
	int buf[2];
693
	char strbuf[ISC_STRERRORSIZE];
694 695 696

	buf[0] = fd;
	buf[1] = msg;
Michael Graff's avatar
Michael Graff committed
697

698
	do {
699
		cc = write(mgr->pipe_fds[1], buf, sizeof(buf));
700 701 702 703 704 705 706 707 708 709
#ifdef ENOSR
		/*
		 * Treat ENOSR as EAGAIN but loop slowly as it is
		 * unlikely to clear fast.
		 */
		if (cc < 0 && errno == ENOSR) {
			sleep(1);
			errno = EAGAIN;
		}
#endif
710
	} while (cc < 0 && SOFT_ERROR(errno));
711

712 713
	if (cc < 0) {
		isc__strerror(errno, strbuf, sizeof(strbuf));
Michael Graff's avatar
Michael Graff committed
714
		FATAL_ERROR(__FILE__, __LINE__,
715 716 717 718
			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
					   ISC_MSG_WRITEFAILED,
					   "write() failed "
					   "during watcher poke: %s"),
719 720
			    strbuf);
	}
721

722
	INSIST(cc == sizeof(buf));
723 724 725
}

/*
Andreas Gustafsson's avatar
Andreas Gustafsson committed
726
 * Read a message on the internal fd.
727
 */
728 729 730
static void
select_readmsg(isc_socketmgr_t *mgr, int *fd, int *msg) {
	int buf[2];
Michael Graff's avatar
Michael Graff committed
731
	int cc;
732
	char strbuf[ISC_STRERRORSIZE];
Michael Graff's avatar
Michael Graff committed
733

734
	cc = read(mgr->pipe_fds[0], buf, sizeof(buf));
Michael Graff's avatar
Michael Graff committed
735
	if (cc < 0) {
736
		*msg = SELECT_POKE_NOTHING;
737
		*fd = -1;	/* Silence compiler. */
Michael Graff's avatar
Michael Graff committed
738
		if (SOFT_ERROR(errno))
739
			return;
Michael Graff's avatar
Michael Graff committed
740

741
		isc__strerror(errno, strbuf, sizeof(strbuf));
Michael Graff's avatar
Michael Graff committed
742
		FATAL_ERROR(__FILE__, __LINE__,
743 744 745 746
			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
					   ISC_MSG_READFAILED,
					   "read() failed "
					   "during watcher poke: %s"),
747
			    strbuf);
Automatic Updater's avatar
Automatic Updater committed
748

749
		return;
Michael Graff's avatar
Michael Graff committed
750
	}
751
	INSIST(cc == sizeof(buf));
752

753 754
	*fd = buf[0];
	*msg = buf[1];
755
}
756
#else /* ISC_PLATFORM_USETHREADS */
757 758 759 760
/*
 * Update the state of the socketmgr when something changes.
 */
static void
761
select_poke(isc_socketmgr_t *manager, int fd, int msg) {
762 763
	if (msg == SELECT_POKE_SHUTDOWN)
		return;
764 765
	else if (fd >= 0)
		wakeup_socket(manager, fd, msg);
766 767
	return;
}
768
#endif /* ISC_PLATFORM_USETHREADS */
769 770

/*
Andreas Gustafsson's avatar
Andreas Gustafsson committed
771
 * Make a fd non-blocking.
772
 */
Michael Graff's avatar
Michael Graff committed
773
static isc_result_t
774
make_nonblock(int fd) {
Michael Graff's avatar
Michael Graff committed
775 776
	int ret;
	int flags;
777
	char strbuf[ISC_STRERRORSIZE];
778 779
#ifdef USE_FIONBIO_IOCTL
	int on = 1;
780

781 782
	ret = ioctl(fd, FIONBIO, (char *)&on);
#else
Michael Graff's avatar
Michael Graff committed
783
	flags = fcntl(fd, F_GETFL, 0);
784
	flags |= PORT_NONBLOCK;
Michael Graff's avatar
Michael Graff committed
785
	ret = fcntl(fd, F_SETFL, flags);
786
#endif
787

Michael Graff's avatar
Michael Graff committed
788
	if (ret == -1) {
789
		isc__strerror(errno, strbuf, sizeof(strbuf));
Michael Graff's avatar
Michael Graff committed
790
		UNEXPECTED_ERROR(__FILE__, __LINE__,
791 792 793 794 795 796
#ifdef USE_FIONBIO_IOCTL
				 "ioctl(%d, FIONBIO, &on): %s", fd,
#else
				 "fcntl(%d, F_SETFL, %d): %s", fd, flags,
#endif
				 strbuf);
Michael Graff's avatar
Michael Graff committed
797

Michael Graff's avatar
Michael Graff committed
798
		return (ISC_R_UNEXPECTED);
Michael Graff's avatar
Michael Graff committed
799 800
	}

Michael Graff's avatar
Michael Graff committed
801
	return (ISC_R_SUCCESS);
802 803
}

804 805 806 807 808 809 810 811 812 813 814 815 816 817 818
#ifdef USE_CMSG
/*
 * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE.
 * In order to ensure as much portability as possible, we provide wrapper
 * functions of these macros.
 * Note that cmsg_space() could run slow on OSes that do not have
 * CMSG_SPACE.
 */

/*
 * Return the cmsg_len value for a control message carrying 'len' bytes
 * of data: CMSG_LEN(len) where available, otherwise the offset of the
 * data area within a cmsghdr plus 'len'.
 */
static inline ISC_SOCKADDR_LEN_T
cmsg_len(ISC_SOCKADDR_LEN_T len) {
#ifdef CMSG_LEN
	return (CMSG_LEN(len));
#else
	ISC_SOCKADDR_LEN_T hdrlen;

	/*
	 * Cast NULL so that any pointer arithmetic performed by CMSG_DATA
	 * is correct.
	 */
	hdrlen = (ISC_SOCKADDR_LEN_T)CMSG_DATA(((struct cmsghdr *)NULL));
	return (hdrlen + len);
#endif
}

/*
 * Return the buffer space needed for a control message carrying 'len'
 * bytes of data: CMSG_SPACE(len) where available.  Otherwise a dummy
 * control buffer is set up and the distance from its start to the
 * header that CMSG_NXTHDR() reports as following the first message is
 * returned, or 0 when CMSG_NXTHDR() reports no further header.
 */
static inline ISC_SOCKADDR_LEN_T
cmsg_space(ISC_SOCKADDR_LEN_T len) {
#ifdef CMSG_SPACE
	return (CMSG_SPACE(len));
#else
	struct msghdr msg;
	struct cmsghdr *cmsgp;
	/*
	 * XXX: The buffer length is an ad-hoc value, but should be enough
	 * in a practical sense.
	 */
	char dummybuf[sizeof(struct cmsghdr) + 1024];

	memset(&msg, 0, sizeof(msg));
	msg.msg_control = dummybuf;
	msg.msg_controllen = sizeof(dummybuf);

	cmsgp = (struct cmsghdr *)dummybuf;
	cmsgp->cmsg_len = cmsg_len(len);

	cmsgp = CMSG_NXTHDR(&msg, cmsgp);
	if (cmsgp != NULL)
		return ((char *)cmsgp - (char *)msg.msg_control);
	else
		return (0);
#endif
}
#endif /* USE_CMSG */

857 858 859 860
/*
 * Process control messages received on a socket.
 */
static void
861
process_cmsg(isc_socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
Michael Graff's avatar
Michael Graff committed
862
#ifdef USE_CMSG
863
	struct cmsghdr *cmsgp;
864
#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
Michael Graff's avatar
Michael Graff committed
865 866 867 868 869 870 871
	struct in6_pktinfo *pktinfop;
#endif
#ifdef SO_TIMESTAMP
	struct timeval *timevalp;
#endif
#endif

872 873 874 875 876 877
	/*
	 * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined.
	 * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined.
	 * They are all here, outside of the CPP tests, because it is
	 * more consistent with the usual ISC coding style.
	 */
878
	UNUSED(sock);
879 880 881
	UNUSED(msg);
	UNUSED(dev);

882
#ifdef ISC_NET_BSD44MSGHDR
883

Bob Halley's avatar
Bob Halley committed
884
#ifdef MSG_TRUNC
885 886
	if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC)
		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
Bob Halley's avatar
Bob Halley committed
887
#endif
888

Bob Halley's avatar
Bob Halley committed
889
#ifdef MSG_CTRUNC
890 891
	if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC)
		dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
Bob Halley's avatar
Bob Halley committed
892
#endif
893

Michael Graff's avatar
Michael Graff committed
894 895 896
#ifndef USE_CMSG
	return;
#else
Mark Andrews's avatar
Mark Andrews committed
897
	if (msg->msg_controllen == 0U || msg->msg_control == NULL)
898
		return;
Michael Graff's avatar
Michael Graff committed
899 900 901 902

#ifdef SO_TIMESTAMP
	timevalp = NULL;
#endif
903
#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
Michael Graff's avatar
Michael Graff committed
904 905 906 907 908
	pktinfop = NULL;
#endif

	cmsgp = CMSG_FIRSTHDR(msg);
	while (cmsgp != NULL) {
909
		socket_log(sock, NULL, TRACE,
910 911
			   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PROCESSCMSG,
			   "processing cmsg %p", cmsgp);
Michael Graff's avatar
Michael Graff committed
912

913
#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
914 915
		if (cmsgp->cmsg_level == IPPROTO_IPV6
		    && cmsgp->cmsg_type == IPV6_PKTINFO) {
916

Michael Graff's avatar
Michael Graff committed
917
			pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
918 919
			memcpy(&dev->pktinfo, pktinfop,
			       sizeof(struct in6_pktinfo));
Michael Graff's avatar
Michael Graff committed
920
			dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
921
			socket_log(sock, NULL, TRACE,
922 923 924
				   isc_msgcat, ISC_MSGSET_SOCKET,
				   ISC_MSG_IFRECEIVED,
				   "interface received on ifindex %u",
David Lawrence's avatar
David Lawrence committed
925
				   dev->pktinfo.ipi6_ifindex);
926
			if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr))
Automatic Updater's avatar
Automatic Updater committed
927
				dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;
Michael Graff's avatar
Michael Graff committed
928 929 930 931 932
			goto next;
		}
#endif

#ifdef SO_TIMESTAMP
933 934
		if (cmsgp->cmsg_level == SOL_SOCKET
		    && cmsgp->cmsg_type == SCM_TIMESTAMP) {
Michael Graff's avatar
Michael Graff committed
935 936 937 938 939 940 941 942 943 944 945 946 947
			timevalp = (struct timeval *)CMSG_DATA(cmsgp);
			dev->timestamp.seconds = timevalp->tv_sec;
			dev->timestamp.nanoseconds = timevalp->tv_usec * 1000;
			dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
			goto next;
		}
#endif

	next:
		cmsgp = CMSG_NXTHDR(msg, cmsgp);
	}
#endif /* USE_CMSG */

948
#endif /* ISC_NET_BSD44MSGHDR */
949 950
}

951
/*
 * Construct an iov array and attach it to the msghdr passed in.  This is
 * the SEND constructor, which will use the used region of the buffer
 * (if using a buffer list) or will use the internal region (if a single
 * buffer I/O is requested).
 *
 * Nothing can be NULL, and the done event must list at least one buffer
 * on the buffer linked list for this function to be meaningful.
 *
 * If write_countp != NULL, *write_countp will hold the number of bytes
 * this transaction can send.
 */
static void
964
build_msghdr_send(isc_socket_t *sock, isc_socketevent_t *dev,
965
		  struct msghdr *msg, struct iovec *iov, size_t *write_countp)
966 967 968 969
{
	unsigned int iovcount;
	isc_buffer_t *buffer;
	isc_region_t used;
970 971 972
	size_t write_count;
	size_t skip_count;

Andreas Gustafsson's avatar
Andreas Gustafsson committed
973
	memset(msg, 0, sizeof(*msg));
974

975
	if (!sock->connected) {
976 977
		msg->msg_name = (void *)&dev->address.type.sa;
		msg->msg_namelen = dev->address.length;
Michael Graff's avatar
fix  
Michael Graff committed
978 979 980
	} else {
		msg->msg_name = NULL;
		msg->msg_namelen = 0;
981
	}
982 983

	buffer = ISC_LIST_HEAD(dev->bufferlist);
984
	write_count = 0;
Michael Graff's avatar
fix  
Michael Graff committed
985
	iovcount = 0;
986

987
	/*
988
	 * Single buffer I/O?  Skip what we've done so far in this region.
989 990
	 */
	if (buffer == NULL) {
991