/*
 * Copyright (C) 1998-2016  Internet Systems Consortium, Inc. ("ISC")
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 */
Bob Halley's avatar
Bob Halley committed
8

9
/*! \file */
David Lawrence's avatar
David Lawrence committed
10

Bob Halley's avatar
Bob Halley committed
11
#include <config.h>
12

13
#include <sys/param.h>
Michael Graff's avatar
Michael Graff committed
14
#include <sys/types.h>
Michael Graff's avatar
Michael Graff committed
15
#include <sys/socket.h>
16
#include <sys/stat.h>
Michael Graff's avatar
Michael Graff committed
17
#include <sys/time.h>
Michael Graff's avatar
Michael Graff committed
18
19
#include <sys/uio.h>

Mark Andrews's avatar
Mark Andrews committed
20
21
22
23
24
#if defined(HAVE_LINUX_NETLINK_H) && defined(HAVE_LINUX_RTNETLINK_H)
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#endif

25
#include <errno.h>
Andreas Gustafsson's avatar
Andreas Gustafsson committed
26
#include <fcntl.h>
27
28
29
30
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
Mark Andrews's avatar
Mark Andrews committed
31
32
33
#ifdef HAVE_INTTYPES_H
#include <inttypes.h> /* uintptr_t */
#endif
34

35
#include <isc/buffer.h>
36
#include <isc/bufferlist.h>
37
#include <isc/condition.h>
38
#include <isc/formatcheck.h>
39
#include <isc/json.h>
40
#include <isc/list.h>
Michael Graff's avatar
Michael Graff committed
41
#include <isc/log.h>
42
#include <isc/mem.h>
43
#include <isc/msgs.h>
44
#include <isc/mutex.h>
45
#include <isc/net.h>
46
#include <isc/once.h>
47
#include <isc/platform.h>
Michael Graff's avatar
Michael Graff committed
48
#include <isc/print.h>
49
#include <isc/region.h>
50
#include <isc/resource.h>
51
#include <isc/socket.h>
52
#include <isc/stats.h>
53
#include <isc/strerror.h>
54
#include <isc/task.h>
55
#include <isc/thread.h>
Michael Graff's avatar
Michael Graff committed
56
#include <isc/util.h>
57
#include <isc/xml.h>
Bob Halley's avatar
Bob Halley committed
58

59
60
61
62
63
64
65
66
67
68
#ifdef ISC_PLATFORM_HAVESYSUNH
#include <sys/un.h>
#endif
#ifdef ISC_PLATFORM_HAVEKQUEUE
#include <sys/event.h>
#endif
#ifdef ISC_PLATFORM_HAVEEPOLL
#include <sys/epoll.h>
#endif
#ifdef ISC_PLATFORM_HAVEDEVPOLL
69
#if defined(HAVE_SYS_DEVPOLL_H)
70
#include <sys/devpoll.h>
71
72
73
#elif defined(HAVE_DEVPOLL_H)
#include <devpoll.h>
#endif
74
75
#endif

76
77
#include <netinet/tcp.h>

78
79
#include "errno2result.h"

80
81
82
83
84
85
86
87
/* See task.c about the following definition: */
#ifdef ISC_PLATFORM_USETHREADS
#define USE_WATCHER_THREAD
#else
#define USE_SHARED_MANAGER
#endif	/* ISC_PLATFORM_USETHREADS */

#ifndef USE_WATCHER_THREAD
88
#include "socket_p.h"
89
#include "../task_p.h"
90
#endif /* USE_WATCHER_THREAD */
91

92
93
94
95
#if defined(SO_BSDCOMPAT) && defined(__linux__)
#include <sys/utsname.h>
#endif

96
97
98
99
#ifdef ISC_PLATFORM_HAVETFO
#include <netinet/tcp.h>
#endif

100
/*%
 * Choose the most preferable multiplex method.
 */
#ifdef ISC_PLATFORM_HAVEKQUEUE
#define USE_KQUEUE
#elif defined (ISC_PLATFORM_HAVEEPOLL)
#define USE_EPOLL
#elif defined (ISC_PLATFORM_HAVEDEVPOLL)
#define USE_DEVPOLL
/* Two one-bit flags recording whether read and/or write interest is wanted. */
typedef struct {
	unsigned int want_read : 1;
	unsigned int want_write : 1;
} pollinfo_t;
#else
#define USE_SELECT
#endif	/* ISC_PLATFORM_HAVEKQUEUE */

117
#ifndef USE_WATCHER_THREAD
118
119
120
121
122
123
/*
 * State handed back from a socket wait when no watcher thread is used.
 * The layout depends on the multiplex method selected above: the
 * kqueue/epoll/devpoll builds carry only an event count, while the select()
 * build carries the descriptor sets plus 'nfds' and 'maxfd'.
 */
#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
struct isc_socketwait {
	int nevents;
};
#elif defined (USE_SELECT)
struct isc_socketwait {
	fd_set *readset;
	fd_set *writeset;
	int nfds;
	int maxfd;
};
#endif	/* USE_KQUEUE */
130
#endif /* !USE_WATCHER_THREAD */
131

Evan Hunt's avatar
Evan Hunt committed
132
133
134
135
136
137
138
/*
 * Set by the -T dscp option on the command line. If set to a value
 * other than -1, we check to make sure DSCP values match it, and
 * assert if not.
 */
int isc_dscp_check_value = -1;

139
140
/*%
 * Maximum number of allowable open sockets.  This is also the maximum
 * allowable socket file descriptor.
 *
 * Care should be taken before modifying this value for select():
 * The API standard doesn't ensure select() accepts more than (the system
 * default of) FD_SETSIZE descriptors, and the default size should in fact be
 * fine in the vast majority of cases.  This constant should therefore be
 * increased only when absolutely necessary and possible, i.e., the server is
 * exhausting all available file descriptors (up to FD_SETSIZE) and the
 * select() function and FD_xxx macros support larger values than FD_SETSIZE
 * (which may not always be true, but we keep using some of them to ensure as
 * much portability as possible).  Note also that overall server performance
 * may be rather worsened with a larger value of this constant due to
 * inherent scalability problems of select().
 *
 * As a special note, this value shouldn't have to be touched if
 * this is a build for an authoritative only DNS server.
 */
#ifndef ISC_SOCKET_MAXSOCKETS
159
#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
160
161
162
#ifdef TUNE_LARGE
#define ISC_SOCKET_MAXSOCKETS 21000
#else
163
#define ISC_SOCKET_MAXSOCKETS 4096
164
#endif /* TUNE_LARGE */
165
166
167
168
169
170
171
#elif defined(USE_SELECT)
#define ISC_SOCKET_MAXSOCKETS FD_SETSIZE
#endif	/* USE_KQUEUE... */
#endif	/* ISC_SOCKET_MAXSOCKETS */

#ifdef USE_SELECT
/*%
172
173
 * Mac OS X needs a special definition to support larger values in select().
 * We always define this because a larger value can be specified run-time.
174
175
176
177
178
179
 */
#ifdef __APPLE__
#define _DARWIN_UNLIMITED_SELECT
#endif	/* __APPLE__ */
#endif	/* USE_SELECT */

180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
#ifdef ISC_SOCKET_USE_POLLWATCH
/*%
 * If this macro is defined, enable workaround for a Solaris /dev/poll kernel
 * bug: DP_POLL ioctl could keep sleeping even if socket I/O is possible for
 * some of the specified FD.  The idea is based on the observation that it's
 * likely for a busy server to keep receiving packets.  It specifically works
 * as follows: the socket watcher is first initialized with the state of
 * "poll_idle".  While it's in the idle state it keeps sleeping until a socket
 * event occurs.  When it wakes up for a socket I/O event, it moves to the
 * poll_active state, and sets the poll timeout to a short period
 * (ISC_SOCKET_POLLWATCH_TIMEOUT msec).  If timeout occurs in this state, the
 * watcher goes to the poll_checking state with the same timeout period.
 * In this state, the watcher tries to detect whether this is a break
 * during intermittent events or the kernel bug is triggered.  If the next
 * polling reports an event within the short period, the previous timeout is
 * likely to be a kernel bug, and so the watcher goes back to the active state.
 * Otherwise, it moves to the idle state again.
 *
 * It's not clear whether this is a thread-related bug, but since we've only
 * seen this with threads, this workaround is used only when enabling threads.
 */

/* Watcher states used by the /dev/poll workaround described above. */
typedef enum {
	poll_idle,	/* sleeping until a socket event occurs */
	poll_active,	/* woke up for socket I/O; polling with short timeout */
	poll_checking	/* timed out while active; deciding active vs. idle */
} pollstate_t;

#ifndef ISC_SOCKET_POLLWATCH_TIMEOUT
#define ISC_SOCKET_POLLWATCH_TIMEOUT 10
#endif	/* ISC_SOCKET_POLLWATCH_TIMEOUT */
#endif	/* ISC_SOCKET_USE_POLLWATCH */

209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
/*%
 * Size of per-FD lock buckets.
 */
#ifdef ISC_PLATFORM_USETHREADS
#define FDLOCK_COUNT		1024
#define FDLOCK_ID(fd)		((fd) % FDLOCK_COUNT)
#else
#define FDLOCK_COUNT		1
#define FDLOCK_ID(fd)		0
#endif	/* ISC_PLATFORM_USETHREADS */

/*%
 * Maximum number of events communicated with the kernel.  There should normally
 * be no need for having a large number.
 */
#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
#ifndef ISC_SOCKET_MAXEVENTS
226
227
228
#ifdef TUNE_LARGE
#define ISC_SOCKET_MAXEVENTS	2048
#else
229
#define ISC_SOCKET_MAXEVENTS	64
230
#endif /* TUNE_LARGE */
231
232
233
#endif
#endif

234
/*%
235
 * Some systems define the socket length argument as an int, some as size_t,
236
 * some as socklen_t.  This is here so it can be easily changed if needed.
237
 */
238
#ifndef ISC_SOCKADDR_LEN_T
239
#define ISC_SOCKADDR_LEN_T unsigned int
240
#endif
241

242
/*%
 * Define what the possible "soft" errors can be.  These are non-fatal returns
 * of various network related functions, like recv() and so on.
 *
 * For some reason, BSDI (and perhaps others) will sometimes return <0
 * from recv() but will have errno==0.  This is broken, but we have to
 * work around it here.
 */
250
251
252
253
#define SOFT_ERROR(e)	((e) == EAGAIN || \
			 (e) == EWOULDBLOCK || \
			 (e) == EINTR || \
			 (e) == 0)
254

Michael Graff's avatar
Michael Graff committed
255
#define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)
256

257
/*!<
Michael Graff's avatar
Michael Graff committed
258
259
260
261
262
263
 * DLVL(90)  --  Function entry/exit and other tracing.
 * DLVL(70)  --  Socket "correctness" -- including returning of events, etc.
 * DLVL(60)  --  Socket data send/receive
 * DLVL(50)  --  Event tracing, including receiving/sending completion events.
 * DLVL(20)  --  Socket creation/destruction.
 */
264
265
266
267
268
269
270
271
272
273
274
#define TRACE_LEVEL		90
#define CORRECTNESS_LEVEL	70
#define IOEVENT_LEVEL		60
#define EVENT_LEVEL		50
#define CREATION_LEVEL		20

#define TRACE		DLVL(TRACE_LEVEL)
#define CORRECTNESS	DLVL(CORRECTNESS_LEVEL)
#define IOEVENT		DLVL(IOEVENT_LEVEL)
#define EVENT		DLVL(EVENT_LEVEL)
#define CREATION	DLVL(CREATION_LEVEL)
275

276
typedef isc_event_t intev_t;
Michael Graff's avatar
Michael Graff committed
277

278
#define SOCKET_MAGIC		ISC_MAGIC('I', 'O', 'i', 'o')
279
#define VALID_SOCKET(s)		ISC_MAGIC_VALID(s, SOCKET_MAGIC)
Michael Graff's avatar
Michael Graff committed
280

281
/*!
Michael Graff's avatar
Michael Graff committed
282
283
284
285
 * IPv6 control information.  If the socket is an IPv6 socket we want
 * to collect the destination address and interface so the client can
 * set them on outgoing packets.
 */
286
#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
Michael Graff's avatar
Michael Graff committed
287
288
289
290
291
#ifndef USE_CMSG
#define USE_CMSG	1
#endif
#endif

292
/*%
293
 * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
Michael Graff's avatar
Michael Graff committed
294
295
296
297
298
299
300
301
302
 * a setsockopt() like interface to request timestamps, and if the OS
 * doesn't do it for us, call gettimeofday() on every UDP receive?
 */
#ifdef SO_TIMESTAMP
#ifndef USE_CMSG
#define USE_CMSG	1
#endif
#endif

303
/*%
Francis Dupont's avatar
Francis Dupont committed
304
 * The size to raise the receive buffer to (from BIND 8).
305
 */
306
#ifdef TUNE_LARGE
307
308
309
#ifdef sun
#define RCVBUFSIZE (1*1024*1024)
#else
310
#define RCVBUFSIZE (16*1024*1024)
311
#endif
312
#else
313
#define RCVBUFSIZE (32*1024)
314
#endif /* TUNE_LARGE */
315

316
/*%
317
318
319
320
 * The number of times a send operation is repeated if the result is EINTR.
 */
#define NRETRIES 10

321
322
323
324
325
326
typedef struct isc__socket isc__socket_t;
typedef struct isc__socketmgr isc__socketmgr_t;

#define NEWCONNSOCK(ev) ((isc__socket_t *)(ev)->newsocket)

struct isc__socket {
327
	/* Not locked. */
328
329
	isc_socket_t		common;
	isc__socketmgr_t	*manager;
330
331
	isc_mutex_t		lock;
	isc_sockettype_t	type;
332
	const isc_statscounter_t	*statsindex;
Michael Graff's avatar
Michael Graff committed
333

334
	/* Locked by socket lock. */
335
	ISC_LINK(isc__socket_t)	link;
336
337
	unsigned int		references;
	int			fd;
338
	int			pf;
339
340
341
	char				name[16];
	void *				tag;

342
	ISC_LIST(isc_socketevent_t)		send_list;
343
	ISC_LIST(isc_socketevent_t)		recv_list;
344
	ISC_LIST(isc_socket_newconnev_t)	accept_list;
345
	ISC_LIST(isc_socket_connev_t)		connect_list;
346
347
348
349
350
351

	/*
	 * Internal events.  Posted when a descriptor is readable or
	 * writable.  These are statically allocated and never freed.
	 * They will be set to non-purgable before use.
	 */
352
353
	intev_t			readable_ev;
	intev_t			writable_ev;
354

355
	isc_sockaddr_t		peer_address;       /* remote address */
356

357
358
359
	unsigned int		pending_recv : 1,
				pending_send : 1,
				pending_accept : 1,
360
				listener : 1,       /* listener socket */
361
				connected : 1,
362
363
364
				connecting : 1,     /* connect pending */
				bound : 1,          /* bound to local addr */
				dupped : 1,
365
366
				active : 1,         /* currently active */
				pktdscp : 1;	    /* per packet dscp */
367

368
#ifdef ISC_NET_RECVOVERFLOW
369
	unsigned char		overflow; /* used for MSG_TRUNC fake */
370
#endif
371
372
373
374
375

	char			*recvcmsgbuf;
	ISC_SOCKADDR_LEN_T	recvcmsgbuflen;
	char			*sendcmsgbuf;
	ISC_SOCKADDR_LEN_T	sendcmsgbuflen;
376
377
378
379
380

	void			*fdwatcharg;
	isc_sockfdwatch_t	fdwatchcb;
	int			fdwatchflags;
	isc_task_t		*fdwatchtask;
381
	unsigned int		dscp;
382
383
};

384
385
386
#define SOCKET_MANAGER_MAGIC	ISC_MAGIC('I', 'O', 'm', 'g')
#define VALID_MANAGER(m)	ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)

387
struct isc__socketmgr {
388
	/* Not locked. */
389
	isc_socketmgr_t		common;
390
391
	isc_mem_t	       *mctx;
	isc_mutex_t		lock;
392
	isc_mutex_t		*fdlock;
393
	isc_stats_t		*stats;
394
395
396
397
398
399
400
401
402
403
404
405
#ifdef USE_KQUEUE
	int			kqueue_fd;
	int			nevents;
	struct kevent		*events;
#endif	/* USE_KQUEUE */
#ifdef USE_EPOLL
	int			epoll_fd;
	int			nevents;
	struct epoll_event	*events;
#endif	/* USE_EPOLL */
#ifdef USE_DEVPOLL
	int			devpoll_fd;
406
407
	isc_resourcevalue_t	open_max;
	unsigned int		calls;
408
409
410
	int			nevents;
	struct pollfd		*events;
#endif	/* USE_DEVPOLL */
411
412
413
#ifdef USE_SELECT
	int			fd_bufsize;
#endif	/* USE_SELECT */
414
415
416
417
418
419
	unsigned int		maxsocks;
#ifdef ISC_PLATFORM_USETHREADS
	int			pipe_fds[2];
#endif

	/* Locked by fdlock. */
420
	isc__socket_t	       **fds;
421
	int			*fdstate;
422
423
424
#if defined(USE_EPOLL)
	uint32_t		*epoll_events;
#endif
425
426
427
428
#ifdef USE_DEVPOLL
	pollinfo_t		*fdpollinfo;
#endif

429
	/* Locked by manager lock. */
430
	ISC_LIST(isc__socket_t)	socklist;
431
#ifdef USE_SELECT
432
433
434
435
	fd_set			*read_fds;
	fd_set			*read_fds_copy;
	fd_set			*write_fds;
	fd_set			*write_fds_copy;
436
	int			maxfd;
437
#endif	/* USE_SELECT */
438
	int			reserved;	/* unlocked */
439
#ifdef USE_WATCHER_THREAD
440
441
	isc_thread_t		watcher;
	isc_condition_t		shutdown_ok;
442
#else /* USE_WATCHER_THREAD */
443
	unsigned int		refs;
444
#endif /* USE_WATCHER_THREAD */
445
	int			maxudp;
446
447
};

448
449
450
#ifdef USE_SHARED_MANAGER
static isc__socketmgr_t *socketmgr = NULL;
#endif /* USE_SHARED_MANAGER */
451

452
453
454
#define CLOSED			0	/* this one must be zero */
#define MANAGED			1
#define CLOSE_PENDING		2
Michael Graff's avatar
Michael Graff committed
455

456
457
458
459
460
461
462
463
464
465
/*
 * send() and recv() iovec counts
 */
#define MAXSCATTERGATHER_SEND	(ISC_SOCKET_MAXSCATTERGATHER)
#ifdef ISC_NET_RECVOVERFLOW
# define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER + 1)
#else
# define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER)
#endif

466
467
468
469
static isc_result_t socket_create(isc_socketmgr_t *manager0, int pf,
				  isc_sockettype_t type,
				  isc_socket_t **socketp,
				  isc_socket_t *dup_socket);
470
471
static void send_recvdone_event(isc__socket_t *, isc_socketevent_t **);
static void send_senddone_event(isc__socket_t *, isc_socketevent_t **);
472
static void send_connectdone_event(isc__socket_t *, isc_socket_connev_t **);
473
474
475
476
static void free_socket(isc__socket_t **);
static isc_result_t allocate_socket(isc__socketmgr_t *, isc_sockettype_t,
				    isc__socket_t **);
static void destroy(isc__socket_t **);
477
478
479
480
static void internal_accept(isc_task_t *, isc_event_t *);
static void internal_connect(isc_task_t *, isc_event_t *);
static void internal_recv(isc_task_t *, isc_event_t *);
static void internal_send(isc_task_t *, isc_event_t *);
481
482
static void internal_fdwatch_write(isc_task_t *, isc_event_t *);
static void internal_fdwatch_read(isc_task_t *, isc_event_t *);
483
484
static void process_cmsg(isc__socket_t *, struct msghdr *, isc_socketevent_t *);
static void build_msghdr_send(isc__socket_t *, isc_socketevent_t *,
485
			      struct msghdr *, struct iovec *, size_t *);
486
static void build_msghdr_recv(isc__socket_t *, isc_socketevent_t *,
487
			      struct msghdr *, struct iovec *, size_t *);
488
489
490
#ifdef USE_WATCHER_THREAD
static isc_boolean_t process_ctlfd(isc__socketmgr_t *manager);
#endif
Evan Hunt's avatar
Evan Hunt committed
491
static void setdscp(isc__socket_t *sock, isc_dscp_t dscp);
492
493

/*%
494
495
496
 * The following are intended for internal use (indicated by "isc__"
 * prefix) but are not declared as static, allowing direct access from
 * unit tests etc.
497
498
 */

499
500
501
502
503
isc_result_t
isc__socket_open(isc_socket_t *sock0);
isc_result_t
isc__socket_close(isc_socket_t *sock0);
isc_result_t
504
505
isc__socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
		   isc_socket_t **socketp);
506
void
507
isc__socket_attach(isc_socket_t *sock, isc_socket_t **socketp);
508
void
509
isc__socket_detach(isc_socket_t **socketp);
510
isc_result_t
511
512
isc__socket_recvv(isc_socket_t *sock, isc_bufferlist_t *buflist,
		 unsigned int minimum, isc_task_t *task,
Mark Andrews's avatar
Mark Andrews committed
513
		  isc_taskaction_t action, void *arg);
514
isc_result_t
515
516
isc__socket_recv(isc_socket_t *sock, isc_region_t *region,
		 unsigned int minimum, isc_task_t *task,
Mark Andrews's avatar
Mark Andrews committed
517
		 isc_taskaction_t action, void *arg);
518
isc_result_t
519
520
521
isc__socket_recv2(isc_socket_t *sock, isc_region_t *region,
		  unsigned int minimum, isc_task_t *task,
		  isc_socketevent_t *event, unsigned int flags);
522
isc_result_t
523
isc__socket_send(isc_socket_t *sock, isc_region_t *region,
Mark Andrews's avatar
Mark Andrews committed
524
		 isc_task_t *task, isc_taskaction_t action, void *arg);
525
isc_result_t
526
isc__socket_sendto(isc_socket_t *sock, isc_region_t *region,
Mark Andrews's avatar
Mark Andrews committed
527
		   isc_task_t *task, isc_taskaction_t action, void *arg,
528
		   isc_sockaddr_t *address, struct in6_pktinfo *pktinfo);
529
isc_result_t
530
isc__socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
Mark Andrews's avatar
Mark Andrews committed
531
		  isc_task_t *task, isc_taskaction_t action, void *arg);
532
isc_result_t
533
isc__socket_sendtov(isc_socket_t *sock, isc_bufferlist_t *buflist,
Mark Andrews's avatar
Mark Andrews committed
534
		    isc_task_t *task, isc_taskaction_t action, void *arg,
535
		    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo);
536
isc_result_t
537
isc__socket_sendtov2(isc_socket_t *sock, isc_bufferlist_t *buflist,
Mark Andrews's avatar
Mark Andrews committed
538
		     isc_task_t *task, isc_taskaction_t action, void *arg,
539
540
541
		     isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
		     unsigned int flags);
isc_result_t
542
543
544
545
isc__socket_sendto2(isc_socket_t *sock, isc_region_t *region,
		    isc_task_t *task,
		    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
		    isc_socketevent_t *event, unsigned int flags);
Evan Hunt's avatar
Evan Hunt committed
546
547
548
isc_socketevent_t *
isc_socket_socketevent(isc_mem_t *mctx, void *sender,
		       isc_eventtype_t eventtype, isc_taskaction_t action,
Mark Andrews's avatar
Mark Andrews committed
549
		       void *arg);
Evan Hunt's avatar
Evan Hunt committed
550

551
void
552
isc__socket_cleanunix(isc_sockaddr_t *sockaddr, isc_boolean_t active);
553
isc_result_t
554
555
isc__socket_permunix(isc_sockaddr_t *sockaddr, isc_uint32_t perm,
		     isc_uint32_t owner, isc_uint32_t group);
556
isc_result_t
557
558
isc__socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr,
		 unsigned int options);
559
isc_result_t
560
isc__socket_filter(isc_socket_t *sock, const char *filter);
561
isc_result_t
562
isc__socket_listen(isc_socket_t *sock, unsigned int backlog);
563
isc_result_t
564
isc__socket_accept(isc_socket_t *sock,
Mark Andrews's avatar
Mark Andrews committed
565
		   isc_task_t *task, isc_taskaction_t action, void *arg);
566
isc_result_t
567
568
isc__socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr,
		    isc_task_t *task, isc_taskaction_t action,
Mark Andrews's avatar
Mark Andrews committed
569
		    void *arg);
570
isc_result_t
571
isc__socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp);
572
isc_result_t
573
isc__socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp);
574
void
575
isc__socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how);
576
isc_sockettype_t
577
isc__socket_gettype(isc_socket_t *sock);
578
isc_boolean_t
579
isc__socket_isbound(isc_socket_t *sock);
580
void
581
isc__socket_ipv6only(isc_socket_t *sock, isc_boolean_t yes);
582
void
Evan Hunt's avatar
Evan Hunt committed
583
isc__socket_dscp(isc_socket_t *sock, isc_dscp_t dscp);
584
isc_result_t
585
586
587
isc__socket_fdwatchcreate(isc_socketmgr_t *manager, int fd, int flags,
			  isc_sockfdwatch_t callback, void *cbarg,
			  isc_task_t *task, isc_socket_t **socketp);
588
isc_result_t
589
isc__socket_fdwatchpoke(isc_socket_t *sock, int flags);
590
isc_result_t
591
isc__socket_dup(isc_socket_t *sock, isc_socket_t **socketp);
592
int
Mark Andrews's avatar
Mark Andrews committed
593
isc__socket_getfd(isc_socket_t *sock);
594

595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
isc_result_t
isc__socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp);
isc_result_t
isc__socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp,
		       unsigned int maxsocks);
isc_result_t
isc_socketmgr_getmaxsockets(isc_socketmgr_t *manager0, unsigned int *nsockp);
void
isc_socketmgr_setstats(isc_socketmgr_t *manager0, isc_stats_t *stats);
void
isc__socketmgr_destroy(isc_socketmgr_t **managerp);
void
isc__socket_setname(isc_socket_t *socket0, const char *name, void *tag);
const char *
isc__socket_getname(isc_socket_t *socket0);
void *
isc__socket_gettag(isc_socket_t *socket0);

#ifdef HAVE_LIBXML2
void
isc__socketmgr_renderxml(isc_socketmgr_t *mgr0, xmlTextWriterPtr writer);
#endif
#ifdef HAVE_JSON
isc_result_t
isc__socketmgr_renderjson(isc_socketmgr_t *mgr0, json_object *stats);
#endif

622
623
624
625
626
627
628
static struct {
	isc_socketmethods_t methods;

	/*%
	 * The following are defined just for avoiding unused static functions.
	 */
	void *recvv, *send, *sendv, *sendto2, *cleanunix, *permunix, *filter,
629
	     *listen, *accept, *getpeername, *isbound;
630
631
632
633
634
635
} socketmethods = {
	{
		isc__socket_attach,
		isc__socket_detach,
		isc__socket_bind,
		isc__socket_sendto,
636
		isc__socket_sendto2,
637
638
		isc__socket_connect,
		isc__socket_recv,
639
		isc__socket_recv2,
640
641
642
		isc__socket_cancel,
		isc__socket_getsockname,
		isc__socket_gettype,
643
		isc__socket_ipv6only,
644
		isc__socket_fdwatchpoke,
Mark Andrews's avatar
Mark Andrews committed
645
		isc__socket_dup,
Evan Hunt's avatar
Evan Hunt committed
646
647
		isc__socket_getfd,
		isc__socket_dscp
648
	},
649
650
651
652
653
654
	(void *)isc__socket_recvv, (void *)isc__socket_send,
	(void *)isc__socket_sendv, (void *)isc__socket_sendto2,
	(void *)isc__socket_cleanunix, (void *)isc__socket_permunix,
	(void *)isc__socket_filter, (void *)isc__socket_listen,
	(void *)isc__socket_accept, (void *)isc__socket_getpeername,
	(void *)isc__socket_isbound
655
656
657
658
};

static isc_socketmgrmethods_t socketmgrmethods = {
	isc__socketmgr_destroy,
659
660
	isc__socket_create,
	isc__socket_fdwatchcreate
661
662
};

Michael Graff's avatar
Michael Graff committed
663
664
#define SELECT_POKE_SHUTDOWN		(-1)
#define SELECT_POKE_NOTHING		(-2)
665
#define SELECT_POKE_READ		(-3)
666
#define SELECT_POKE_ACCEPT		(-3) /*%< Same as _READ */
667
#define SELECT_POKE_WRITE		(-4)
668
#define SELECT_POKE_CONNECT		(-4) /*%< Same as _WRITE */
669
#define SELECT_POKE_CLOSE		(-5)
670

671
672
#define SOCK_DEAD(s)			((s)->references == 0)

673
674
675
676
677
678
679
680
681
682
683
684
685
/*%
 * Shortcut index arrays to get access to statistics counters.
 *
 * Each STATID_* value is a position in the per-socket-type *statsindex[]
 * tables that follow; those tables are positional, so the numbering here and
 * the row order there must stay in step.
 */
enum {
	STATID_OPEN = 0,
	STATID_OPENFAIL = 1,
	STATID_CLOSE = 2,
	STATID_BINDFAIL = 3,
	STATID_CONNECTFAIL = 4,
	STATID_CONNECT = 5,
	STATID_ACCEPTFAIL = 6,
	STATID_ACCEPT = 7,
	STATID_SENDFAIL = 8,
	STATID_RECVFAIL = 9,
	STATID_ACTIVE = 10
};
Mark Andrews's avatar
Mark Andrews committed
689
static const isc_statscounter_t udp4statsindex[] = {
690
691
692
693
694
695
696
697
698
	isc_sockstatscounter_udp4open,
	isc_sockstatscounter_udp4openfail,
	isc_sockstatscounter_udp4close,
	isc_sockstatscounter_udp4bindfail,
	isc_sockstatscounter_udp4connectfail,
	isc_sockstatscounter_udp4connect,
	-1,
	-1,
	isc_sockstatscounter_udp4sendfail,
699
700
	isc_sockstatscounter_udp4recvfail,
	isc_sockstatscounter_udp4active
701
};
Mark Andrews's avatar
Mark Andrews committed
702
static const isc_statscounter_t udp6statsindex[] = {
703
704
705
706
707
708
709
710
711
	isc_sockstatscounter_udp6open,
	isc_sockstatscounter_udp6openfail,
	isc_sockstatscounter_udp6close,
	isc_sockstatscounter_udp6bindfail,
	isc_sockstatscounter_udp6connectfail,
	isc_sockstatscounter_udp6connect,
	-1,
	-1,
	isc_sockstatscounter_udp6sendfail,
712
713
	isc_sockstatscounter_udp6recvfail,
	isc_sockstatscounter_udp6active
714
715
716
717
718
719
720
721
722
723
724
};
static const isc_statscounter_t tcp4statsindex[] = {
	isc_sockstatscounter_tcp4open,
	isc_sockstatscounter_tcp4openfail,
	isc_sockstatscounter_tcp4close,
	isc_sockstatscounter_tcp4bindfail,
	isc_sockstatscounter_tcp4connectfail,
	isc_sockstatscounter_tcp4connect,
	isc_sockstatscounter_tcp4acceptfail,
	isc_sockstatscounter_tcp4accept,
	isc_sockstatscounter_tcp4sendfail,
725
726
	isc_sockstatscounter_tcp4recvfail,
	isc_sockstatscounter_tcp4active
727
728
729
730
731
732
733
734
735
736
737
};
static const isc_statscounter_t tcp6statsindex[] = {
	isc_sockstatscounter_tcp6open,
	isc_sockstatscounter_tcp6openfail,
	isc_sockstatscounter_tcp6close,
	isc_sockstatscounter_tcp6bindfail,
	isc_sockstatscounter_tcp6connectfail,
	isc_sockstatscounter_tcp6connect,
	isc_sockstatscounter_tcp6acceptfail,
	isc_sockstatscounter_tcp6accept,
	isc_sockstatscounter_tcp6sendfail,
738
739
	isc_sockstatscounter_tcp6recvfail,
	isc_sockstatscounter_tcp6active
740
741
742
743
744
745
746
747
748
749
750
};
static const isc_statscounter_t unixstatsindex[] = {
	isc_sockstatscounter_unixopen,
	isc_sockstatscounter_unixopenfail,
	isc_sockstatscounter_unixclose,
	isc_sockstatscounter_unixbindfail,
	isc_sockstatscounter_unixconnectfail,
	isc_sockstatscounter_unixconnect,
	isc_sockstatscounter_unixacceptfail,
	isc_sockstatscounter_unixaccept,
	isc_sockstatscounter_unixsendfail,
751
752
	isc_sockstatscounter_unixrecvfail,
	isc_sockstatscounter_unixactive
753
754
755
756
757
758
759
760
761
762
763
};
static const isc_statscounter_t fdwatchstatsindex[] = {
	-1,
	-1,
	isc_sockstatscounter_fdwatchclose,
	isc_sockstatscounter_fdwatchbindfail,
	isc_sockstatscounter_fdwatchconnectfail,
	isc_sockstatscounter_fdwatchconnect,
	-1,
	-1,
	isc_sockstatscounter_fdwatchsendfail,
764
765
	isc_sockstatscounter_fdwatchrecvfail,
	-1
766
};
767
768
769
770
771
772
773
774
775
776
777
778
779
/* Statistics counters for raw sockets, indexed by STATID_*. */
static const isc_statscounter_t rawstatsindex[] = {
	isc_sockstatscounter_rawopen,		/* STATID_OPEN */
	isc_sockstatscounter_rawopenfail,	/* STATID_OPENFAIL */
	isc_sockstatscounter_rawclose,		/* STATID_CLOSE */
	-1,					/* STATID_BINDFAIL: none */
	-1,					/* STATID_CONNECTFAIL: none */
	-1,					/* STATID_CONNECT: none */
	-1,					/* STATID_ACCEPTFAIL: none */
	-1,					/* STATID_ACCEPT: none */
	-1,					/* STATID_SENDFAIL: none */
	isc_sockstatscounter_rawrecvfail,	/* STATID_RECVFAIL */
	isc_sockstatscounter_rawactive		/* STATID_ACTIVE */
};
780

781
782
#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) || \
    defined(USE_WATCHER_THREAD)
783
static void
784
manager_log(isc__socketmgr_t *sockmgr,
785
786
	    isc_logcategory_t *category, isc_logmodule_t *module, int level,
	    const char *fmt, ...) ISC_FORMAT_PRINTF(5, 6);
Michael Graff's avatar
Michael Graff committed
787
static void
788
manager_log(isc__socketmgr_t *sockmgr,
Michael Graff's avatar
Michael Graff committed
789
790
791
792
793
794
	    isc_logcategory_t *category, isc_logmodule_t *module, int level,
	    const char *fmt, ...)
{
	char msgbuf[2048];
	va_list ap;

795
796
797
	if (! isc_log_wouldlog(isc_lctx, level))
		return;

Michael Graff's avatar
Michael Graff committed
798
799
800
801
802
803
804
	va_start(ap, fmt);
	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
	va_end(ap);

	isc_log_write(isc_lctx, category, module, level,
		      "sockmgr %p: %s", sockmgr, msgbuf);
}
805
#endif
Michael Graff's avatar
Michael Graff committed
806

807
static void
808
socket_log(isc__socket_t *sock, isc_sockaddr_t *address,
809
810
811
	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
	   isc_msgcat_t *msgcat, int msgset, int message,
	   const char *fmt, ...) ISC_FORMAT_PRINTF(9, 10);
Michael Graff's avatar
Michael Graff committed
812
static void
813
socket_log(isc__socket_t *sock, isc_sockaddr_t *address,
Michael Graff's avatar
Michael Graff committed
814
	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
815
	   isc_msgcat_t *msgcat, int msgset, int message,
Michael Graff's avatar
Michael Graff committed
816
817
818
	   const char *fmt, ...)
{
	char msgbuf[2048];
819
	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
Michael Graff's avatar
Michael Graff committed
820
821
	va_list ap;

822
823
824
	if (! isc_log_wouldlog(isc_lctx, level))
		return;

Michael Graff's avatar
Michael Graff committed
825
826
827
828
829
	va_start(ap, fmt);
	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
	va_end(ap);

	if (address == NULL) {
830
831
832
		isc_log_iwrite(isc_lctx, category, module, level,
			       msgcat, msgset, message,
			       "socket %p: %s", sock, msgbuf);
Michael Graff's avatar
Michael Graff committed
833
	} else {
Andreas Gustafsson's avatar
Andreas Gustafsson committed
834
		isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
835
836
837
		isc_log_iwrite(isc_lctx, category, module, level,
			       msgcat, msgset, message,
			       "socket %p %s: %s", sock, peerbuf, msgbuf);
Michael Graff's avatar
Michael Graff committed
838
839
840
	}
}

841
842
843
844
845
846
847
#if defined(_AIX) && defined(ISC_NET_BSD44MSGHDR) && \
    defined(USE_CMSG) && defined(IPV6_RECVPKTINFO)
/*
 * AIX has a kernel bug where IPV6_RECVPKTINFO gets cleared by
 * setting IPV6_V6ONLY.
 */
static void
848
FIX_IPV6_RECVPKTINFO(isc__socket_t *sock)
849
850
851
852
853
854
855
856
857
{
	char strbuf[ISC_STRERRORSIZE];
	int on = 1;

	if (sock->pf != AF_INET6 || sock->type != isc_sockettype_udp)
		return;

	if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
		       (void *)&on, sizeof(on)) < 0) {
Automatic Updater's avatar
Automatic Updater committed
858

859
		isc__strerror(errno, strbuf, sizeof(strbuf));
860
861
862
863
864
865
866
867
868
869
870
871
872
873
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, IPV6_RECVPKTINFO) "
				 "%s: %s", sock->fd,
				 isc_msgcat_get(isc_msgcat,
						ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED,
						"failed"),
				 strbuf);
	}
}
#else
#define FIX_IPV6_RECVPKTINFO(sock) (void)0
#endif

874
875
876
877
878
879
880
881
882
883
884
/*%
 * Bump the socket statistics counter 'counterid' by one.
 *
 * 'counterid' must not be -1, the placeholder used in the *statsindex[]
 * tables for counters that do not exist.  A NULL 'stats' makes this a no-op.
 */
static inline void
inc_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
	REQUIRE(counterid != -1);

	if (stats == NULL)
		return;

	isc_stats_increment(stats, counterid);
}

885
886
887
888
889
890
891
892
893
894
895
/*%
 * Decrement the socket-related statistics counter 'counterid' in 'stats'
 * by calling isc_stats_decrement().
 *
 * 'counterid' must not be -1 (enforced with REQUIRE).  A NULL 'stats'
 * is accepted and makes the call a no-op.
 */
static inline void
dec_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
	REQUIRE(counterid != -1);

	if (stats == NULL) {
		return;
	}

	isc_stats_decrement(stats, counterid);
}

896
static inline isc_result_t
897
watch_fd(isc__socketmgr_t *manager, int fd, int msg) {
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
	isc_result_t result = ISC_R_SUCCESS;

#ifdef USE_KQUEUE
	struct kevent evchange;

	memset(&evchange, 0, sizeof(evchange));
	if (msg == SELECT_POKE_READ)
		evchange.filter = EVFILT_READ;
	else
		evchange.filter = EVFILT_WRITE;
	evchange.flags = EV_ADD;
	evchange.ident = fd;
	if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
		result = isc__errno2result(errno);

	return (result);
#elif defined(USE_EPOLL)
	struct epoll_event event;
916
917
918
	uint32_t oldevents;
	int ret;
	int op;
919

920
	oldevents = manager->epoll_events[fd];
921
	if (msg == SELECT_POKE_READ)
922
		manager->epoll_events[fd] |= EPOLLIN;
923
	else
924
925
926
		manager->epoll_events[fd] |= EPOLLOUT;

	event.events = manager->epoll_events[fd];
927
	memset(&event.data, 0, sizeof(event.data));
928
	event.data.fd = fd;
929
930
931
932
933
934
935
936

	op = (oldevents == 0U) ? EPOLL_CTL_ADD : EPOLL_CTL_MOD;
	ret = epoll_ctl(manager->epoll_fd, op, fd, &event);
	if (ret == -1) {
		if (errno == EEXIST)
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "epoll_ctl(ADD/MOD) returned "
					 "EEXIST for fd %d", fd);
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
		result = isc__errno2result(errno);
	}

	return (result);
#elif defined(USE_DEVPOLL)
	struct pollfd pfd;
	int lockid = FDLOCK_ID(fd);

	memset(&pfd, 0, sizeof(pfd));
	if (msg == SELECT_POKE_READ)
		pfd.events = POLLIN;
	else
		pfd.events = POLLOUT;
	pfd.fd = fd;
	pfd.revents = 0;
	LOCK(&manager->fdlock[lockid]);
	if (write(manager->devpoll_fd, &pfd, sizeof(pfd)) == -1)
		result = isc__errno2result(errno);
	else {
		if (msg == SELECT_POKE_READ)
			manager->fdpollinfo[fd].want_read = 1;
		else
			manager->fdpollinfo[fd].want_write = 1;
	}
	UNLOCK(&manager->fdlock[lockid]);

	return (result);
#elif defined(USE_SELECT)
	LOCK(&manager->lock);
	if (msg == SELECT_POKE_READ)
967
		FD_SET(fd, manager->read_fds);
968
	if (msg == SELECT_POKE_WRITE)
969
		FD_SET(fd, manager->write_fds);
970
971
972
973
974
975
976
	UNLOCK(&manager->lock);

	return (result);
#endif
}

static inline isc_result_t
977
unwatch_fd(isc__socketmgr_t *manager, int fd, int msg) {
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
	isc_result_t result = ISC_R_SUCCESS;

#ifdef USE_KQUEUE
	struct kevent evchange;

	memset(&evchange, 0, sizeof(evchange));
	if (msg == SELECT_POKE_READ)
		evchange.filter = EVFILT_READ;
	else
		evchange.filter = EVFILT_WRITE;
	evchange.flags = EV_DELETE;
	evchange.ident = fd;
	if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
		result = isc__errno2result(errno);

	return (result);
#elif defined(USE_EPOLL)
	struct epoll_event event;
996
997
	int ret;
	int op;
998
999

	if (msg == SELECT_POKE_READ)
1000
		manager->epoll_events[fd] &= ~(EPOLLIN);
1001
	else
1002
1003
1004
		manager->epoll_events[fd] &= ~(EPOLLOUT);

	event.events = manager->epoll_events[fd];
1005
	memset(&event.data, 0, sizeof(event.data));
1006
	event.data.fd = fd;
1007
1008
1009
1010

	op = (event.events == 0U) ? EPOLL_CTL_DEL : EPOLL_CTL_MOD;
	ret = epoll_ctl(manager->epoll_fd, op, fd, &event);
	if (ret == -1 && errno != ENOENT) {
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
		char strbuf[ISC_STRERRORSIZE];
		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "epoll_ctl(DEL), %d: %s", fd, strbuf);
		result = ISC_R_UNEXPECTED;
	}
	return (result);
#elif defined(USE_DEVPOLL)
	struct pollfd pfds[2];
	size_t writelen = sizeof(pfds[0]);
	int lockid = FDLOCK_ID(fd);

	memset(pfds, 0, sizeof(pfds));
	pfds[0].events = POLLREMOVE;
	pfds[0].fd = fd;

	/*
	 * Canceling read or write polling via /dev/poll is tricky.  Since it
	 * only provides a way of canceling per FD, we may need to re-poll the
	 * socket for the other operation.
	 */
	LOCK(&manager->fdlock[lockid]);
	if (msg == SELECT_POKE_READ &&
	    manager->fdpollinfo[fd].want_write == 1) {
		pfds[1].events = POLLOUT;
		pfds[1].fd = fd;
		writelen += sizeof(pfds[1]);
	}
	if (msg == SELECT_POKE_WRITE &&
	    manager->fdpollinfo[fd].want_read == 1) {
		pfds[1].events = POLLIN;
		pfds[1].fd = fd;
		writelen += sizeof(pfds[1]);
	}

Automatic Updater's avatar
Automatic Updater committed
1046
	if (write(manager->devpoll_fd, pfds, writelen) == -1)
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
		result = isc__errno2result(errno);
	else {
		if (msg == SELECT_POKE_READ)
			manager->fdpollinfo[fd].want_read = 0;
		else
			manager->fdpollinfo[fd].want_write = 0;
	}
	UNLOCK(&manager->fdlock[lockid]);

	return (result);
#elif defined(USE_SELECT)
	LOCK(&manager->lock);
	if (msg == SELECT_POKE_READ)
1060
		FD_CLR(fd, manager->read_fds);
1061
	else if (msg == SELECT_POKE_WRITE)
1062
		FD_CLR(fd, manager->write_fds);
1063
1064
1065
1066
1067
1068
	UNLOCK(&manager->lock);

	return (result);
#endif
}

1069
static void
1070
wakeup_socket(isc__socketmgr_t *manager, int fd, int msg) {
1071
1072
	isc_result_t result;
	int lockid = FDLOCK_ID(fd);
1073
1074

	/*
1075
1076
1077
	 * This is a wakeup on a socket.  If the socket is not in the
	 * process of being closed, start watching it for either reads
	 * or writes.
1078
	 */
Andreas Gustafsson's avatar
   
Andreas Gustafsson committed
1079

1080
	INSIST(fd >= 0 && fd < (int)manager->maxsocks);
1081

1082
1083
1084
	if (msg == SELECT_POKE_CLOSE) {
		/* No one should be updating fdstate, so no need to lock it */
		INSIST(manager->fdstate[fd] == CLOSE_PENDING);
Michael Graff's avatar
Michael Graff committed
1085
		manager->fdstate[fd] = CLOSED;
1086
1087
1088
1089
1090
1091
1092
1093
		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
		(void)close(fd);
		return;
	}

	LOCK(&manager->fdlock[lockid]);
	if (manager->fdstate[fd] == CLOSE_PENDING) {
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
		UNLOCK(&manager->fdlock[lockid]);

		/*
		 * We accept (and ignore) any error from unwatch_fd() as we are
		 * closing the socket, hoping it doesn't leave dangling state in
		 * the kernel.
		 * Note that unwatch_fd() must be called after releasing the
		 * fdlock; otherwise it could cause deadlock due to a lock order
		 * reversal.
		 */
		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);