socket.c 166 KB
Newer Older
Bob Halley's avatar
Bob Halley committed
1
/*
Mark Andrews's avatar
Mark Andrews committed
2
 * Copyright (C) 2004-2013  Internet Systems Consortium, Inc. ("ISC")
Mark Andrews's avatar
Mark Andrews committed
3
 * Copyright (C) 1998-2003  Internet Software Consortium.
4
 *
Automatic Updater's avatar
Automatic Updater committed
5
 * Permission to use, copy, modify, and/or distribute this software for any
Bob Halley's avatar
Bob Halley committed
6
7
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
8
 *
Mark Andrews's avatar
Mark Andrews committed
9
10
11
12
13
14
15
 * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
 * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
 * PERFORMANCE OF THIS SOFTWARE.
Bob Halley's avatar
Bob Halley committed
16
 */
Bob Halley's avatar
Bob Halley committed
17

Evan Hunt's avatar
Evan Hunt committed
18
/* $Id$ */
19
20

/*! \file */
David Lawrence's avatar
David Lawrence committed
21

Bob Halley's avatar
Bob Halley committed
22
#include <config.h>
23

24
#include <sys/param.h>
Michael Graff's avatar
Michael Graff committed
25
#include <sys/types.h>
Michael Graff's avatar
Michael Graff committed
26
#include <sys/socket.h>
27
#include <sys/stat.h>
Michael Graff's avatar
Michael Graff committed
28
#include <sys/time.h>
Michael Graff's avatar
Michael Graff committed
29
30
#include <sys/uio.h>

31
#include <errno.h>
Andreas Gustafsson's avatar
Andreas Gustafsson committed
32
#include <fcntl.h>
33
34
35
36
37
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

38
#include <isc/buffer.h>
39
#include <isc/bufferlist.h>
40
#include <isc/condition.h>
41
#include <isc/formatcheck.h>
42
#include <isc/json.h>
43
#include <isc/list.h>
Michael Graff's avatar
Michael Graff committed
44
#include <isc/log.h>
45
#include <isc/mem.h>
46
#include <isc/msgs.h>
47
#include <isc/mutex.h>
48
#include <isc/net.h>
49
#include <isc/once.h>
50
#include <isc/platform.h>
Michael Graff's avatar
Michael Graff committed
51
#include <isc/print.h>
52
#include <isc/region.h>
53
#include <isc/socket.h>
54
#include <isc/stats.h>
55
#include <isc/strerror.h>
56
#include <isc/task.h>
57
#include <isc/thread.h>
Michael Graff's avatar
Michael Graff committed
58
#include <isc/util.h>
59
#include <isc/xml.h>
Bob Halley's avatar
Bob Halley committed
60

61
62
63
64
65
66
67
68
69
70
#ifdef ISC_PLATFORM_HAVESYSUNH
#include <sys/un.h>
#endif
#ifdef ISC_PLATFORM_HAVEKQUEUE
#include <sys/event.h>
#endif
#ifdef ISC_PLATFORM_HAVEEPOLL
#include <sys/epoll.h>
#endif
#ifdef ISC_PLATFORM_HAVEDEVPOLL
71
#if defined(HAVE_SYS_DEVPOLL_H)
72
#include <sys/devpoll.h>
73
74
75
#elif defined(HAVE_DEVPOLL_H)
#include <devpoll.h>
#endif
76
77
#endif

78
79
#include "errno2result.h"

80
81
82
83
84
85
86
87
88
89
/* See task.c about the following definition: */
#ifdef BIND9
#ifdef ISC_PLATFORM_USETHREADS
#define USE_WATCHER_THREAD
#else
#define USE_SHARED_MANAGER
#endif	/* ISC_PLATFORM_USETHREADS */
#endif	/* BIND9 */

#ifndef USE_WATCHER_THREAD
90
#include "socket_p.h"
91
#include "../task_p.h"
92
#endif /* USE_WATCHER_THREAD */
93

94
95
96
97
#if defined(SO_BSDCOMPAT) && defined(__linux__)
#include <sys/utsname.h>
#endif

98
/*%
Automatic Updater's avatar
Automatic Updater committed
99
 * Choose the most preferable multiplex method.
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
 */
#ifdef ISC_PLATFORM_HAVEKQUEUE
#define USE_KQUEUE
#elif defined (ISC_PLATFORM_HAVEEPOLL)
#define USE_EPOLL
#elif defined (ISC_PLATFORM_HAVEDEVPOLL)
#define USE_DEVPOLL
/*%
 * Per-descriptor flags recording which kinds of events (read and/or write)
 * have been requested from /dev/poll for that descriptor.
 */
typedef struct {
	unsigned int want_read : 1;
	unsigned int want_write : 1;
} pollinfo_t;
#else
#define USE_SELECT
#endif	/* ISC_PLATFORM_HAVEKQUEUE */

115
#ifndef USE_WATCHER_THREAD
116
117
118
119
120
121
#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
struct isc_socketwait {
	int nevents;
};
#elif defined (USE_SELECT)
struct isc_socketwait {
122
123
	fd_set *readset;
	fd_set *writeset;
124
125
126
127
	int nfds;
	int maxfd;
};
#endif	/* USE_KQUEUE */
128
#endif /* !USE_WATCHER_THREAD */
129

Mark Andrews's avatar
Mark Andrews committed
130
131
132
133
134
135
136
/*
 * Set by the -T dscp option on the command line. If set to a value
 * other than -1, we check to make sure DSCP values match it, and
 * assert if not.
 */
int isc_dscp_check_value = -1;

137
138
/*%
 * Maximum number of allowable open sockets.  This is also the maximum
139
140
141
142
143
144
 * allowable socket file descriptor.
 *
 * Care should be taken before modifying this value for select():
 * The API standard doesn't ensure select() accepts more than (the system default
 * of) FD_SETSIZE descriptors, and the default size should in fact be fine in
 * the vast majority of cases.  This constant should therefore be increased only
Automatic Updater's avatar
Automatic Updater committed
145
 * when absolutely necessary and possible, i.e., the server is exhausting all
146
147
148
149
150
151
152
153
154
 * available file descriptors (up to FD_SETSIZE) and the select() function
 * and FD_xxx macros support larger values than FD_SETSIZE (which may not
 * always be true, but we keep using some of them to ensure as much
 * portability as possible).  Note also that overall server performance
 * may be rather worsened with a larger value of this constant due to
 * inherent scalability problems of select().
 *
 * As a special note, this value shouldn't have to be touched if
 * this is a build for an authoritative only DNS server.
155
156
 */
#ifndef ISC_SOCKET_MAXSOCKETS
157
#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
158
#define ISC_SOCKET_MAXSOCKETS 4096
159
160
161
162
163
164
165
#elif defined(USE_SELECT)
#define ISC_SOCKET_MAXSOCKETS FD_SETSIZE
#endif	/* USE_KQUEUE... */
#endif	/* ISC_SOCKET_MAXSOCKETS */

#ifdef USE_SELECT
/*%
166
167
 * Mac OS X needs a special definition to support larger values in select().
 * We always define this because a larger value can be specified run-time.
168
169
170
171
172
173
 */
#ifdef __APPLE__
#define _DARWIN_UNLIMITED_SELECT
#endif	/* __APPLE__ */
#endif	/* USE_SELECT */

174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
#ifdef ISC_SOCKET_USE_POLLWATCH
/*%
 * If this macro is defined, enable workaround for a Solaris /dev/poll kernel
 * bug: DP_POLL ioctl could keep sleeping even if socket I/O is possible for
 * some of the specified FD.  The idea is based on the observation that it's
 * likely for a busy server to keep receiving packets.  It specifically works
 * as follows: the socket watcher is first initialized with the state of
 * "poll_idle".  While it's in the idle state it keeps sleeping until a socket
 * event occurs.  When it wakes up for a socket I/O event, it moves to the
 * poll_active state, and sets the poll timeout to a short period
 * (ISC_SOCKET_POLLWATCH_TIMEOUT msec).  If timeout occurs in this state, the
 * watcher goes to the poll_checking state with the same timeout period.
 * In this state, the watcher tries to detect whether this is a break
 * during intermittent events or the kernel bug is triggered.  If the next
 * polling reports an event within the short period, the previous timeout is
 * likely to be a kernel bug, and so the watcher goes back to the active state.
 * Otherwise, it moves to the idle state again.
 *
 * It's not clear whether this is a thread-related bug, but since we've only
 * seen this with threads, this workaround is used only when enabling threads.
 */

typedef enum { poll_idle, poll_active, poll_checking } pollstate_t;

#ifndef ISC_SOCKET_POLLWATCH_TIMEOUT
#define ISC_SOCKET_POLLWATCH_TIMEOUT 10
#endif	/* ISC_SOCKET_POLLWATCH_TIMEOUT */
#endif	/* ISC_SOCKET_USE_POLLWATCH */

203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
/*%
 * Number of per-FD lock buckets, and the mapping from a file descriptor to
 * the index of its bucket.  Builds without thread support use a single
 * bucket for every descriptor.
 */
#ifndef ISC_PLATFORM_USETHREADS
#define FDLOCK_COUNT		1
#define FDLOCK_ID(fd)		0
#else
#define FDLOCK_COUNT		1024
#define FDLOCK_ID(fd)		((fd) % FDLOCK_COUNT)
#endif	/* !ISC_PLATFORM_USETHREADS */

/*%
 * Maximum number of events communicated with the kernel.  There should normally
 * be no need for having a large number.
 */
#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
#ifndef ISC_SOCKET_MAXEVENTS
#define ISC_SOCKET_MAXEVENTS	64
#endif
#endif

224
/*%
225
 * Some systems define the socket length argument as an int, some as size_t,
226
 * some as socklen_t.  This is here so it can be easily changed if needed.
227
 */
228
#ifndef ISC_SOCKADDR_LEN_T
229
#define ISC_SOCKADDR_LEN_T unsigned int
230
#endif
231

232
/*%
233
234
 * Define what the possible "soft" errors can be.  These are non-fatal returns
 * of various network related functions, like recv() and so on.
235
236
237
238
 *
 * For some reason, BSDI (and perhaps others) will sometimes return <0
 * from recv() but will have errno==0.  This is broken, but we have to
 * work around it here.
239
 */
240
241
242
243
/* True when errno value 'e' is 0, EINTR, EWOULDBLOCK or EAGAIN. */
#define SOFT_ERROR(e)	((e) == 0 || \
			 (e) == EINTR || \
			 (e) == EWOULDBLOCK || \
			 (e) == EAGAIN)
244

Michael Graff's avatar
Michael Graff committed
245
#define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)
246

247
/*!<
Michael Graff's avatar
Michael Graff committed
248
249
250
251
252
253
 * DLVL(90)  --  Function entry/exit and other tracing.
 * DLVL(70)  --  Socket "correctness" -- including returning of events, etc.
 * DLVL(60)  --  Socket data send/receive
 * DLVL(50)  --  Event tracing, including receiving/sending completion events.
 * DLVL(20)  --  Socket creation/destruction.
 */
254
255
256
257
258
259
260
261
262
263
264
/* Numeric debug levels, listed in descending order. */
#define TRACE_LEVEL		90
#define CORRECTNESS_LEVEL	70
#define IOEVENT_LEVEL		60
#define EVENT_LEVEL		50
#define CREATION_LEVEL		20

/*
 * Each of these expands, via DLVL(), to a "category, module, level"
 * argument triple for the corresponding debug level.
 */
#define TRACE		DLVL(TRACE_LEVEL)
#define CORRECTNESS	DLVL(CORRECTNESS_LEVEL)
#define IOEVENT		DLVL(IOEVENT_LEVEL)
#define EVENT		DLVL(EVENT_LEVEL)
#define CREATION	DLVL(CREATION_LEVEL)
265

266
typedef isc_event_t intev_t;
Michael Graff's avatar
Michael Graff committed
267

268
#define SOCKET_MAGIC		ISC_MAGIC('I', 'O', 'i', 'o')
269
#define VALID_SOCKET(s)		ISC_MAGIC_VALID(s, SOCKET_MAGIC)
Michael Graff's avatar
Michael Graff committed
270

271
/*!
Michael Graff's avatar
Michael Graff committed
272
273
274
275
 * IPv6 control information.  If the socket is an IPv6 socket we want
 * to collect the destination address and interface so the client can
 * set them on outgoing packets.
 */
276
#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
Michael Graff's avatar
Michael Graff committed
277
278
279
280
281
#ifndef USE_CMSG
#define USE_CMSG	1
#endif
#endif

282
/*%
283
 * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
Michael Graff's avatar
Michael Graff committed
284
285
286
287
288
289
290
291
292
 * a setsockopt() like interface to request timestamps, and if the OS
 * doesn't do it for us, call gettimeofday() on every UDP receive?
 */
#ifdef SO_TIMESTAMP
#ifndef USE_CMSG
#define USE_CMSG	1
#endif
#endif

293
/*%
Francis Dupont's avatar
Francis Dupont committed
294
 * The size to raise the receive buffer to (from BIND 8).
295
296
297
 */
#define RCVBUFSIZE (32*1024)

298
/*%
299
300
301
302
 * The number of times a send operation is repeated if the result is EINTR.
 */
#define NRETRIES 10

303
304
305
306
307
308
typedef struct isc__socket isc__socket_t;
typedef struct isc__socketmgr isc__socketmgr_t;

#define NEWCONNSOCK(ev) ((isc__socket_t *)(ev)->newsocket)

struct isc__socket {
309
	/* Not locked. */
310
311
	isc_socket_t		common;
	isc__socketmgr_t	*manager;
312
313
	isc_mutex_t		lock;
	isc_sockettype_t	type;
314
	const isc_statscounter_t	*statsindex;
Michael Graff's avatar
Michael Graff committed
315

316
	/* Locked by socket lock. */
317
	ISC_LINK(isc__socket_t)	link;
318
319
	unsigned int		references;
	int			fd;
320
	int			pf;
321
322
323
	char				name[16];
	void *				tag;

324
	ISC_LIST(isc_socketevent_t)		send_list;
325
	ISC_LIST(isc_socketevent_t)		recv_list;
326
	ISC_LIST(isc_socket_newconnev_t)	accept_list;
327
328
329
330
331
332
333
	isc_socket_connev_t		       *connect_ev;

	/*
	 * Internal events.  Posted when a descriptor is readable or
	 * writable.  These are statically allocated and never freed.
	 * They will be set to non-purgable before use.
	 */
334
335
	intev_t			readable_ev;
	intev_t			writable_ev;
336

337
	isc_sockaddr_t		peer_address;       /* remote address */
338

339
340
341
	unsigned int		pending_recv : 1,
				pending_send : 1,
				pending_accept : 1,
342
				listener : 1,       /* listener socket */
343
				connected : 1,
344
345
346
347
				connecting : 1,     /* connect pending */
				bound : 1,          /* bound to local addr */
				dupped : 1,
				active : 1;         /* currently active */
348

349
#ifdef ISC_NET_RECVOVERFLOW
350
	unsigned char		overflow; /* used for MSG_TRUNC fake */
351
#endif
352
353
354
355
356

	char			*recvcmsgbuf;
	ISC_SOCKADDR_LEN_T	recvcmsgbuflen;
	char			*sendcmsgbuf;
	ISC_SOCKADDR_LEN_T	sendcmsgbuflen;
357
358
359
360
361

	void			*fdwatcharg;
	isc_sockfdwatch_t	fdwatchcb;
	int			fdwatchflags;
	isc_task_t		*fdwatchtask;
Mark Andrews's avatar
Mark Andrews committed
362
	int			dscp;
363
364
};

365
366
367
#define SOCKET_MANAGER_MAGIC	ISC_MAGIC('I', 'O', 'm', 'g')
#define VALID_MANAGER(m)	ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)

368
struct isc__socketmgr {
369
	/* Not locked. */
370
	isc_socketmgr_t		common;
371
372
	isc_mem_t	       *mctx;
	isc_mutex_t		lock;
373
	isc_mutex_t		*fdlock;
374
	isc_stats_t		*stats;
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
#ifdef USE_KQUEUE
	int			kqueue_fd;
	int			nevents;
	struct kevent		*events;
#endif	/* USE_KQUEUE */
#ifdef USE_EPOLL
	int			epoll_fd;
	int			nevents;
	struct epoll_event	*events;
#endif	/* USE_EPOLL */
#ifdef USE_DEVPOLL
	int			devpoll_fd;
	int			nevents;
	struct pollfd		*events;
#endif	/* USE_DEVPOLL */
390
391
392
#ifdef USE_SELECT
	int			fd_bufsize;
#endif	/* USE_SELECT */
393
394
395
396
397
398
	unsigned int		maxsocks;
#ifdef ISC_PLATFORM_USETHREADS
	int			pipe_fds[2];
#endif

	/* Locked by fdlock. */
399
	isc__socket_t	       **fds;
400
401
402
403
404
	int			*fdstate;
#ifdef USE_DEVPOLL
	pollinfo_t		*fdpollinfo;
#endif

405
	/* Locked by manager lock. */
406
	ISC_LIST(isc__socket_t)	socklist;
407
#ifdef USE_SELECT
408
409
410
411
	fd_set			*read_fds;
	fd_set			*read_fds_copy;
	fd_set			*write_fds;
	fd_set			*write_fds_copy;
412
	int			maxfd;
413
#endif	/* USE_SELECT */
414
	int			reserved;	/* unlocked */
415
#ifdef USE_WATCHER_THREAD
416
417
	isc_thread_t		watcher;
	isc_condition_t		shutdown_ok;
418
#else /* USE_WATCHER_THREAD */
419
	unsigned int		refs;
420
#endif /* USE_WATCHER_THREAD */
421
	int			maxudp;
422
423
};

424
425
426
#ifdef USE_SHARED_MANAGER
static isc__socketmgr_t *socketmgr = NULL;
#endif /* USE_SHARED_MANAGER */
427

428
429
430
#define CLOSED			0	/* this one must be zero */
#define MANAGED			1
#define CLOSE_PENDING		2
Michael Graff's avatar
Michael Graff committed
431

432
433
434
435
436
437
438
439
440
441
/*
 * send() and recv() iovec counts
 */
#define MAXSCATTERGATHER_SEND	(ISC_SOCKET_MAXSCATTERGATHER)
#ifdef ISC_NET_RECVOVERFLOW
# define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER + 1)
#else
# define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER)
#endif

442
443
444
445
static isc_result_t socket_create(isc_socketmgr_t *manager0, int pf,
				  isc_sockettype_t type,
				  isc_socket_t **socketp,
				  isc_socket_t *dup_socket);
446
447
448
449
450
451
static void send_recvdone_event(isc__socket_t *, isc_socketevent_t **);
static void send_senddone_event(isc__socket_t *, isc_socketevent_t **);
static void free_socket(isc__socket_t **);
static isc_result_t allocate_socket(isc__socketmgr_t *, isc_sockettype_t,
				    isc__socket_t **);
static void destroy(isc__socket_t **);
452
453
454
455
static void internal_accept(isc_task_t *, isc_event_t *);
static void internal_connect(isc_task_t *, isc_event_t *);
static void internal_recv(isc_task_t *, isc_event_t *);
static void internal_send(isc_task_t *, isc_event_t *);
456
457
static void internal_fdwatch_write(isc_task_t *, isc_event_t *);
static void internal_fdwatch_read(isc_task_t *, isc_event_t *);
458
459
static void process_cmsg(isc__socket_t *, struct msghdr *, isc_socketevent_t *);
static void build_msghdr_send(isc__socket_t *, isc_socketevent_t *,
460
			      struct msghdr *, struct iovec *, size_t *);
461
static void build_msghdr_recv(isc__socket_t *, isc_socketevent_t *,
462
			      struct msghdr *, struct iovec *, size_t *);
463
464
465
#ifdef USE_WATCHER_THREAD
static isc_boolean_t process_ctlfd(isc__socketmgr_t *manager);
#endif
Mark Andrews's avatar
Mark Andrews committed
466
static void setdscp(isc__socket_t *sock, isc_dscp_t dscp);
467
468
469
470
471
472
473
474
475

/*%
 * The following can be either static or public, depending on build environment.
 */

#ifdef BIND9
#define ISC_SOCKETFUNC_SCOPE
#else
#define ISC_SOCKETFUNC_SCOPE static
476
#endif
Michael Graff's avatar
Michael Graff committed
477

478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
		   isc_socket_t **socketp);
ISC_SOCKETFUNC_SCOPE void
isc__socket_attach(isc_socket_t *sock, isc_socket_t **socketp);
ISC_SOCKETFUNC_SCOPE void
isc__socket_detach(isc_socket_t **socketp);
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp);
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp,
		       unsigned int maxsocks);
ISC_SOCKETFUNC_SCOPE void
isc__socketmgr_destroy(isc_socketmgr_t **managerp);
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_recvv(isc_socket_t *sock, isc_bufferlist_t *buflist,
		 unsigned int minimum, isc_task_t *task,
		  isc_taskaction_t action, const void *arg);
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_recv(isc_socket_t *sock, isc_region_t *region,
		 unsigned int minimum, isc_task_t *task,
		 isc_taskaction_t action, const void *arg);
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_recv2(isc_socket_t *sock, isc_region_t *region,
		  unsigned int minimum, isc_task_t *task,
		  isc_socketevent_t *event, unsigned int flags);
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_send(isc_socket_t *sock, isc_region_t *region,
		 isc_task_t *task, isc_taskaction_t action, const void *arg);
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_sendto(isc_socket_t *sock, isc_region_t *region,
		   isc_task_t *task, isc_taskaction_t action, const void *arg,
		   isc_sockaddr_t *address, struct in6_pktinfo *pktinfo);
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
		  isc_task_t *task, isc_taskaction_t action, const void *arg);
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_sendtov(isc_socket_t *sock, isc_bufferlist_t *buflist,
		    isc_task_t *task, isc_taskaction_t action, const void *arg,
		    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo);
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_sendto2(isc_socket_t *sock, isc_region_t *region,
		    isc_task_t *task,
		    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
		    isc_socketevent_t *event, unsigned int flags);
Mark Andrews's avatar
Mark Andrews committed
523
524
525
526
527
isc_socketevent_t *
isc_socket_socketevent(isc_mem_t *mctx, void *sender,
		       isc_eventtype_t eventtype, isc_taskaction_t action,
		       const void *arg);

528
529
ISC_SOCKETFUNC_SCOPE void
isc__socket_cleanunix(isc_sockaddr_t *sockaddr, isc_boolean_t active);
530
ISC_SOCKETFUNC_SCOPE isc_result_t
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
isc__socket_permunix(isc_sockaddr_t *sockaddr, isc_uint32_t perm,
		     isc_uint32_t owner, isc_uint32_t group);
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr,
		 unsigned int options);
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_filter(isc_socket_t *sock, const char *filter);
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_listen(isc_socket_t *sock, unsigned int backlog);
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_accept(isc_socket_t *sock,
		   isc_task_t *task, isc_taskaction_t action, const void *arg);
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr,
		    isc_task_t *task, isc_taskaction_t action,
		    const void *arg);
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp);
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp);
ISC_SOCKETFUNC_SCOPE void
isc__socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how);
ISC_SOCKETFUNC_SCOPE isc_sockettype_t
isc__socket_gettype(isc_socket_t *sock);
ISC_SOCKETFUNC_SCOPE isc_boolean_t
isc__socket_isbound(isc_socket_t *sock);
ISC_SOCKETFUNC_SCOPE void
isc__socket_ipv6only(isc_socket_t *sock, isc_boolean_t yes);
Mark Andrews's avatar
Mark Andrews committed
559
560
ISC_SOCKETFUNC_SCOPE void
isc__socket_dscp(isc_socket_t *sock, isc_dscp_t dscp);
561
562
#ifdef BIND9
#ifdef HAVE_LIBXML2
563
564
565
ISC_SOCKETFUNC_SCOPE void
isc__socketmgr_renderxml(isc_socketmgr_t *mgr0, xmlTextWriterPtr writer);
#endif
566
567
568
569
570
#ifdef HAVE_JSON
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socketmgr_renderjson(isc_socketmgr_t *mgr0, json_object *stats);
#endif
#endif /* BIND9 */
571

572
573
574
575
576
577
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_fdwatchcreate(isc_socketmgr_t *manager, int fd, int flags,
			  isc_sockfdwatch_t callback, void *cbarg,
			  isc_task_t *task, isc_socket_t **socketp);
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_fdwatchpoke(isc_socket_t *sock, int flags);
578
579
ISC_SOCKETFUNC_SCOPE isc_result_t
isc__socket_dup(isc_socket_t *sock, isc_socket_t **socketp);
Mark Andrews's avatar
Mark Andrews committed
580
581
ISC_SOCKETFUNC_SCOPE int
isc__socket_getfd(isc_socket_t *sock);
582

583
584
585
586
587
588
static struct {
	isc_socketmethods_t methods;

	/*%
	 * The following are defined just for avoiding unused static functions.
	 */
589
#ifndef BIND9
590
591
	void *recvv, *send, *sendv, *sendto2, *cleanunix, *permunix, *filter,
		*listen, *accept, *getpeername, *isbound;
592
#endif
593
594
595
596
597
598
} socketmethods = {
	{
		isc__socket_attach,
		isc__socket_detach,
		isc__socket_bind,
		isc__socket_sendto,
599
		isc__socket_sendto2,
600
601
		isc__socket_connect,
		isc__socket_recv,
602
		isc__socket_recv2,
603
604
605
		isc__socket_cancel,
		isc__socket_getsockname,
		isc__socket_gettype,
606
		isc__socket_ipv6only,
607
		isc__socket_fdwatchpoke,
Mark Andrews's avatar
Mark Andrews committed
608
		isc__socket_dup,
Mark Andrews's avatar
Mark Andrews committed
609
610
		isc__socket_getfd,
		isc__socket_dscp
611
612
613
614
615
616
617
618
619
620
	}
#ifndef BIND9
	,
	(void *)isc__socket_recvv, (void *)isc__socket_send,
	(void *)isc__socket_sendv, (void *)isc__socket_sendto2,
	(void *)isc__socket_cleanunix, (void *)isc__socket_permunix,
	(void *)isc__socket_filter, (void *)isc__socket_listen,
	(void *)isc__socket_accept, (void *)isc__socket_getpeername,
	(void *)isc__socket_isbound
#endif
621
622
623
624
};

static isc_socketmgrmethods_t socketmgrmethods = {
	isc__socketmgr_destroy,
625
626
	isc__socket_create,
	isc__socket_fdwatchcreate
627
628
};

Michael Graff's avatar
Michael Graff committed
629
630
#define SELECT_POKE_SHUTDOWN		(-1)
#define SELECT_POKE_NOTHING		(-2)
631
#define SELECT_POKE_READ		(-3)
632
#define SELECT_POKE_ACCEPT		(-3) /*%< Same as _READ */
633
#define SELECT_POKE_WRITE		(-4)
634
#define SELECT_POKE_CONNECT		(-4) /*%< Same as _WRITE */
635
#define SELECT_POKE_CLOSE		(-5)
636

637
638
#define SOCK_DEAD(s)			((s)->references == 0)

639
640
641
642
643
644
645
646
647
648
649
650
651
/*%
 * Shortcut index arrays to get access to statistics counters.
 */
enum {
	STATID_OPEN = 0,
	STATID_OPENFAIL = 1,
	STATID_CLOSE = 2,
	STATID_BINDFAIL = 3,
	STATID_CONNECTFAIL = 4,
	STATID_CONNECT = 5,
	STATID_ACCEPTFAIL = 6,
	STATID_ACCEPT = 7,
	STATID_SENDFAIL = 8,
Evan Hunt's avatar
Evan Hunt committed
652
653
	STATID_RECVFAIL = 9,
	STATID_ACTIVE = 10
654
};
Mark Andrews's avatar
Mark Andrews committed
655
static const isc_statscounter_t udp4statsindex[] = {
656
657
658
659
660
661
662
663
664
	isc_sockstatscounter_udp4open,
	isc_sockstatscounter_udp4openfail,
	isc_sockstatscounter_udp4close,
	isc_sockstatscounter_udp4bindfail,
	isc_sockstatscounter_udp4connectfail,
	isc_sockstatscounter_udp4connect,
	-1,
	-1,
	isc_sockstatscounter_udp4sendfail,
Evan Hunt's avatar
Evan Hunt committed
665
666
	isc_sockstatscounter_udp4recvfail,
	isc_sockstatscounter_udp4active
667
};
Mark Andrews's avatar
Mark Andrews committed
668
static const isc_statscounter_t udp6statsindex[] = {
669
670
671
672
673
674
675
676
677
	isc_sockstatscounter_udp6open,
	isc_sockstatscounter_udp6openfail,
	isc_sockstatscounter_udp6close,
	isc_sockstatscounter_udp6bindfail,
	isc_sockstatscounter_udp6connectfail,
	isc_sockstatscounter_udp6connect,
	-1,
	-1,
	isc_sockstatscounter_udp6sendfail,
Evan Hunt's avatar
Evan Hunt committed
678
679
	isc_sockstatscounter_udp6recvfail,
	isc_sockstatscounter_udp6active
680
681
682
683
684
685
686
687
688
689
690
};
static const isc_statscounter_t tcp4statsindex[] = {
	isc_sockstatscounter_tcp4open,
	isc_sockstatscounter_tcp4openfail,
	isc_sockstatscounter_tcp4close,
	isc_sockstatscounter_tcp4bindfail,
	isc_sockstatscounter_tcp4connectfail,
	isc_sockstatscounter_tcp4connect,
	isc_sockstatscounter_tcp4acceptfail,
	isc_sockstatscounter_tcp4accept,
	isc_sockstatscounter_tcp4sendfail,
Evan Hunt's avatar
Evan Hunt committed
691
692
	isc_sockstatscounter_tcp4recvfail,
	isc_sockstatscounter_tcp4active
693
694
695
696
697
698
699
700
701
702
703
};
static const isc_statscounter_t tcp6statsindex[] = {
	isc_sockstatscounter_tcp6open,
	isc_sockstatscounter_tcp6openfail,
	isc_sockstatscounter_tcp6close,
	isc_sockstatscounter_tcp6bindfail,
	isc_sockstatscounter_tcp6connectfail,
	isc_sockstatscounter_tcp6connect,
	isc_sockstatscounter_tcp6acceptfail,
	isc_sockstatscounter_tcp6accept,
	isc_sockstatscounter_tcp6sendfail,
Evan Hunt's avatar
Evan Hunt committed
704
705
	isc_sockstatscounter_tcp6recvfail,
	isc_sockstatscounter_tcp6active
706
707
708
709
710
711
712
713
714
715
716
};
static const isc_statscounter_t unixstatsindex[] = {
	isc_sockstatscounter_unixopen,
	isc_sockstatscounter_unixopenfail,
	isc_sockstatscounter_unixclose,
	isc_sockstatscounter_unixbindfail,
	isc_sockstatscounter_unixconnectfail,
	isc_sockstatscounter_unixconnect,
	isc_sockstatscounter_unixacceptfail,
	isc_sockstatscounter_unixaccept,
	isc_sockstatscounter_unixsendfail,
Evan Hunt's avatar
Evan Hunt committed
717
718
	isc_sockstatscounter_unixrecvfail,
	isc_sockstatscounter_unixactive
719
720
721
722
723
724
725
726
727
728
729
};
static const isc_statscounter_t fdwatchstatsindex[] = {
	-1,
	-1,
	isc_sockstatscounter_fdwatchclose,
	isc_sockstatscounter_fdwatchbindfail,
	isc_sockstatscounter_fdwatchconnectfail,
	isc_sockstatscounter_fdwatchconnect,
	-1,
	-1,
	isc_sockstatscounter_fdwatchsendfail,
Evan Hunt's avatar
Evan Hunt committed
730
731
	isc_sockstatscounter_fdwatchrecvfail,
	-1
732
733
};

734
735
#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) || \
    defined(USE_WATCHER_THREAD)
736
static void
737
manager_log(isc__socketmgr_t *sockmgr,
738
739
	    isc_logcategory_t *category, isc_logmodule_t *module, int level,
	    const char *fmt, ...) ISC_FORMAT_PRINTF(5, 6);
Michael Graff's avatar
Michael Graff committed
740
static void
741
manager_log(isc__socketmgr_t *sockmgr,
Michael Graff's avatar
Michael Graff committed
742
743
744
745
746
747
	    isc_logcategory_t *category, isc_logmodule_t *module, int level,
	    const char *fmt, ...)
{
	char msgbuf[2048];
	va_list ap;

748
749
750
	if (! isc_log_wouldlog(isc_lctx, level))
		return;

Michael Graff's avatar
Michael Graff committed
751
752
753
754
755
756
757
	va_start(ap, fmt);
	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
	va_end(ap);

	isc_log_write(isc_lctx, category, module, level,
		      "sockmgr %p: %s", sockmgr, msgbuf);
}
758
#endif
Michael Graff's avatar
Michael Graff committed
759

760
static void
761
socket_log(isc__socket_t *sock, isc_sockaddr_t *address,
762
763
764
	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
	   isc_msgcat_t *msgcat, int msgset, int message,
	   const char *fmt, ...) ISC_FORMAT_PRINTF(9, 10);
Michael Graff's avatar
Michael Graff committed
765
static void
766
socket_log(isc__socket_t *sock, isc_sockaddr_t *address,
Michael Graff's avatar
Michael Graff committed
767
	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
768
	   isc_msgcat_t *msgcat, int msgset, int message,
Michael Graff's avatar
Michael Graff committed
769
770
771
	   const char *fmt, ...)
{
	char msgbuf[2048];
772
	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
Michael Graff's avatar
Michael Graff committed
773
774
	va_list ap;

775
776
777
	if (! isc_log_wouldlog(isc_lctx, level))
		return;

Michael Graff's avatar
Michael Graff committed
778
779
780
781
782
	va_start(ap, fmt);
	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
	va_end(ap);

	if (address == NULL) {
783
784
785
		isc_log_iwrite(isc_lctx, category, module, level,
			       msgcat, msgset, message,
			       "socket %p: %s", sock, msgbuf);
Michael Graff's avatar
Michael Graff committed
786
	} else {
Andreas Gustafsson's avatar
Andreas Gustafsson committed
787
		isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
788
789
790
		isc_log_iwrite(isc_lctx, category, module, level,
			       msgcat, msgset, message,
			       "socket %p %s: %s", sock, peerbuf, msgbuf);
Michael Graff's avatar
Michael Graff committed
791
792
793
	}
}

794
795
796
797
798
799
800
#if defined(_AIX) && defined(ISC_NET_BSD44MSGHDR) && \
    defined(USE_CMSG) && defined(IPV6_RECVPKTINFO)
/*
 * AIX has a kernel bug where IPV6_RECVPKTINFO gets cleared by
 * setting IPV6_V6ONLY.
 *
 * Turn IPV6_RECVPKTINFO on again for 'sock'.  Only IPv6 UDP sockets are
 * touched; a setsockopt() failure is reported with UNEXPECTED_ERROR and is
 * otherwise ignored.
 */
static void
FIX_IPV6_RECVPKTINFO(isc__socket_t *sock)
{
	int on = 1;
	char strbuf[ISC_STRERRORSIZE];

	if (sock->pf != AF_INET6 || sock->type != isc_sockettype_udp)
		return;

	if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
		       (void *)&on, sizeof(on)) >= 0)
		return;

	isc__strerror(errno, strbuf, sizeof(strbuf));
	UNEXPECTED_ERROR(__FILE__, __LINE__,
			 "setsockopt(%d, IPV6_RECVPKTINFO) "
			 "%s: %s", sock->fd,
			 isc_msgcat_get(isc_msgcat,
					ISC_MSGSET_GENERAL,
					ISC_MSG_FAILED,
					"failed"),
			 strbuf);
}
#else
/* Where the workaround is not compiled in, this is a no-op. */
#define FIX_IPV6_RECVPKTINFO(sock) (void)0
#endif

827
828
829
830
831
832
833
834
835
836
837
/*%
 * Add one to the socket statistics counter 'counterid' in 'stats'.
 *
 * 'counterid' must not be -1 (enforced by REQUIRE).  A NULL 'stats' is
 * accepted and makes the call a no-op.
 */
static inline void
inc_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
	REQUIRE(counterid != -1);

	if (stats == NULL)
		return;

	isc_stats_increment(stats, counterid);
}

Evan Hunt's avatar
Evan Hunt committed
838
839
840
841
842
843
844
845
846
847
848
/*%
 * Subtract one from the socket statistics counter 'counterid' in 'stats'.
 *
 * 'counterid' must not be -1 (enforced by REQUIRE).  A NULL 'stats' is
 * accepted and makes the call a no-op.
 */
static inline void
dec_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
	REQUIRE(counterid != -1);

	if (stats == NULL)
		return;

	isc_stats_decrement(stats, counterid);
}

849
static inline isc_result_t
850
watch_fd(isc__socketmgr_t *manager, int fd, int msg) {
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
	isc_result_t result = ISC_R_SUCCESS;

#ifdef USE_KQUEUE
	struct kevent evchange;

	memset(&evchange, 0, sizeof(evchange));
	if (msg == SELECT_POKE_READ)
		evchange.filter = EVFILT_READ;
	else
		evchange.filter = EVFILT_WRITE;
	evchange.flags = EV_ADD;
	evchange.ident = fd;
	if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
		result = isc__errno2result(errno);

	return (result);
#elif defined(USE_EPOLL)
	struct epoll_event event;

	if (msg == SELECT_POKE_READ)
		event.events = EPOLLIN;
	else
		event.events = EPOLLOUT;
874
	memset(&event.data, 0, sizeof(event.data));
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
	event.data.fd = fd;
	if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_ADD, fd, &event) == -1 &&
	    errno != EEXIST) {
		result = isc__errno2result(errno);
	}

	return (result);
#elif defined(USE_DEVPOLL)
	struct pollfd pfd;
	int lockid = FDLOCK_ID(fd);

	memset(&pfd, 0, sizeof(pfd));
	if (msg == SELECT_POKE_READ)
		pfd.events = POLLIN;
	else
		pfd.events = POLLOUT;
	pfd.fd = fd;
	pfd.revents = 0;
	LOCK(&manager->fdlock[lockid]);
	if (write(manager->devpoll_fd, &pfd, sizeof(pfd)) == -1)
		result = isc__errno2result(errno);
	else {
		if (msg == SELECT_POKE_READ)
			manager->fdpollinfo[fd].want_read = 1;
		else
			manager->fdpollinfo[fd].want_write = 1;
	}
	UNLOCK(&manager->fdlock[lockid]);

	return (result);
#elif defined(USE_SELECT)
	LOCK(&manager->lock);
	if (msg == SELECT_POKE_READ)
908
		FD_SET(fd, manager->read_fds);
909
	if (msg == SELECT_POKE_WRITE)
910
		FD_SET(fd, manager->write_fds);
911
912
913
914
915
916
917
	UNLOCK(&manager->lock);

	return (result);
#endif
}

static inline isc_result_t
918
unwatch_fd(isc__socketmgr_t *manager, int fd, int msg) {
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
	isc_result_t result = ISC_R_SUCCESS;

#ifdef USE_KQUEUE
	struct kevent evchange;

	memset(&evchange, 0, sizeof(evchange));
	if (msg == SELECT_POKE_READ)
		evchange.filter = EVFILT_READ;
	else
		evchange.filter = EVFILT_WRITE;
	evchange.flags = EV_DELETE;
	evchange.ident = fd;
	if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
		result = isc__errno2result(errno);

	return (result);
#elif defined(USE_EPOLL)
	struct epoll_event event;

	if (msg == SELECT_POKE_READ)
		event.events = EPOLLIN;
	else
		event.events = EPOLLOUT;
942
	memset(&event.data, 0, sizeof(event.data));
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
	event.data.fd = fd;
	if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_DEL, fd, &event) == -1 &&
	    errno != ENOENT) {
		char strbuf[ISC_STRERRORSIZE];
		isc__strerror(errno, strbuf, sizeof(strbuf));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "epoll_ctl(DEL), %d: %s", fd, strbuf);
		result = ISC_R_UNEXPECTED;
	}
	return (result);
#elif defined(USE_DEVPOLL)
	struct pollfd pfds[2];
	size_t writelen = sizeof(pfds[0]);
	int lockid = FDLOCK_ID(fd);

	memset(pfds, 0, sizeof(pfds));
	pfds[0].events = POLLREMOVE;
	pfds[0].fd = fd;

	/*
	 * Canceling read or write polling via /dev/poll is tricky.  Since it
	 * only provides a way of canceling per FD, we may need to re-poll the
	 * socket for the other operation.
	 */
	LOCK(&manager->fdlock[lockid]);
	if (msg == SELECT_POKE_READ &&
	    manager->fdpollinfo[fd].want_write == 1) {
		pfds[1].events = POLLOUT;
		pfds[1].fd = fd;
		writelen += sizeof(pfds[1]);
	}
	if (msg == SELECT_POKE_WRITE &&
	    manager->fdpollinfo[fd].want_read == 1) {
		pfds[1].events = POLLIN;
		pfds[1].fd = fd;
		writelen += sizeof(pfds[1]);
	}

Automatic Updater's avatar
Automatic Updater committed
981
	if (write(manager->devpoll_fd, pfds, writelen) == -1)
982
983
984
985
986
987
988
989
990
991
992
993
994
		result = isc__errno2result(errno);
	else {
		if (msg == SELECT_POKE_READ)
			manager->fdpollinfo[fd].want_read = 0;
		else
			manager->fdpollinfo[fd].want_write = 0;
	}
	UNLOCK(&manager->fdlock[lockid]);

	return (result);
#elif defined(USE_SELECT)
	LOCK(&manager->lock);
	if (msg == SELECT_POKE_READ)
995
		FD_CLR(fd, manager->read_fds);
996
	else if (msg == SELECT_POKE_WRITE)
997
		FD_CLR(fd, manager->write_fds);
998
999
1000
1001
1002
1003
	UNLOCK(&manager->lock);

	return (result);
#endif
}

1004
static void
1005
wakeup_socket(isc__socketmgr_t *manager, int fd, int msg) {
1006
1007
	isc_result_t result;
	int lockid = FDLOCK_ID(fd);
1008
1009

	/*
1010
1011
1012
	 * This is a wakeup on a socket.  If the socket is not in the
	 * process of being closed, start watching it for either reads
	 * or writes.
1013
	 */
Andreas Gustafsson's avatar
   
Andreas Gustafsson committed
1014

1015
	INSIST(fd >= 0 && fd < (int)manager->maxsocks);
1016

1017
1018
1019
	if (msg == SELECT_POKE_CLOSE) {
		/* No one should be updating fdstate, so no need to lock it */
		INSIST(manager->fdstate[fd] == CLOSE_PENDING);
Michael Graff's avatar
Michael Graff committed
1020
		manager->fdstate[fd] = CLOSED;
1021
1022
1023
1024
1025
1026
1027
1028
		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
		(void)close(fd);
		return;
	}

	LOCK(&manager->fdlock[lockid]);
	if (manager->fdstate[fd] == CLOSE_PENDING) {
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
		UNLOCK(&manager->fdlock[lockid]);

		/*
		 * We accept (and ignore) any error from unwatch_fd() as we are
		 * closing the socket, hoping it doesn't leave dangling state in
		 * the kernel.
		 * Note that unwatch_fd() must be called after releasing the
		 * fdlock; otherwise it could cause deadlock due to a lock order
		 * reversal.
		 */
		(void)unwatch_fd(manager, fd, SELECT_POKE_READ);
		(void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
1041
1042
		return;
	}
1043
1044
	if (manager->fdstate[fd] != MANAGED) {
		UNLOCK(&manager->fdlock[lockid]);
1045
		return;
1046
1047
	}
	UNLOCK(&manager->fdlock[lockid]);
1048
1049

	/*
Mark Andrews's avatar
Mark Andrews committed
1050
	 * Set requested bit.
1051
	 */
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
	result = watch_fd(manager, fd, msg);
	if (result != ISC_R_SUCCESS) {
		/*
		 * XXXJT: what should we do?  Ignoring the failure of watching
		 * a socket will make the application dysfunctional, but there
		 * seems to be no reasonable recovery process.
		 */
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
			      "failed to start watching FD (%d): %s",
			      fd, isc_result_totext(result));
	}
1064
1065
}

1066
#ifdef USE_WATCHER_THREAD
1067
/*
Michael Graff's avatar
Michael Graff committed
1068
 * Poke the select loop when there is something for us to do.
1069
1070
 * The write is required (by POSIX) to complete.  That is, we
 * will not get partial writes.
1071
1072
 */
static void
1073
select_poke(isc__socketmgr_t *mgr, int fd, int msg) {
Michael Graff's avatar
Michael Graff committed
1074
	int cc;
1075
	int buf[2];
1076
	char strbuf[ISC_STRERRORSIZE];
1077
1078
1079

	buf[0] = fd;
	buf[1] = msg;
Michael Graff's avatar
Michael Graff committed
1080

1081
	do {
1082
		cc = write(mgr->pipe_fds[1], buf, sizeof(buf));
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
#ifdef ENOSR
		/*
		 * Treat ENOSR as EAGAIN but loop slowly as it is
		 * unlikely to clear fast.
		 */
		if (cc < 0 && errno == ENOSR) {
			sleep(1);
			errno = EAGAIN;
		}
#endif
1093
	} while (cc < 0 && SOFT_ERROR(errno));
1094

1095
1096
	if (cc < 0) {
		isc__strerror(errno, strbuf, sizeof(strbuf));
Michael Graff's avatar
Michael Graff committed
1097
		FATAL_ERROR(__FILE__, __LINE__,
1098
1099
1100
1101
			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
					   ISC_MSG_WRITEFAILED,
					   "write() failed "
					   "during watcher poke: %s"),
1102
1103
			    strbuf);
	}
1104

1105
	INSIST(cc == sizeof(buf));
1106
1107
1108
}

/*
Andreas Gustafsson's avatar
Andreas Gustafsson committed
1109
 * Read a message on the internal fd.
1110
 */
1111
static void
1112
select_readmsg(isc__socketmgr_t *mgr, int *fd, int *msg) {
1113
	int buf[2];
Michael Graff's avatar
Michael Graff committed
1114
	int cc;
1115
	char strbuf[ISC_STRERRORSIZE];
Michael Graff's avatar
Michael Graff committed
1116

1117
	cc = read(mgr->pipe_fds[0], buf, sizeof(buf));
Michael Graff's avatar
Michael Graff committed
1118
	if (cc < 0) {
1119
		*msg = SELECT_POKE_NOTHING;
1120
		*fd = -1;	/* Silence compiler. */
Michael Graff's avatar
Michael Graff committed
1121
		if (SOFT_ERROR(errno))
1122
			return;
Michael Graff's avatar
Michael Graff committed
1123

1124
		isc__strerror(errno, strbuf, sizeof(strbuf));
Michael Graff's avatar
Michael Graff committed
1125
		FATAL_ERROR(__FILE__, __LINE__,
1126
1127
1128
1129
			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
					   ISC_MSG_READFAILED,
					   "read() failed "
					   "during watcher poke: %s"),
1130
			    strbuf);
Automatic Updater's avatar
Automatic Updater committed
1131

1132
		return;
Michael Graff's avatar
Michael Graff committed
1133
	}
1134
	INSIST(cc == sizeof(buf));
1135

1136
1137
	*fd = buf[0];
	*msg = buf[1];
1138
}
1139
#else /* USE_WATCHER_THREAD */
1140
1141
1142
1143
/*
 * Update the state of the socketmgr when something changes.
 */
static void
1144
select_poke(isc__socketmgr_t *manager, int fd, int msg) {
1145
1146
	if (msg == SELECT_POKE_SHUTDOWN)
		return;
1147
1148
	else if (fd >= 0)
		wakeup_socket(manager, fd, msg);
1149
1150
	return;
}
1151
#endif /* USE_WATCHER_THREAD */
1152
1153

/*
Andreas Gustafsson's avatar
Andreas Gustafsson committed
1154
 * Make a fd non-blocking.
1155
 */
Michael Graff's avatar
Michael Graff committed
1156
static isc_result_t
1157
make_nonblock(int fd) {
Michael Graff's avatar
Michael Graff committed
1158
1159
	int ret;
	int flags;
1160
	char strbuf[ISC_STRERRORSIZE];
1161
1162
#ifdef USE_FIONBIO_IOCTL
	int on = 1;
1163

1164
1165
	ret = ioctl(fd, FIONBIO, (char *)&on);
#else
Michael Graff's avatar
Michael Graff committed
1166
	flags = fcntl(fd, F_GETFL, 0);
1167
	flags |= PORT_NONBLOCK;
Michael Graff's avatar
Michael Graff committed
1168
	ret = fcntl(fd, F_SETFL, flags);
1169
#endif
1170

Michael Graff's avatar
Michael Graff committed
1171
	if (ret == -1) {
1172
		isc__strerror(errno, strbuf, sizeof(strbuf));
Michael Graff's avatar
Michael Graff committed
1173
		UNEXPECTED_ERROR(__FILE__, __LINE__,
1174
1175
1176
1177
1178
1179
#ifdef USE_FIONBIO_IOCTL
				 "ioctl(%d, FIONBIO, &on): %s", fd,
#else
				 "fcntl(%d, F_SETFL, %d): %s", fd, flags,
#endif
				 strbuf);
Michael Graff's avatar
Michael Graff committed
1180

Michael Graff's avatar
Michael Graff committed
1181
		return (ISC_R_UNEXPECTED);
Michael Graff's avatar
Michael Graff committed
1182
1183
	}

Michael Graff's avatar
Michael Graff committed
1184
	return (ISC_R_SUCCESS);
1185
1186
}

1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
#ifdef USE_CMSG
/*
 * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE.
 * In order to ensure as much portability as possible, we provide wrapper
 * functions of these macros.
 * Note that cmsg_space() could run slow on OSes that do not have
 * CMSG_SPACE.
 *
 * cmsg_len() returns CMSG_LEN(len) where that macro exists; otherwise
 * it returns the offset CMSG_DATA yields for a null header plus 'len'.
 */
static inline ISC_SOCKADDR_LEN_T
cmsg_len(ISC_SOCKADDR_LEN_T len) {
#ifdef CMSG_LEN
	return (CMSG_LEN(len));
#else
	/*
	 * Cast NULL so that any pointer arithmetic performed by CMSG_DATA
	 * is correct; the resulting "address" is the header length.
	 */
	ISC_SOCKADDR_LEN_T dataoffset =
		(ISC_SOCKADDR_LEN_T)CMSG_DATA(((struct cmsghdr *)NULL));

	return (dataoffset + len);
#endif
}

/*
 * Return CMSG_SPACE(len) where that macro exists.  Otherwise a control
 * message of 'len' data bytes is laid out in a scratch buffer and the
 * offset at which CMSG_NXTHDR places the following header is returned,
 * or 0 if CMSG_NXTHDR yields no following header.
 */
static inline ISC_SOCKADDR_LEN_T
cmsg_space(ISC_SOCKADDR_LEN_T len) {
#ifdef CMSG_SPACE
	return (CMSG_SPACE(len));
#else
	struct msghdr scratch;
	struct cmsghdr *hdr;
	/*
	 * XXX: The buffer length is an ad-hoc value, but should be enough
	 * in a practical sense.
	 */
	char dummybuf[sizeof(struct cmsghdr) + 1024];

	memset(&scratch, 0, sizeof(scratch));
	scratch.msg_control = dummybuf;
	scratch.msg_controllen = sizeof(dummybuf);

	hdr = (struct cmsghdr *)dummybuf;
	hdr->cmsg_len = cmsg_len(len);

	hdr = CMSG_NXTHDR(&scratch, hdr);
	if (hdr == NULL)
		return (0);

	return ((char *)hdr - (char *)scratch.msg_control);
#endif
}
#endif /* USE_CMSG */

1240
1241
1242
1243
/*
 * Process control messages received on a socket.
 */
static void
1244
process_cmsg(isc__socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
Michael Graff's avatar
Michael Graff committed
1245
#ifdef USE_CMSG
1246
	struct cmsghdr *cmsgp;
1247
#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
Michael Graff's avatar
Michael Graff committed
1248
1249
1250
	struct in6_pktinfo *pktinfop;
#endif
#ifdef SO_TIMESTAMP
1251
	void *timevalp;
Michael Graff's avatar
Michael Graff committed
1252
1253
1254
#endif
#endif

1255
1256
1257
1258
1259
1260
	/*
	 * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined.
	 * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined.
	 * They are all here, outside of the CPP tests, because it is
	 * more consistent with the usual ISC coding style.
	 */
1261
	UNUSED(sock);
1262
1263
1264
	UNUSED(msg);
	UNUSED(dev);

1265
#ifdef ISC_NET_BSD44MSGHDR
1266

Bob Halley's avatar
Bob Halley committed
1267
#ifdef MSG_TRUNC
1268
1269
	if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC)
		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
Bob Halley's avatar
Bob Halley committed
1270
#endif
1271

Bob Halley's avatar
Bob Halley committed
1272
#ifdef MSG_CTRUNC
1273
1274
	if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC)
		dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
Bob Halley's avatar
Bob Halley committed
1275
#endif
1276

Michael Graff's avatar
Michael Graff committed
1277
1278
1279
#ifndef USE_CMSG
	return;
#else
Mark Andrews's avatar
Mark Andrews committed
1280
	if (msg->msg_controllen == 0U || msg->msg_control == NULL)
1281
		return;
Michael Graff's avatar
Michael Graff committed
1282
1283
1284
1285

#ifdef SO_TIMESTAMP
	timevalp = NULL;
#endif
1286
#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
Michael Graff's avatar
Michael Graff committed
1287
1288
1289
1290
1291
	pktinfop = NULL;
#endif

	cmsgp = CMSG_FIRSTHDR(msg);
	while (cmsgp != NULL) {
1292
		socket_log(sock, NULL, TRACE,
1293
1294
			   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PROCESSCMSG,
			   "processing cmsg %p", cmsgp);
Michael Graff's avatar
Michael Graff committed
1295

1296
#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
1297
1298
		if (cmsgp->cmsg_level == IPPROTO_IPV6
		    && cmsgp->cmsg_type == IPV6_PKTINFO) {
1299

Michael Graff's avatar
Michael Graff committed
1300
			pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
1301
1302
			memcpy(&dev->pktinfo, pktinfop,
			       sizeof(struct in6_pktinfo));
Michael Graff's avatar
Michael Graff committed
1303
			dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
1304
			socket_log(sock, NULL, TRACE,
1305
1306
1307
				   isc_msgcat, ISC_MSGSET_SOCKET,
				   ISC_MSG_IFRECEIVED,
				   "interface received on ifindex %u",
David Lawrence's avatar
David Lawrence committed
1308
				   dev->pktinfo.ipi6_ifindex);
1309
			if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr))
Automatic Updater's avatar
Automatic Updater committed
1310
				dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;
Michael Graff's avatar
Michael Graff committed
1311
1312
1313
1314
1315
			goto next;
		}
#endif

#ifdef SO_TIMESTAMP
1316
1317
		if (cmsgp->cmsg_level == SOL_SOCKET
		    && cmsgp->cmsg_type == SCM_TIMESTAMP) {
1318
1319
1320
1321
1322
			struct timeval tv;
			timevalp = CMSG_DATA(cmsgp);
			memcpy(&tv, timevalp, sizeof(tv));
			dev->timestamp.seconds = tv.tv_sec;
			dev->timestamp.nanoseconds = tv.tv_usec * 1000;
Michael Graff's avatar
Michael Graff committed
1323
1324
1325
1326
1327
			dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
			goto next;
		}
#endif

Mark Andrews's avatar
Mark Andrews committed
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
#ifdef IPV6_TCLASS
		if (cmsgp->cmsg_level == IPPROTO_IPV6
		    && cmsgp->cmsg_type == IPV6_TCLASS) {
			dev->dscp = *(int *)CMSG_DATA(cmsgp);
			dev->dscp >>= 2;
			dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
			goto next;
		}
#endif

#ifdef IP_TOS
		if (cmsgp->cmsg_level == IPPROTO_IP
		    && cmsgp->cmsg_type == IP_TOS) {
			dev->dscp = (int) *(uint8_t *)CMSG_DATA(cmsgp);
			dev->dscp >>= 2;
			dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
			goto next;
		}
#endif
Michael Graff's avatar
Michael Graff committed
1347
1348
1349
1350
1351
	next:
		cmsgp = CMSG_NXTHDR(msg, cmsgp);
	}
#endif /* USE_CMSG */