socket.c 140 KB
Newer Older
Bob Halley's avatar
Bob Halley committed
1
/*
2
 * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
3
 *
4
5
6
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7
8
9
 *
 * See the COPYRIGHT file distributed with this work for additional
 * information regarding copyright ownership.
Bob Halley's avatar
Bob Halley committed
10
 */
Bob Halley's avatar
Bob Halley committed
11

12
/*! \file */
David Lawrence's avatar
David Lawrence committed
13

14
15
16
#include <inttypes.h>
#include <stdbool.h>

17
#include <sys/param.h>
Michael Graff's avatar
Michael Graff committed
18
#include <sys/types.h>
Michael Graff's avatar
Michael Graff committed
19
#include <sys/socket.h>
20
#include <sys/stat.h>
21
22
23
#ifdef HAVE_SYS_SYSCTL_H
#include <sys/sysctl.h>
#endif
Michael Graff's avatar
Michael Graff committed
24
#include <sys/time.h>
Michael Graff's avatar
Michael Graff committed
25
26
#include <sys/uio.h>

Mark Andrews's avatar
Mark Andrews committed
27
28
29
30
31
#if defined(HAVE_LINUX_NETLINK_H) && defined(HAVE_LINUX_RTNETLINK_H)
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#endif

32
#include <errno.h>
Andreas Gustafsson's avatar
Andreas Gustafsson committed
33
#include <fcntl.h>
34
35
36
37
#include <stddef.h>
#include <stdlib.h>
#include <unistd.h>

38
#include <isc/app.h>
39
#include <isc/buffer.h>
40
#include <isc/condition.h>
41
#include <isc/formatcheck.h>
42
#include <isc/json.h>
43
#include <isc/list.h>
Michael Graff's avatar
Michael Graff committed
44
#include <isc/log.h>
45
#include <isc/mem.h>
46
#include <isc/mutex.h>
47
#include <isc/net.h>
48
#include <isc/once.h>
49
#include <isc/platform.h>
Michael Graff's avatar
Michael Graff committed
50
#include <isc/print.h>
51
#include <isc/refcount.h>
52
#include <isc/region.h>
53
#include <isc/resource.h>
54
#include <isc/socket.h>
55
#include <isc/stats.h>
56
#include <isc/strerr.h>
Evan Hunt's avatar
Evan Hunt committed
57
#include <isc/string.h>
58
#include <isc/task.h>
59
#include <isc/thread.h>
Michael Graff's avatar
Michael Graff committed
60
#include <isc/util.h>
61
#include <isc/xml.h>
Bob Halley's avatar
Bob Halley committed
62

63
64
65
#ifdef ISC_PLATFORM_HAVESYSUNH
#include <sys/un.h>
#endif
66
#ifdef HAVE_KQUEUE
67
68
#include <sys/event.h>
#endif
69
#ifdef HAVE_EPOLL_CREATE1
70
71
#include <sys/epoll.h>
#endif
72
#if defined(HAVE_SYS_DEVPOLL_H)
73
#include <sys/devpoll.h>
74
75
76
#elif defined(HAVE_DEVPOLL_H)
#include <devpoll.h>
#endif
77

78
79
#include <netinet/tcp.h>

80
81
#include "errno2result.h"

82
83
84
85
#if defined(SO_BSDCOMPAT) && defined(__linux__)
#include <sys/utsname.h>
#endif

86
#ifdef ENABLE_TCP_FASTOPEN
87
88
89
#include <netinet/tcp.h>
#endif

90
/*%
Automatic Updater's avatar
Automatic Updater committed
91
 * Choose the most preferable multiplex method.
92
 */
93
#if defined(HAVE_KQUEUE)
94
#define USE_KQUEUE
95
#elif defined(HAVE_EPOLL_CREATE1)
96
#define USE_EPOLL
97
#elif defined(HAVE_SYS_DEVPOLL_H) || defined(HAVE_DEVPOLL_H)
98
99
100
101
102
103
104
#define USE_DEVPOLL
typedef struct {
	unsigned int want_read : 1,
		want_write : 1;
} pollinfo_t;
#else
#define USE_SELECT
105
#endif	/* HAVE_KQUEUE */
106

Evan Hunt's avatar
Evan Hunt committed
107
108
109
110
111
112
113
/*
 * Set by the -T dscp option on the command line. If set to a value
 * other than -1, we check to make sure DSCP values match it, and
 * assert if not.
 */
int isc_dscp_check_value = -1;

114
115
/*%
 * Maximum number of allowable open sockets.  This is also the maximum
116
117
118
119
120
121
 * allowable socket file descriptor.
 *
 * Care should be taken before modifying this value for select():
 * The API standard doesn't ensure select() accepts more than (the system default
 * of) FD_SETSIZE descriptors, and the default size should in fact be fine in
 * the vast majority of cases.  This constant should therefore be increased only
Automatic Updater's avatar
Automatic Updater committed
122
 * when absolutely necessary and possible, i.e., the server is exhausting all
123
124
125
126
127
128
129
130
131
 * available file descriptors (up to FD_SETSIZE) and the select() function
 * and FD_xxx macros support larger values than FD_SETSIZE (which may not
 * always be true, but we keep using some of them to ensure as much
 * portability as possible).  Note also that overall server performance
 * may be rather worsened with a larger value of this constant due to
 * inherent scalability problems of select().
 *
 * As a special note, this value shouldn't have to be touched if
 * this is a build for an authoritative only DNS server.
132
133
 */
#ifndef ISC_SOCKET_MAXSOCKETS
134
#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
135
136
137
#ifdef TUNE_LARGE
#define ISC_SOCKET_MAXSOCKETS 21000
#else
138
#define ISC_SOCKET_MAXSOCKETS 4096
139
#endif /* TUNE_LARGE */
140
141
142
143
144
145
146
#elif defined(USE_SELECT)
#define ISC_SOCKET_MAXSOCKETS FD_SETSIZE
#endif	/* USE_KQUEUE... */
#endif	/* ISC_SOCKET_MAXSOCKETS */

#ifdef USE_SELECT
/*%
147
148
 * Mac OS X needs a special definition to support larger values in select().
 * We always define this because a larger value can be specified run-time.
149
150
151
152
153
154
 */
#ifdef __APPLE__
#define _DARWIN_UNLIMITED_SELECT
#endif	/* __APPLE__ */
#endif	/* USE_SELECT */

155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
#ifdef ISC_SOCKET_USE_POLLWATCH
/*%
 * If this macro is defined, enable workaround for a Solaris /dev/poll kernel
 * bug: DP_POLL ioctl could keep sleeping even if socket I/O is possible for
 * some of the specified FD.  The idea is based on the observation that it's
 * likely for a busy server to keep receiving packets.  It specifically works
 * as follows: the socket watcher is first initialized with the state of
 * "poll_idle".  While it's in the idle state it keeps sleeping until a socket
 * event occurs.  When it wakes up for a socket I/O event, it moves to the
 * poll_active state, and sets the poll timeout to a short period
 * (ISC_SOCKET_POLLWATCH_TIMEOUT msec).  If timeout occurs in this state, the
 * watcher goes to the poll_checking state with the same timeout period.
 * In this state, the watcher tries to detect whether this is a break
 * during intermittent events or the kernel bug is triggered.  If the next
 * polling reports an event within the short period, the previous timeout is
 * likely to be a kernel bug, and so the watcher goes back to the active state.
 * Otherwise, it moves to the idle state again.
 *
 * It's not clear whether this is a thread-related bug, but since we've only
 * seen this with threads, this workaround is used only when enabling threads.
 */

typedef enum { poll_idle, poll_active, poll_checking } pollstate_t;

#ifndef ISC_SOCKET_POLLWATCH_TIMEOUT
#define ISC_SOCKET_POLLWATCH_TIMEOUT 10
#endif	/* ISC_SOCKET_POLLWATCH_TIMEOUT */
#endif	/* ISC_SOCKET_USE_POLLWATCH */

184
/*%
Witold Krecicki's avatar
Witold Krecicki committed
185
 * Per-FD lock buckets, we shuffle them around a bit as FDs come in herds.
186
 */
Witold Krecicki's avatar
Witold Krecicki committed
187
188
189
190
#define FDLOCK_BITS		10
#define FDLOCK_COUNT		(1<<FDLOCK_BITS)
#define FDLOCK_ID(fd)		(((fd)%(FDLOCK_COUNT)>>(FDLOCK_BITS/2)) |\
				 (((fd)<<(FDLOCK_BITS/2))%(FDLOCK_COUNT)))
191
192
193
194
195
196
197

/*%
 * Maximum number of events communicated with the kernel.  There should normally
 * be no need for having a large number.
 */
#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
#ifndef ISC_SOCKET_MAXEVENTS
198
199
200
#ifdef TUNE_LARGE
#define ISC_SOCKET_MAXEVENTS	2048
#else
201
#define ISC_SOCKET_MAXEVENTS	64
202
#endif /* TUNE_LARGE */
203
204
205
#endif
#endif

206
/*%
207
 * Some systems define the socket length argument as an int, some as size_t,
208
 * some as socklen_t.  This is here so it can be easily changed if needed.
209
 */
210
211
#ifndef socklen_t
#define socklen_t unsigned int
212
#endif
213

214
/*%
215
216
 * Define what the possible "soft" errors can be.  These are non-fatal returns
 * of various network related functions, like recv() and so on.
217
218
219
220
 *
 * For some reason, BSDI (and perhaps others) will sometimes return <0
 * from recv() but will have errno==0.  This is broken, but we have to
 * work around it here.
221
 */
222
223
#define SOFT_ERROR(e)	((e) == EAGAIN || \
			 (e) == EWOULDBLOCK || \
Ondřej Surý's avatar
Ondřej Surý committed
224
			 (e) == ENOBUFS || \
225
226
			 (e) == EINTR || \
			 (e) == 0)
227

Michael Graff's avatar
Michael Graff committed
228
#define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)
229

230
/*!<
Michael Graff's avatar
Michael Graff committed
231
232
233
234
235
236
 * DLVL(90)  --  Function entry/exit and other tracing.
 * DLVL(70)  --  Socket "correctness" -- including returning of events, etc.
 * DLVL(60)  --  Socket data send/receive
 * DLVL(50)  --  Event tracing, including receiving/sending completion events.
 * DLVL(20)  --  Socket creation/destruction.
 */
237
238
239
240
241
242
243
244
245
246
247
#define TRACE_LEVEL		90
#define CORRECTNESS_LEVEL	70
#define IOEVENT_LEVEL		60
#define EVENT_LEVEL		50
#define CREATION_LEVEL		20

#define TRACE		DLVL(TRACE_LEVEL)
#define CORRECTNESS	DLVL(CORRECTNESS_LEVEL)
#define IOEVENT		DLVL(IOEVENT_LEVEL)
#define EVENT		DLVL(EVENT_LEVEL)
#define CREATION	DLVL(CREATION_LEVEL)
248

249
typedef isc_event_t intev_t;
Michael Graff's avatar
Michael Graff committed
250

251
#define SOCKET_MAGIC		ISC_MAGIC('I', 'O', 'i', 'o')
252
#define VALID_SOCKET(s)		ISC_MAGIC_VALID(s, SOCKET_MAGIC)
Michael Graff's avatar
Michael Graff committed
253

254
/*!
Michael Graff's avatar
Michael Graff committed
255
256
257
258
259
260
261
262
 * IPv6 control information.  If the socket is an IPv6 socket we want
 * to collect the destination address and interface so the client can
 * set them on outgoing packets.
 */
#ifndef USE_CMSG
#define USE_CMSG	1
#endif

263
/*%
264
 * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
Michael Graff's avatar
Michael Graff committed
265
266
267
268
269
270
271
272
273
 * a setsockopt() like interface to request timestamps, and if the OS
 * doesn't do it for us, call gettimeofday() on every UDP receive?
 */
#ifdef SO_TIMESTAMP
#ifndef USE_CMSG
#define USE_CMSG	1
#endif
#endif

274
/*%
Francis Dupont's avatar
Francis Dupont committed
275
 * The size to raise the receive buffer to (from BIND 8).
276
 */
277
#ifdef TUNE_LARGE
278
279
#ifdef sun
#define RCVBUFSIZE (1*1024*1024)
Mukund Sivaraman's avatar
Mukund Sivaraman committed
280
#define SNDBUFSIZE (1*1024*1024)
281
#else
282
#define RCVBUFSIZE (16*1024*1024)
Mukund Sivaraman's avatar
Mukund Sivaraman committed
283
#define SNDBUFSIZE (16*1024*1024)
284
#endif
285
#else
286
#define RCVBUFSIZE (32*1024)
Mukund Sivaraman's avatar
Mukund Sivaraman committed
287
#define SNDBUFSIZE (32*1024)
288
#endif /* TUNE_LARGE */
289

290
291
292
293
294
295
/*%
 * Instead of calculating the cmsgbuf lengths every time we take
 * a rule of thumb approach - sizes are taken from x86_64 linux,
 * multiplied by 2, everything should fit. Those sizes are not
 * large enough to cause any concern.
 */
296
#if defined(USE_CMSG)
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
#define CMSG_SP_IN6PKT 40
#else
#define CMSG_SP_IN6PKT 0
#endif

#if defined(USE_CMSG) && defined(SO_TIMESTAMP)
#define CMSG_SP_TIMESTAMP 32
#else
#define CMSG_SP_TIMESTAMP 0
#endif

#if defined(USE_CMSG) && (defined(IPV6_TCLASS) || defined(IP_TOS))
#define CMSG_SP_TCTOS 24
#else
#define CMSG_SP_TCTOS 0
#endif

#define CMSG_SP_INT 24

316
/* Align cmsg buffers to be safe on SPARC etc. */
317
318
#define RECVCMSGBUFLEN ISC_ALIGN(2*(CMSG_SP_IN6PKT + CMSG_SP_TIMESTAMP + CMSG_SP_TCTOS)+1, sizeof(void*))
#define SENDCMSGBUFLEN ISC_ALIGN(2*(CMSG_SP_IN6PKT + CMSG_SP_INT + CMSG_SP_TCTOS)+1, sizeof(void*))
319

320
/*%
321
322
323
324
 * The number of times a send operation is repeated if the result is EINTR.
 */
#define NRETRIES 10

325
326
typedef struct isc__socket isc__socket_t;
typedef struct isc__socketmgr isc__socketmgr_t;
327
typedef struct isc__socketthread isc__socketthread_t;
328
329
330
331

#define NEWCONNSOCK(ev) ((isc__socket_t *)(ev)->newsocket)

struct isc__socket {
332
	/* Not locked. */
333
334
	isc_socket_t		common;
	isc__socketmgr_t	*manager;
335
336
	isc_mutex_t		lock;
	isc_sockettype_t	type;
337
	const isc_statscounter_t	*statsindex;
338
	isc_refcount_t		references;
Michael Graff's avatar
Michael Graff committed
339

340
	/* Locked by socket lock. */
341
	ISC_LINK(isc__socket_t)	link;
342
	int			fd;
343
	int			pf;
344
	int			threadid;
345
346
347
	char				name[16];
	void *				tag;

348
	ISC_LIST(isc_socketevent_t)		send_list;
349
	ISC_LIST(isc_socketevent_t)		recv_list;
350
	ISC_LIST(isc_socket_newconnev_t)	accept_list;
351
	ISC_LIST(isc_socket_connev_t)		connect_list;
352

353
	isc_sockaddr_t		peer_address;       /* remote address */
354

355
	unsigned int		listener : 1,       /* listener socket */
356
				connected : 1,
357
358
359
				connecting : 1,     /* connect pending */
				bound : 1,          /* bound to local addr */
				dupped : 1,
360
361
				active : 1,         /* currently active */
				pktdscp : 1;	    /* per packet dscp */
362

363
#ifdef ISC_PLATFORM_RECVOVERFLOW
364
	unsigned char		overflow; /* used for MSG_TRUNC fake */
365
#endif
366

367
	unsigned int		dscp;
368
369
};

370
371
372
#define SOCKET_MANAGER_MAGIC	ISC_MAGIC('I', 'O', 'm', 'g')
#define VALID_MANAGER(m)	ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)

373
struct isc__socketmgr {
374
	/* Not locked. */
375
	isc_socketmgr_t		common;
376
377
	isc_mem_t	       *mctx;
	isc_mutex_t		lock;
378
	isc_stats_t		*stats;
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
	int			nthreads;
	isc__socketthread_t	*threads;
	unsigned int		maxsocks;
	/* Locked by manager lock. */
	ISC_LIST(isc__socket_t)	socklist;
	int			reserved;	/* unlocked */
	isc_condition_t		shutdown_ok;
	int			maxudp;
};

struct isc__socketthread {
	isc__socketmgr_t *	manager;
	int			threadid;
	isc_thread_t		thread;
	int			pipe_fds[2];
	isc_mutex_t		*fdlock;
	/* Locked by fdlock. */
	isc__socket_t	       **fds;
	int			*fdstate;
398
399
400
401
402
403
404
405
406
#ifdef USE_KQUEUE
	int			kqueue_fd;
	int			nevents;
	struct kevent		*events;
#endif	/* USE_KQUEUE */
#ifdef USE_EPOLL
	int			epoll_fd;
	int			nevents;
	struct epoll_event	*events;
407
	uint32_t		*epoll_events;
408
409
410
#endif	/* USE_EPOLL */
#ifdef USE_DEVPOLL
	int			devpoll_fd;
411
412
	isc_resourcevalue_t	open_max;
	unsigned int		calls;
413
414
	int			nevents;
	struct pollfd		*events;
415
	pollinfo_t		*fdpollinfo;
416
#endif	/* USE_DEVPOLL */
417
418
419
420
421
422
#ifdef USE_SELECT
	int			fd_bufsize;
	fd_set			*read_fds;
	fd_set			*read_fds_copy;
	fd_set			*write_fds;
	fd_set			*write_fds_copy;
423
	int			maxfd;
424
#endif	/* USE_SELECT */
425
426
};

427

428
429
430
#define CLOSED			0	/* this one must be zero */
#define MANAGED			1
#define CLOSE_PENDING		2
Michael Graff's avatar
Michael Graff committed
431

432
433
434
435
/*
 * send() and recv() iovec counts
 */
#define MAXSCATTERGATHER_SEND	(ISC_SOCKET_MAXSCATTERGATHER)
436
#ifdef ISC_PLATFORM_RECVOVERFLOW
437
438
439
440
441
# define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER + 1)
#else
# define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER)
#endif

442
443
444
445
static isc_result_t socket_create(isc_socketmgr_t *manager0, int pf,
				  isc_sockettype_t type,
				  isc_socket_t **socketp,
				  isc_socket_t *dup_socket);
446
447
static void send_recvdone_event(isc__socket_t *, isc_socketevent_t **);
static void send_senddone_event(isc__socket_t *, isc_socketevent_t **);
448
static void send_connectdone_event(isc__socket_t *, isc_socket_connev_t **);
449
450
451
452
static void free_socket(isc__socket_t **);
static isc_result_t allocate_socket(isc__socketmgr_t *, isc_sockettype_t,
				    isc__socket_t **);
static void destroy(isc__socket_t **);
453
454
455
456
static void internal_accept(isc__socket_t *);
static void internal_connect(isc__socket_t *);
static void internal_recv(isc__socket_t *);
static void internal_send(isc__socket_t *);
457
static void process_cmsg(isc__socket_t *, struct msghdr *, isc_socketevent_t *);
Witold Krecicki's avatar
Witold Krecicki committed
458
static void build_msghdr_send(isc__socket_t *, char *, isc_socketevent_t *,
459
			      struct msghdr *, struct iovec *, size_t *);
Witold Krecicki's avatar
Witold Krecicki committed
460
static void build_msghdr_recv(isc__socket_t *, char *, isc_socketevent_t *,
461
			      struct msghdr *, struct iovec *, size_t *);
462
static bool process_ctlfd(isc__socketthread_t *thread);
Evan Hunt's avatar
Evan Hunt committed
463
static void setdscp(isc__socket_t *sock, isc_dscp_t dscp);
464

Michael Graff's avatar
Michael Graff committed
465
466
#define SELECT_POKE_SHUTDOWN		(-1)
#define SELECT_POKE_NOTHING		(-2)
467
#define SELECT_POKE_READ		(-3)
468
#define SELECT_POKE_ACCEPT		(-3) /*%< Same as _READ */
469
#define SELECT_POKE_WRITE		(-4)
470
#define SELECT_POKE_CONNECT		(-4) /*%< Same as _WRITE */
471
#define SELECT_POKE_CLOSE		(-5)
472

473
474
475
476
477
478
479
480
481
482
483
484
485
/*%
 * Slot numbers used to index the per-socket-type statistics arrays
 * defined below (udp4statsindex, tcp6statsindex, ...).  Enumerators
 * count up from zero, so STATID_ACTIVE is 10.
 */
enum {
	STATID_OPEN = 0,
	STATID_OPENFAIL,	/* 1 */
	STATID_CLOSE,		/* 2 */
	STATID_BINDFAIL,	/* 3 */
	STATID_CONNECTFAIL,	/* 4 */
	STATID_CONNECT,		/* 5 */
	STATID_ACCEPTFAIL,	/* 6 */
	STATID_ACCEPT,		/* 7 */
	STATID_SENDFAIL,	/* 8 */
	STATID_RECVFAIL,	/* 9 */
	STATID_ACTIVE		/* 10 */
};
Mark Andrews's avatar
Mark Andrews committed
489
static const isc_statscounter_t udp4statsindex[] = {
490
491
492
493
494
495
496
497
498
	isc_sockstatscounter_udp4open,
	isc_sockstatscounter_udp4openfail,
	isc_sockstatscounter_udp4close,
	isc_sockstatscounter_udp4bindfail,
	isc_sockstatscounter_udp4connectfail,
	isc_sockstatscounter_udp4connect,
	-1,
	-1,
	isc_sockstatscounter_udp4sendfail,
499
500
	isc_sockstatscounter_udp4recvfail,
	isc_sockstatscounter_udp4active
501
};
Mark Andrews's avatar
Mark Andrews committed
502
static const isc_statscounter_t udp6statsindex[] = {
503
504
505
506
507
508
509
510
511
	isc_sockstatscounter_udp6open,
	isc_sockstatscounter_udp6openfail,
	isc_sockstatscounter_udp6close,
	isc_sockstatscounter_udp6bindfail,
	isc_sockstatscounter_udp6connectfail,
	isc_sockstatscounter_udp6connect,
	-1,
	-1,
	isc_sockstatscounter_udp6sendfail,
512
513
	isc_sockstatscounter_udp6recvfail,
	isc_sockstatscounter_udp6active
514
515
516
517
518
519
520
521
522
523
524
};
static const isc_statscounter_t tcp4statsindex[] = {
	isc_sockstatscounter_tcp4open,
	isc_sockstatscounter_tcp4openfail,
	isc_sockstatscounter_tcp4close,
	isc_sockstatscounter_tcp4bindfail,
	isc_sockstatscounter_tcp4connectfail,
	isc_sockstatscounter_tcp4connect,
	isc_sockstatscounter_tcp4acceptfail,
	isc_sockstatscounter_tcp4accept,
	isc_sockstatscounter_tcp4sendfail,
525
526
	isc_sockstatscounter_tcp4recvfail,
	isc_sockstatscounter_tcp4active
527
528
529
530
531
532
533
534
535
536
537
};
static const isc_statscounter_t tcp6statsindex[] = {
	isc_sockstatscounter_tcp6open,
	isc_sockstatscounter_tcp6openfail,
	isc_sockstatscounter_tcp6close,
	isc_sockstatscounter_tcp6bindfail,
	isc_sockstatscounter_tcp6connectfail,
	isc_sockstatscounter_tcp6connect,
	isc_sockstatscounter_tcp6acceptfail,
	isc_sockstatscounter_tcp6accept,
	isc_sockstatscounter_tcp6sendfail,
538
539
	isc_sockstatscounter_tcp6recvfail,
	isc_sockstatscounter_tcp6active
540
541
542
543
544
545
546
547
548
549
550
};
static const isc_statscounter_t unixstatsindex[] = {
	isc_sockstatscounter_unixopen,
	isc_sockstatscounter_unixopenfail,
	isc_sockstatscounter_unixclose,
	isc_sockstatscounter_unixbindfail,
	isc_sockstatscounter_unixconnectfail,
	isc_sockstatscounter_unixconnect,
	isc_sockstatscounter_unixacceptfail,
	isc_sockstatscounter_unixaccept,
	isc_sockstatscounter_unixsendfail,
551
552
	isc_sockstatscounter_unixrecvfail,
	isc_sockstatscounter_unixactive
553
};
554
555
556
557
558
559
560
561
562
563
564
565
566
static const isc_statscounter_t rawstatsindex[] = {
	isc_sockstatscounter_rawopen,
	isc_sockstatscounter_rawopenfail,
	isc_sockstatscounter_rawclose,
	-1,
	-1,
	-1,
	-1,
	-1,
	-1,
	isc_sockstatscounter_rawrecvfail,
	isc_sockstatscounter_rawactive
};
567

568
569
570
571
572
573
574
575
/*%
 * Compute a thread id for 'sock': its file descriptor modulo the number
 * of threads recorded in its manager.
 */
static int
gen_threadid(isc__socket_t *sock);

static int
gen_threadid(isc__socket_t *sock) {
	const int nthreads = sock->manager->nthreads;

	return (sock->fd % nthreads);
}

576
static void
577
manager_log(isc__socketmgr_t *sockmgr,
578
579
	    isc_logcategory_t *category, isc_logmodule_t *module, int level,
	    const char *fmt, ...) ISC_FORMAT_PRINTF(5, 6);
Michael Graff's avatar
Michael Graff committed
580
static void
581
manager_log(isc__socketmgr_t *sockmgr,
Michael Graff's avatar
Michael Graff committed
582
583
584
585
586
587
	    isc_logcategory_t *category, isc_logmodule_t *module, int level,
	    const char *fmt, ...)
{
	char msgbuf[2048];
	va_list ap;

588
589
590
	if (! isc_log_wouldlog(isc_lctx, level))
		return;

Michael Graff's avatar
Michael Graff committed
591
592
593
594
595
596
597
598
	va_start(ap, fmt);
	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
	va_end(ap);

	isc_log_write(isc_lctx, category, module, level,
		      "sockmgr %p: %s", sockmgr, msgbuf);
}

599
600
601
602
603
604
/*%
 * Log a printf-style message on behalf of watcher thread 'thread'.
 * The text is formatted into a 2048-byte local buffer (longer output is
 * truncated by vsnprintf()) and written with a
 * "sockmgr <ptr> thread <id>: " prefix.  Nothing is formatted when
 * 'level' would not be logged.
 */
static void
thread_log(isc__socketthread_t *thread,
	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
	   const char *fmt, ...) ISC_FORMAT_PRINTF(5, 6);
static void
thread_log(isc__socketthread_t *thread,
	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
	   const char *fmt, ...)
{
	char text[2048];
	va_list args;

	if (isc_log_wouldlog(isc_lctx, level)) {
		va_start(args, fmt);
		vsnprintf(text, sizeof(text), fmt, args);
		va_end(args);

		isc_log_write(isc_lctx, category, module, level,
			      "sockmgr %p thread %d: %s",
			      thread->manager, thread->threadid, text);
	}
}

624
static void
625
socket_log(isc__socket_t *sock, const isc_sockaddr_t *address,
626
	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
Ondřej Surý's avatar
Ondřej Surý committed
627
	   const char *fmt, ...) ISC_FORMAT_PRINTF(6, 7);
Michael Graff's avatar
Michael Graff committed
628
static void
629
socket_log(isc__socket_t *sock, const isc_sockaddr_t *address,
Michael Graff's avatar
Michael Graff committed
630
631
632
633
	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
	   const char *fmt, ...)
{
	char msgbuf[2048];
634
	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
Michael Graff's avatar
Michael Graff committed
635
636
	va_list ap;

637
638
639
	if (! isc_log_wouldlog(isc_lctx, level))
		return;

Michael Graff's avatar
Michael Graff committed
640
641
642
643
644
	va_start(ap, fmt);
	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
	va_end(ap);

	if (address == NULL) {
Ondřej Surý's avatar
Ondřej Surý committed
645
646
		isc_log_write(isc_lctx, category, module, level,
			      "socket %p: %s", sock, msgbuf);
Michael Graff's avatar
Michael Graff committed
647
	} else {
Andreas Gustafsson's avatar
Andreas Gustafsson committed
648
		isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
Ondřej Surý's avatar
Ondřej Surý committed
649
650
		isc_log_write(isc_lctx, category, module, level,
			      "socket %p %s: %s", sock, peerbuf, msgbuf);
Michael Graff's avatar
Michael Graff committed
651
652
653
	}
}

654
655
656
657
658
659
660
661
662
663
664
/*%
 * Bump the socket statistics counter 'counterid' in 'stats'.
 *
 * 'counterid' must not be -1.  A NULL 'stats' makes this a no-op.
 */
static inline void
inc_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
	REQUIRE(counterid != -1);

	if (stats == NULL) {
		return;
	}

	isc_stats_increment(stats, counterid);
}

665
666
667
668
669
670
671
672
673
674
675
/*%
 * Lower the socket statistics counter 'counterid' in 'stats'.
 *
 * 'counterid' must not be -1.  A NULL 'stats' makes this a no-op.
 */
static inline void
dec_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
	REQUIRE(counterid != -1);

	if (stats == NULL) {
		return;
	}

	isc_stats_decrement(stats, counterid);
}

676
static inline isc_result_t
677
watch_fd(isc__socketthread_t *thread, int fd, int msg) {
678
679
680
681
682
683
	isc_result_t result = ISC_R_SUCCESS;

#ifdef USE_KQUEUE
	struct kevent evchange;

	memset(&evchange, 0, sizeof(evchange));
684
	if (msg == SELECT_POKE_READ) {
685
		evchange.filter = EVFILT_READ;
686
	} else {
687
		evchange.filter = EVFILT_WRITE;
688
	}
689
690
	evchange.flags = EV_ADD;
	evchange.ident = fd;
691
	if (kevent(thread->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0) {
692
		result = isc__errno2result(errno);
693
	}
694
695
696
697

	return (result);
#elif defined(USE_EPOLL)
	struct epoll_event event;
698
699
700
	uint32_t oldevents;
	int ret;
	int op;
701

702
	oldevents = thread->epoll_events[fd];
703
	if (msg == SELECT_POKE_READ) {
704
		thread->epoll_events[fd] |= EPOLLIN;
705
	} else {
706
		thread->epoll_events[fd] |= EPOLLOUT;
707
	}
708

709
	event.events = thread->epoll_events[fd];
710
	memset(&event.data, 0, sizeof(event.data));
711
	event.data.fd = fd;
712
713

	op = (oldevents == 0U) ? EPOLL_CTL_ADD : EPOLL_CTL_MOD;
714
	ret = epoll_ctl(thread->epoll_fd, op, fd, &event);
715
	if (ret == -1) {
716
		if (errno == EEXIST) {
717
718
719
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "epoll_ctl(ADD/MOD) returned "
					 "EEXIST for fd %d", fd);
720
		}
721
722
723
724
725
726
727
728
729
		result = isc__errno2result(errno);
	}

	return (result);
#elif defined(USE_DEVPOLL)
	struct pollfd pfd;
	int lockid = FDLOCK_ID(fd);

	memset(&pfd, 0, sizeof(pfd));
730
	if (msg == SELECT_POKE_READ) {
731
		pfd.events = POLLIN;
732
	} else {
733
		pfd.events = POLLOUT;
734
	}
735
736
	pfd.fd = fd;
	pfd.revents = 0;
737
	if (write(thread->devpoll_fd, &pfd, sizeof(pfd)) == -1) {
738
		result = isc__errno2result(errno);
739
740
	} else {
		if (msg == SELECT_POKE_READ) {
741
			thread->fdpollinfo[fd].want_read = 1;
742
		} else {
743
			thread->fdpollinfo[fd].want_write = 1;
744
		}
745
746
747
748
	}

	return (result);
#elif defined(USE_SELECT)
749
	LOCK(&thread->manager->lock);
750
	if (msg == SELECT_POKE_READ) {
751
		FD_SET(fd, thread->read_fds);
752
753
	}
	if (msg == SELECT_POKE_WRITE) {
754
		FD_SET(fd, thread->write_fds);
755
	}
756
	UNLOCK(&thread->manager->lock);
757
758
759
760
761
762

	return (result);
#endif
}

static inline isc_result_t
763
unwatch_fd(isc__socketthread_t *thread, int fd, int msg) {
764
765
766
767
768
769
	isc_result_t result = ISC_R_SUCCESS;

#ifdef USE_KQUEUE
	struct kevent evchange;

	memset(&evchange, 0, sizeof(evchange));
770
	if (msg == SELECT_POKE_READ) {
771
		evchange.filter = EVFILT_READ;
772
	} else {
773
		evchange.filter = EVFILT_WRITE;
774
	}
775
776
	evchange.flags = EV_DELETE;
	evchange.ident = fd;
777
	if (kevent(thread->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0) {
778
		result = isc__errno2result(errno);
779
	}
780
781
782
783

	return (result);
#elif defined(USE_EPOLL)
	struct epoll_event event;
784
785
	int ret;
	int op;
786

787
788
789
790
791
	if (msg == SELECT_POKE_READ) {
		thread->epoll_events[fd] &= ~(EPOLLIN);
	} else {
		thread->epoll_events[fd] &= ~(EPOLLOUT);
	}
792

793
	event.events = thread->epoll_events[fd];
794
	memset(&event.data, 0, sizeof(event.data));
795
	event.data.fd = fd;
796
797

	op = (event.events == 0U) ? EPOLL_CTL_DEL : EPOLL_CTL_MOD;
798
	ret = epoll_ctl(thread->epoll_fd, op, fd, &event);
799
	if (ret == -1 && errno != ENOENT) {
800
		char strbuf[ISC_STRERRORSIZE];
801
		strerror_r(errno, strbuf, sizeof(strbuf));
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "epoll_ctl(DEL), %d: %s", fd, strbuf);
		result = ISC_R_UNEXPECTED;
	}
	return (result);
#elif defined(USE_DEVPOLL)
	struct pollfd pfds[2];
	size_t writelen = sizeof(pfds[0]);
	int lockid = FDLOCK_ID(fd);

	memset(pfds, 0, sizeof(pfds));
	pfds[0].events = POLLREMOVE;
	pfds[0].fd = fd;

	/*
	 * Canceling read or write polling via /dev/poll is tricky.  Since it
	 * only provides a way of canceling per FD, we may need to re-poll the
	 * socket for the other operation.
	 */
	if (msg == SELECT_POKE_READ &&
822
823
	    thread->fdpollinfo[fd].want_write == 1)
	{
824
825
826
827
828
		pfds[1].events = POLLOUT;
		pfds[1].fd = fd;
		writelen += sizeof(pfds[1]);
	}
	if (msg == SELECT_POKE_WRITE &&
829
830
	    thread->fdpollinfo[fd].want_read == 1)
	{
831
832
833
834
835
		pfds[1].events = POLLIN;
		pfds[1].fd = fd;
		writelen += sizeof(pfds[1]);
	}

836
	if (write(thread->devpoll_fd, pfds, writelen) == -1) {
837
		result = isc__errno2result(errno);
838
839
	} else {
		if (msg == SELECT_POKE_READ) {
840
			thread->fdpollinfo[fd].want_read = 0;
841
		} else {
842
			thread->fdpollinfo[fd].want_write = 0;
843
		}
844
845
846
847
	}

	return (result);
#elif defined(USE_SELECT)
848
	LOCK(&thread->manager->lock);
849
	if (msg == SELECT_POKE_READ) {
850
		FD_CLR(fd, thread->read_fds);
851
	} else if (msg == SELECT_POKE_WRITE) {
852
		FD_CLR(fd, thread->write_fds);
853
	}
854
	UNLOCK(&thread->manager->lock);
855
856
857
858
859

	return (result);
#endif
}

Witold Krecicki's avatar
Witold Krecicki committed
860
861
862
863
/*
 * A poke message was received, perform a proper watch/unwatch
 * on a fd provided
 */
864
static void
865
wakeup_socket(isc__socketthread_t *thread, int fd, int msg) {
866
867
	isc_result_t result;
	int lockid = FDLOCK_ID(fd);
868
869

	/*
870
871
872
	 * This is a wakeup on a socket.  If the socket is not in the
	 * process of being closed, start watching it for either reads
	 * or writes.
873
	 */
Andreas Gustafsson's avatar
   
Andreas Gustafsson committed
874

875
	INSIST(fd >= 0 && fd < (int)thread->manager->maxsocks);
876

877
878
	if (msg == SELECT_POKE_CLOSE) {
		/* No one should be updating fdstate, so no need to lock it */
879
880
881
882
		INSIST(thread->fdstate[fd] == CLOSE_PENDING);
		thread->fdstate[fd] = CLOSED;
		(void)unwatch_fd(thread, fd, SELECT_POKE_READ);
		(void)unwatch_fd(thread, fd, SELECT_POKE_WRITE);
883
884
885
886
		(void)close(fd);
		return;
	}

887
888
889
	LOCK(&thread->fdlock[lockid]);
	if (thread->fdstate[fd] == CLOSE_PENDING) {
		UNLOCK(&thread->fdlock[lockid]);
890
891
892
893
894
895
896
897
898

		/*
		 * We accept (and ignore) any error from unwatch_fd() as we are
		 * closing the socket, hoping it doesn't leave dangling state in
		 * the kernel.
		 * Note that unwatch_fd() must be called after releasing the
		 * fdlock; otherwise it could cause deadlock due to a lock order
		 * reversal.
		 */
899
900
		(void)unwatch_fd(thread, fd, SELECT_POKE_READ);
		(void)unwatch_fd(thread, fd, SELECT_POKE_WRITE);
901
902
		return;
	}
903
904
	if (thread->fdstate[fd] != MANAGED) {
		UNLOCK(&thread->fdlock[lockid]);
905
		return;
906
	}
907
	UNLOCK(&thread->fdlock[lockid]);
908
909

	/*
Mark Andrews's avatar
Mark Andrews committed
910
	 * Set requested bit.
911
	 */
912
	result = watch_fd(thread, fd, msg);
913
914
915
916
917
918
919
920
921
922
923
	if (result != ISC_R_SUCCESS) {
		/*
		 * XXXJT: what should we do?  Ignoring the failure of watching
		 * a socket will make the application dysfunctional, but there
		 * seems to be no reasonable recovery process.
		 */
		isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
			      ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
			      "failed to start watching FD (%d): %s",
			      fd, isc_result_totext(result));
	}
924
925
}

926
/*
Michael Graff's avatar
Michael Graff committed
927
 * Poke the select loop when there is something for us to do.
928
929
 * The write is required (by POSIX) to complete.  That is, we
 * will not get partial writes.
930
931
 */
static void
932
select_poke(isc__socketmgr_t *mgr, int threadid, int fd, int msg) {
Michael Graff's avatar
Michael Graff committed
933
	int cc;
934
	int buf[2];
935
	char strbuf[ISC_STRERRORSIZE];
936
937
938

	buf[0] = fd;
	buf[1] = msg;
Michael Graff's avatar
Michael Graff committed
939

940
	do {
941
942
		cc = write(mgr->threads[threadid].pipe_fds[1],
			   buf, sizeof(buf));
943
944
945
946
947
948
949
950
951
952
#ifdef ENOSR
		/*
		 * Treat ENOSR as EAGAIN but loop slowly as it is
		 * unlikely to clear fast.
		 */
		if (cc < 0 && errno == ENOSR) {
			sleep(1);
			errno = EAGAIN;
		}
#endif
953
	} while (cc < 0 && SOFT_ERROR(errno));
954

955
	if (cc < 0) {
956
		strerror_r(errno, strbuf, sizeof(strbuf));
Michael Graff's avatar
Michael Graff committed
957
		FATAL_ERROR(__FILE__, __LINE__,
Ondřej Surý's avatar
Ondřej Surý committed
958
			    "write() failed during watcher poke: %s",
959
960
			    strbuf);
	}
961

962
	INSIST(cc == sizeof(buf));
963
964
965
}

/*
Andreas Gustafsson's avatar
Andreas Gustafsson committed
966
 * Read a message on the internal fd.
967
 */
968
static void
969
select_readmsg(isc__socketthread_t *thread, int *fd, int *msg) {
970
	int buf[2];
Michael Graff's avatar
Michael Graff committed
971
	int cc;
972
	char strbuf[ISC_STRERRORSIZE];
Michael Graff's avatar
Michael Graff committed
973

974
	cc = read(thread->pipe_fds[0], buf, sizeof(buf));
Michael Graff's avatar
Michael Graff committed
975
	if (cc < 0) {
976
		*msg = SELECT_POKE_NOTHING;
977
		*fd = -1;	/* Silence compiler. */
Michael Graff's avatar
Michael Graff committed
978
		if (SOFT_ERROR(errno))
979
			return;
Michael Graff's avatar
Michael Graff committed
980

981
		strerror_r(errno, strbuf, sizeof(strbuf));
Michael Graff's avatar
Michael Graff committed
982
		FATAL_ERROR(__FILE__, __LINE__,
Ondřej Surý's avatar
Ondřej Surý committed
983
			    "read() failed during watcher poke: %s",
984
			    strbuf);
Michael Graff's avatar
Michael Graff committed
985
	}
986
	INSIST(cc == sizeof(buf));
987

988
989
	*fd = buf[0];
	*msg = buf[1];
990
991
992
}

/*
Andreas Gustafsson's avatar
Andreas Gustafsson committed
993
 * Make a fd non-blocking.
994
 */
Michael Graff's avatar
Michael Graff committed
995
static isc_result_t
996
make_nonblock(int fd) {
Michael Graff's avatar
Michael Graff committed
997
	int ret;
998
	char strbuf[ISC_STRERRORSIZE];
999
1000
#ifdef USE_FIONBIO_IOCTL
	int on = 1;
1001
1002
1003
#else
	int flags;
#endif
1004

1005
#ifdef USE_FIONBIO_IOCTL
1006
1007
	ret = ioctl(fd, FIONBIO, (char *)&on);
#else
Michael Graff's avatar
Michael Graff committed
1008
	flags = fcntl(fd, F_GETFL, 0);
1009
	flags |= PORT_NONBLOCK;
Michael Graff's avatar
Michael Graff committed
1010
	ret = fcntl(fd, F_SETFL, flags);