/*
 * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * See the COPYRIGHT file distributed with this work for additional
 * information regarding copyright ownership.
 */

/*! \file */
David Lawrence's avatar
David Lawrence committed
13

14 15
#include <inttypes.h>
#include <stdbool.h>
16
#include <sys/param.h>
Michael Graff's avatar
Michael Graff committed
17
#include <sys/socket.h>
18
#include <sys/stat.h>
19
#include <sys/types.h>
20
#if defined(HAVE_SYS_SYSCTL_H) && !defined(__linux__)
21
#include <sys/sysctl.h>
22
#endif /* if defined(HAVE_SYS_SYSCTL_H) && !defined(__linux__) */
Michael Graff's avatar
Michael Graff committed
23
#include <sys/time.h>
Michael Graff's avatar
Michael Graff committed
24 25
#include <sys/uio.h>

Mark Andrews's avatar
Mark Andrews committed
26 27 28
#if defined(HAVE_LINUX_NETLINK_H) && defined(HAVE_LINUX_RTNETLINK_H)
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
29 30
#endif /* if defined(HAVE_LINUX_NETLINK_H) && defined(HAVE_LINUX_RTNETLINK_H) \
	*/
Mark Andrews's avatar
Mark Andrews committed
31

32
#include <errno.h>
Andreas Gustafsson's avatar
Andreas Gustafsson committed
33
#include <fcntl.h>
34 35
#include <stddef.h>
#include <stdlib.h>
36
#include <sys/un.h>
37 38
#include <unistd.h>

39
#include <isc/app.h>
40
#include <isc/buffer.h>
41
#include <isc/condition.h>
42
#include <isc/formatcheck.h>
43
#include <isc/list.h>
Michael Graff's avatar
Michael Graff committed
44
#include <isc/log.h>
45
#include <isc/mem.h>
46
#include <isc/mutex.h>
47
#include <isc/net.h>
48
#include <isc/once.h>
49
#include <isc/platform.h>
Michael Graff's avatar
Michael Graff committed
50
#include <isc/print.h>
51
#include <isc/refcount.h>
52
#include <isc/region.h>
53
#include <isc/resource.h>
54
#include <isc/socket.h>
55
#include <isc/stats.h>
56
#include <isc/strerr.h>
Evan Hunt's avatar
Evan Hunt committed
57
#include <isc/string.h>
58
#include <isc/task.h>
59
#include <isc/thread.h>
Michael Graff's avatar
Michael Graff committed
60
#include <isc/util.h>
Bob Halley's avatar
Bob Halley committed
61

62
#ifdef HAVE_KQUEUE
63
#include <sys/event.h>
64
#endif /* ifdef HAVE_KQUEUE */
65
#ifdef HAVE_EPOLL_CREATE1
66
#include <sys/epoll.h>
67
#endif /* ifdef HAVE_EPOLL_CREATE1 */
68
#if defined(HAVE_SYS_DEVPOLL_H)
69
#include <sys/devpoll.h>
70 71
#elif defined(HAVE_DEVPOLL_H)
#include <devpoll.h>
72
#endif /* if defined(HAVE_SYS_DEVPOLL_H) */
73

74 75
#include <netinet/tcp.h>

76 77
#include "errno2result.h"

78
#ifdef ENABLE_TCP_FASTOPEN
79
#include <netinet/tcp.h>
80
#endif /* ifdef ENABLE_TCP_FASTOPEN */
81

82 83 84 85
#ifdef HAVE_JSON_C
#include <json_object.h>
#endif /* HAVE_JSON_C */

86 87 88 89 90
#ifdef HAVE_LIBXML2
#include <libxml/xmlwriter.h>
#define ISC_XMLCHAR (const xmlChar *)
#endif /* HAVE_LIBXML2 */

91
/*%
 * Choose the most preferable multiplex method.
 *
 * Exactly one of USE_KQUEUE, USE_EPOLL, USE_DEVPOLL or USE_SELECT ends up
 * defined; they are tried in that order and select() is the fallback.
 */
#if defined(HAVE_KQUEUE)
#define USE_KQUEUE
#elif defined(HAVE_EPOLL_CREATE1)
#define USE_EPOLL
#elif defined(HAVE_SYS_DEVPOLL_H) || defined(HAVE_DEVPOLL_H)
#define USE_DEVPOLL
/*% Per-descriptor flags recording which directions /dev/poll is polling. */
typedef struct {
	unsigned int want_read : 1, want_write : 1;
} pollinfo_t;
#else /* if defined(HAVE_KQUEUE) */
#define USE_SELECT
#endif /* HAVE_KQUEUE */
106

Evan Hunt's avatar
Evan Hunt committed
107 108 109 110 111 112 113
/*
 * DSCP value that outgoing traffic is expected to carry; the initial
 * value of -1 means "no check".
 *
 * Set by the -T dscp option on the command line. If set to a value
 * other than -1, we check to make sure DSCP values match it, and
 * assert if not.
 */
int isc_dscp_check_value = -1;

114 115
/*%
 * Maximum number of allowable open sockets.  This is also the maximum
 * allowable socket file descriptor.
 *
 * Care should be taken before modifying this value for select():
 * The API standard doesn't ensure that select() accepts more than (the
 * system default of) FD_SETSIZE descriptors, and the default size should in
 * fact be fine in the vast majority of cases.  This constant should
 * therefore be increased only when absolutely necessary and possible, i.e.,
 * the server is exhausting all available file descriptors (up to
 * FD_SETSIZE) and the select() function and FD_xxx macros support larger
 * values than FD_SETSIZE (which may not always be true, but we keep using
 * some of them to ensure as much portability as possible).  Note also that
 * overall server performance may be rather worsened with a larger value of
 * this constant due to inherent scalability problems of select().
 *
 * As a special note, this value shouldn't have to be touched if
 * this is a build for an authoritative only DNS server.
 */
#ifndef ISC_SOCKET_MAXSOCKETS
#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
#ifdef TUNE_LARGE
#define ISC_SOCKET_MAXSOCKETS 21000
#else /* ifdef TUNE_LARGE */
#define ISC_SOCKET_MAXSOCKETS 4096
#endif /* TUNE_LARGE */
#elif defined(USE_SELECT)
#define ISC_SOCKET_MAXSOCKETS FD_SETSIZE
#endif /* USE_KQUEUE... */
#endif /* ISC_SOCKET_MAXSOCKETS */
144 145 146

#ifdef USE_SELECT
/*%
 * Mac OS X needs a special definition to support larger values in select().
 * We always define this because a larger value can be specified run-time.
 */
#ifdef __APPLE__
#define _DARWIN_UNLIMITED_SELECT
#endif /* __APPLE__ */
#endif /* USE_SELECT */
154

155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
#ifdef ISC_SOCKET_USE_POLLWATCH
/*%
 * If this macro is defined, enable workaround for a Solaris /dev/poll kernel
 * bug: DP_POLL ioctl could keep sleeping even if socket I/O is possible for
 * some of the specified FD.  The idea is based on the observation that it's
 * likely for a busy server to keep receiving packets.  It specifically works
 * as follows: the socket watcher is first initialized with the state of
 * "poll_idle".  While it's in the idle state it keeps sleeping until a socket
 * event occurs.  When it wakes up for a socket I/O event, it moves to the
 * poll_active state, and sets the poll timeout to a short period
 * (ISC_SOCKET_POLLWATCH_TIMEOUT msec).  If timeout occurs in this state, the
 * watcher goes to the poll_checking state with the same timeout period.
 * In this state, the watcher tries to detect whether this is a break
 * during intermittent events or the kernel bug is triggered.  If the next
 * polling reports an event within the short period, the previous timeout is
 * likely to be a kernel bug, and so the watcher goes back to the active state.
 * Otherwise, it moves to the idle state again.
 *
 * It's not clear whether this is a thread-related bug, but since we've only
 * seen this with threads, this workaround is used only when enabling threads.
 */

typedef enum { poll_idle, poll_active, poll_checking } pollstate_t;

#ifndef ISC_SOCKET_POLLWATCH_TIMEOUT
#define ISC_SOCKET_POLLWATCH_TIMEOUT 10
#endif /* ISC_SOCKET_POLLWATCH_TIMEOUT */
#endif /* ISC_SOCKET_USE_POLLWATCH */
183

184
/*%
 * Per-FD lock buckets, we shuffle them around a bit as FDs come in herds.
 *
 * FDLOCK_ID() takes the low FDLOCK_BITS bits of a (non-negative) descriptor
 * and swaps their upper and lower halves, so neighbouring descriptors
 * generally land in buckets 2^(FDLOCK_BITS / 2) apart.  Note that 'fd' is
 * evaluated twice.
 */
#define FDLOCK_BITS  10
#define FDLOCK_COUNT (1 << FDLOCK_BITS)
#define FDLOCK_ID(fd)                                     \
	((((fd) % (FDLOCK_COUNT)) >> (FDLOCK_BITS / 2)) | \
	 (((fd) << (FDLOCK_BITS / 2)) % (FDLOCK_COUNT)))
192 193 194 195 196 197 198

/*%
 * Maximum number of events communicated with the kernel.  There should normally
 * be no need for having a large number.
 */
#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
#ifndef ISC_SOCKET_MAXEVENTS
#ifdef TUNE_LARGE
#define ISC_SOCKET_MAXEVENTS 2048
#else /* ifdef TUNE_LARGE */
#define ISC_SOCKET_MAXEVENTS 64
#endif /* TUNE_LARGE */
#endif /* ifndef ISC_SOCKET_MAXEVENTS */
#endif /* if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) \
	* */
207

208
/*%
 * Some systems define the socket length argument as an int, some as size_t,
 * some as socklen_t.  This is here so it can be easily changed if needed.
 *
 * NOTE(review): #ifndef only detects a socklen_t *macro*; where the system
 * provides socklen_t as a typedef this definition still takes effect.
 */
#ifndef socklen_t
#define socklen_t unsigned int
#endif /* ifndef socklen_t */
215

216
/*%
 * Define what the possible "soft" errors can be.  These are non-fatal returns
 * of various network related functions, like recv() and so on.
 *
 * For some reason, BSDI (and perhaps others) will sometimes return <0
 * from recv() but will have errno==0.  This is broken, but we have to
 * work around it here.
 *
 * The argument is evaluated more than once; pass a plain value.
 */
#define SOFT_ERROR(e)                                             \
	((e) == EAGAIN || (e) == EWOULDBLOCK || (e) == ENOBUFS || \
	 (e) == EINTR || (e) == 0)
227

Michael Graff's avatar
Michael Graff committed
228
/*%
 * Expands to the (category, module, level) argument triple used by the
 * logging helpers in this file, at debug level 'x'.
 */
#define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)

/*!<
 * DLVL(90)  --  Function entry/exit and other tracing.
 * DLVL(70)  --  Socket "correctness" -- including returning of events, etc.
 * DLVL(60)  --  Socket data send/receive
 * DLVL(50)  --  Event tracing, including receiving/sending completion events.
 * DLVL(20)  --  Socket creation/destruction.
 */
#define TRACE_LEVEL	  90
#define CORRECTNESS_LEVEL 70
#define IOEVENT_LEVEL	  60
#define EVENT_LEVEL	  50
#define CREATION_LEVEL	  20

#define TRACE	    DLVL(TRACE_LEVEL)
#define CORRECTNESS DLVL(CORRECTNESS_LEVEL)
#define IOEVENT	    DLVL(IOEVENT_LEVEL)
#define EVENT	    DLVL(EVENT_LEVEL)
#define CREATION    DLVL(CREATION_LEVEL)
248

249
/*% Alias of isc_event_t (presumably for internal events, going by the name). */
typedef isc_event_t intev_t;
Michael Graff's avatar
Michael Graff committed
250

Evan Hunt's avatar
Evan Hunt committed
251
/*% Magic number ('IOio') and validity check for isc__socket_t objects. */
#define SOCKET_MAGIC	ISC_MAGIC('I', 'O', 'i', 'o')
#define VALID_SOCKET(s) ISC_MAGIC_VALID(s, SOCKET_MAGIC)
Michael Graff's avatar
Michael Graff committed
253

254
/*!
 * IPv6 control information.  If the socket is an IPv6 socket we want
 * to collect the destination address and interface so the client can
 * set them on outgoing packets.
 */
#ifndef USE_CMSG
#define USE_CMSG 1
#endif /* ifndef USE_CMSG */

/*%
 * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
 * a setsockopt() like interface to request timestamps, and if the OS
 * doesn't do it for us, call gettimeofday() on every UDP receive?
 *
 * USE_CMSG is already defined by the block above, so the inner #ifndef
 * here never fires; it is kept as a safeguard.
 */
#ifdef SO_TIMESTAMP
#ifndef USE_CMSG
#define USE_CMSG 1
#endif /* ifndef USE_CMSG */
#endif /* ifdef SO_TIMESTAMP */
Michael Graff's avatar
Michael Graff committed
273

274 275 276 277 278 279 280
/*
 * Buffer-size tuning is enabled per direction only when the platform has
 * the socket option and the build supplies a size for it.
 */
#ifdef SO_RCVBUF
#ifdef ISC_RECV_BUFFER_SIZE
#define SET_RCVBUF
#endif /* ISC_RECV_BUFFER_SIZE */
#endif /* SO_RCVBUF */

#ifdef SO_SNDBUF
#ifdef ISC_SEND_BUFFER_SIZE
#define SET_SNDBUF
#endif /* ISC_SEND_BUFFER_SIZE */
#endif /* SO_SNDBUF */
281

282 283 284 285 286 287
/*%
 * Instead of calculating the cmsgbuf lengths every time we take
 * a rule of thumb approach - sizes are taken from x86_64 linux,
 * multiplied by 2, everything should fit. Those sizes are not
 * large enough to cause any concern.
 */
#if defined(USE_CMSG)
#define CMSG_SP_IN6PKT 40
#else /* if defined(USE_CMSG) */
#define CMSG_SP_IN6PKT 0
#endif /* if defined(USE_CMSG) */

#if defined(USE_CMSG) && defined(SO_TIMESTAMP)
#define CMSG_SP_TIMESTAMP 32
#else /* if defined(USE_CMSG) && defined(SO_TIMESTAMP) */
#define CMSG_SP_TIMESTAMP 0
#endif /* if defined(USE_CMSG) && defined(SO_TIMESTAMP) */

#if defined(USE_CMSG) && (defined(IPV6_TCLASS) || defined(IP_TOS))
#define CMSG_SP_TCTOS 24
#else /* if defined(USE_CMSG) && (defined(IPV6_TCLASS) || defined(IP_TOS)) */
#define CMSG_SP_TCTOS 0
#endif /* if defined(USE_CMSG) && (defined(IPV6_TCLASS) || defined(IP_TOS)) */

#define CMSG_SP_INT 24

/* Align cmsg buffers to be safe on SPARC etc. */
#define RECVCMSGBUFLEN                                                       \
	ISC_ALIGN(2 * (CMSG_SP_IN6PKT + CMSG_SP_TIMESTAMP + CMSG_SP_TCTOS) + \
			  1,                                                 \
		  sizeof(void *))
#define SENDCMSGBUFLEN                                                    \
	ISC_ALIGN(2 * (CMSG_SP_IN6PKT + CMSG_SP_INT + CMSG_SP_TCTOS) + 1, \
		  sizeof(void *))
316

317
/*%
 * The number of times a send operation is repeated if the result is EINTR.
 */
#define NRETRIES 10

Evan Hunt's avatar
Evan Hunt committed
322 323
/* Forward typedefs for the implementation structures defined in this file. */
typedef struct isc__socket isc__socket_t;
typedef struct isc__socketmgr isc__socketmgr_t;
typedef struct isc__socketthread isc__socketthread_t;

/*% View the 'newsocket' member of event 'ev' as an isc__socket_t pointer. */
#define NEWCONNSOCK(ev) ((isc__socket_t *)(ev)->newsocket)

struct isc__socket {
329
	/* Not locked. */
Evan Hunt's avatar
Evan Hunt committed
330 331 332 333
	isc_socket_t common;
	isc__socketmgr_t *manager;
	isc_mutex_t lock;
	isc_sockettype_t type;
334
	const isc_statscounter_t *statsindex;
Evan Hunt's avatar
Evan Hunt committed
335
	isc_refcount_t references;
Michael Graff's avatar
Michael Graff committed
336

337
	/* Locked by socket lock. */
338
	ISC_LINK(isc__socket_t) link;
Evan Hunt's avatar
Evan Hunt committed
339 340 341 342
	int fd;
	int pf;
	int threadid;
	char name[16];
343 344 345 346 347 348 349 350 351 352
	void *tag;

	ISC_LIST(isc_socketevent_t) send_list;
	ISC_LIST(isc_socketevent_t) recv_list;
	ISC_LIST(isc_socket_newconnev_t) accept_list;
	ISC_LIST(isc_socket_connev_t) connect_list;

	isc_sockaddr_t peer_address; /* remote address */

	unsigned int listener : 1,	       /* listener socket */
353 354
		connected : 1, connecting : 1, /* connect pending
						* */
355 356 357
		bound : 1,		       /* bound to local addr */
		dupped : 1, active : 1,	       /* currently active */
		pktdscp : 1;		       /* per packet dscp */
358

359
#ifdef ISC_PLATFORM_RECVOVERFLOW
360
	unsigned char overflow; /* used for MSG_TRUNC fake */
361
#endif				/* ifdef ISC_PLATFORM_RECVOVERFLOW */
362

363
	unsigned int dscp;
364 365
};

366
/*% Magic number ('IOmg') and validity check for isc__socketmgr_t objects. */
#define SOCKET_MANAGER_MAGIC ISC_MAGIC('I', 'O', 'm', 'g')
#define VALID_MANAGER(m)     ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)
368

369
struct isc__socketmgr {
370
	/* Not locked. */
Evan Hunt's avatar
Evan Hunt committed
371 372 373 374 375
	isc_socketmgr_t common;
	isc_mem_t *mctx;
	isc_mutex_t lock;
	isc_stats_t *stats;
	int nthreads;
376
	isc__socketthread_t *threads;
Evan Hunt's avatar
Evan Hunt committed
377
	unsigned int maxsocks;
378
	/* Locked by manager lock. */
379
	ISC_LIST(isc__socket_t) socklist;
Evan Hunt's avatar
Evan Hunt committed
380
	int reserved; /* unlocked */
381
	isc_condition_t shutdown_ok;
Evan Hunt's avatar
Evan Hunt committed
382
	size_t maxudp;
383 384 385
};

struct isc__socketthread {
386
	isc__socketmgr_t *manager;
Evan Hunt's avatar
Evan Hunt committed
387 388 389 390
	int threadid;
	isc_thread_t thread;
	int pipe_fds[2];
	isc_mutex_t *fdlock;
391
	/* Locked by fdlock. */
392
	isc__socket_t **fds;
Evan Hunt's avatar
Evan Hunt committed
393
	int *fdstate;
394
#ifdef USE_KQUEUE
Evan Hunt's avatar
Evan Hunt committed
395 396
	int kqueue_fd;
	int nevents;
397 398
	struct kevent *events;
#endif /* USE_KQUEUE */
399
#ifdef USE_EPOLL
Evan Hunt's avatar
Evan Hunt committed
400 401
	int epoll_fd;
	int nevents;
402
	struct epoll_event *events;
Evan Hunt's avatar
Evan Hunt committed
403
	uint32_t *epoll_events;
404
#endif /* USE_EPOLL */
405
#ifdef USE_DEVPOLL
Evan Hunt's avatar
Evan Hunt committed
406
	int devpoll_fd;
407
	isc_resourcevalue_t open_max;
Evan Hunt's avatar
Evan Hunt committed
408 409 410 411
	unsigned int calls;
	int nevents;
	struct pollfd *events;
	pollinfo_t *fdpollinfo;
412
#endif /* USE_DEVPOLL */
413
#ifdef USE_SELECT
Evan Hunt's avatar
Evan Hunt committed
414
	int fd_bufsize;
415 416 417 418
	fd_set *read_fds;
	fd_set *read_fds_copy;
	fd_set *write_fds;
	fd_set *write_fds_copy;
Evan Hunt's avatar
Evan Hunt committed
419
	int maxfd;
420
#endif /* USE_SELECT */
421 422
};

Evan Hunt's avatar
Evan Hunt committed
423 424
/* Per-descriptor state values. */
#define CLOSED	      0 /* this one must be zero */
#define MANAGED	      1
#define CLOSE_PENDING 2
Michael Graff's avatar
Michael Graff committed
426

427 428 429
/*
 * send() and recv() iovec counts
 *
 * With ISC_PLATFORM_RECVOVERFLOW the receive side gets one extra iovec
 * slot.
 */
#define MAXSCATTERGATHER_SEND (ISC_SOCKET_MAXSCATTERGATHER)
#ifdef ISC_PLATFORM_RECVOVERFLOW
#define MAXSCATTERGATHER_RECV (ISC_SOCKET_MAXSCATTERGATHER + 1)
#else /* ifdef ISC_PLATFORM_RECVOVERFLOW */
#define MAXSCATTERGATHER_RECV (ISC_SOCKET_MAXSCATTERGATHER)
#endif /* ifdef ISC_PLATFORM_RECVOVERFLOW */
436

Ondřej Surý's avatar
Ondřej Surý committed
437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471
/*
 * Forward declarations of file-local (static) helpers, so they can be
 * referenced before their definitions.
 */
static isc_result_t
socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
	      isc_socket_t **socketp, isc_socket_t *dup_socket);
static void
send_recvdone_event(isc__socket_t *, isc_socketevent_t **);
static void
send_senddone_event(isc__socket_t *, isc_socketevent_t **);
static void
send_connectdone_event(isc__socket_t *, isc_socket_connev_t **);
static void
free_socket(isc__socket_t **);
static isc_result_t
allocate_socket(isc__socketmgr_t *, isc_sockettype_t, isc__socket_t **);
static void
destroy(isc__socket_t **);
static void
internal_accept(isc__socket_t *);
static void
internal_connect(isc__socket_t *);
static void
internal_recv(isc__socket_t *);
static void
internal_send(isc__socket_t *);
static void
process_cmsg(isc__socket_t *, struct msghdr *, isc_socketevent_t *);
static void
build_msghdr_send(isc__socket_t *, char *, isc_socketevent_t *, struct msghdr *,
		  struct iovec *, size_t *);
static void
build_msghdr_recv(isc__socket_t *, char *, isc_socketevent_t *, struct msghdr *,
		  struct iovec *, size_t *);
static bool
process_ctlfd(isc__socketthread_t *thread);
static void
setdscp(isc__socket_t *sock, isc_dscp_t dscp);
472 473

/*
 * Poke message codes.  ACCEPT shares its value with READ and CONNECT with
 * WRITE, so code comparing against _READ/_WRITE covers those cases too.
 */
#define SELECT_POKE_SHUTDOWN (-1)
#define SELECT_POKE_NOTHING  (-2)
#define SELECT_POKE_READ     (-3)
#define SELECT_POKE_ACCEPT   (-3) /*%< Same as _READ */
#define SELECT_POKE_WRITE    (-4)
#define SELECT_POKE_CONNECT  (-4) /*%< Same as _WRITE */
#define SELECT_POKE_CLOSE    (-5)
480

481 482 483
/*%
 * Shortcut index arrays to get access to statistics counters.
 *
 * These constants are the positions within the per-socket-type
 * '*statsindex' tables that follow.
 */
enum {
	STATID_OPEN = 0,
	STATID_OPENFAIL = 1,
	STATID_CLOSE = 2,
	STATID_BINDFAIL = 3,
	STATID_CONNECTFAIL = 4,
	STATID_CONNECT = 5,
	STATID_ACCEPTFAIL = 6,
	STATID_ACCEPT = 7,
	STATID_SENDFAIL = 8,
	STATID_RECVFAIL = 9,
	STATID_ACTIVE = 10
};
Mark Andrews's avatar
Mark Andrews committed
495
static const isc_statscounter_t udp4statsindex[] = {
496 497 498 499 500 501 502 503 504
	isc_sockstatscounter_udp4open,
	isc_sockstatscounter_udp4openfail,
	isc_sockstatscounter_udp4close,
	isc_sockstatscounter_udp4bindfail,
	isc_sockstatscounter_udp4connectfail,
	isc_sockstatscounter_udp4connect,
	-1,
	-1,
	isc_sockstatscounter_udp4sendfail,
505 506
	isc_sockstatscounter_udp4recvfail,
	isc_sockstatscounter_udp4active
507
};
Mark Andrews's avatar
Mark Andrews committed
508
static const isc_statscounter_t udp6statsindex[] = {
509 510 511 512 513 514 515 516 517
	isc_sockstatscounter_udp6open,
	isc_sockstatscounter_udp6openfail,
	isc_sockstatscounter_udp6close,
	isc_sockstatscounter_udp6bindfail,
	isc_sockstatscounter_udp6connectfail,
	isc_sockstatscounter_udp6connect,
	-1,
	-1,
	isc_sockstatscounter_udp6sendfail,
518 519
	isc_sockstatscounter_udp6recvfail,
	isc_sockstatscounter_udp6active
520 521
};
static const isc_statscounter_t tcp4statsindex[] = {
522 523 524 525 526
	isc_sockstatscounter_tcp4open,	      isc_sockstatscounter_tcp4openfail,
	isc_sockstatscounter_tcp4close,	      isc_sockstatscounter_tcp4bindfail,
	isc_sockstatscounter_tcp4connectfail, isc_sockstatscounter_tcp4connect,
	isc_sockstatscounter_tcp4acceptfail,  isc_sockstatscounter_tcp4accept,
	isc_sockstatscounter_tcp4sendfail,    isc_sockstatscounter_tcp4recvfail,
527
	isc_sockstatscounter_tcp4active
528 529
};
static const isc_statscounter_t tcp6statsindex[] = {
530 531 532 533 534
	isc_sockstatscounter_tcp6open,	      isc_sockstatscounter_tcp6openfail,
	isc_sockstatscounter_tcp6close,	      isc_sockstatscounter_tcp6bindfail,
	isc_sockstatscounter_tcp6connectfail, isc_sockstatscounter_tcp6connect,
	isc_sockstatscounter_tcp6acceptfail,  isc_sockstatscounter_tcp6accept,
	isc_sockstatscounter_tcp6sendfail,    isc_sockstatscounter_tcp6recvfail,
535
	isc_sockstatscounter_tcp6active
536 537
};
static const isc_statscounter_t unixstatsindex[] = {
538 539 540 541 542
	isc_sockstatscounter_unixopen,	      isc_sockstatscounter_unixopenfail,
	isc_sockstatscounter_unixclose,	      isc_sockstatscounter_unixbindfail,
	isc_sockstatscounter_unixconnectfail, isc_sockstatscounter_unixconnect,
	isc_sockstatscounter_unixacceptfail,  isc_sockstatscounter_unixaccept,
	isc_sockstatscounter_unixsendfail,    isc_sockstatscounter_unixrecvfail,
543
	isc_sockstatscounter_unixactive
544
};
545 546 547 548 549 550 551 552 553 554 555 556 557
static const isc_statscounter_t rawstatsindex[] = {
	isc_sockstatscounter_rawopen,
	isc_sockstatscounter_rawopenfail,
	isc_sockstatscounter_rawclose,
	-1,
	-1,
	-1,
	-1,
	-1,
	-1,
	isc_sockstatscounter_rawrecvfail,
	isc_sockstatscounter_rawactive
};
558

Ondřej Surý's avatar
Ondřej Surý committed
559 560
static int
gen_threadid(isc__socket_t *sock);

/*%
 * Map 'sock' to a thread index: its file descriptor modulo the manager's
 * thread count.  Reads sock->fd and sock->manager->nthreads; assumes the
 * descriptor is non-negative and nthreads is non-zero.
 */
static int
gen_threadid(isc__socket_t *sock) {
	return (sock->fd % sock->manager->nthreads);
}

Ondřej Surý's avatar
Ondřej Surý committed
567 568 569 570
static void
manager_log(isc__socketmgr_t *sockmgr, isc_logcategory_t *category,
	    isc_logmodule_t *module, int level, const char *fmt, ...)
	ISC_FORMAT_PRINTF(5, 6);

/*%
 * Log a printf-style message on behalf of socket manager 'sockmgr'.
 *
 * Nothing is formatted unless isc_log_wouldlog() says 'level' would be
 * logged.  The message is formatted into a 2048-byte buffer (longer
 * output is truncated by vsnprintf()) and written prefixed with the
 * manager's address.
 */
static void
manager_log(isc__socketmgr_t *sockmgr, isc_logcategory_t *category,
	    isc_logmodule_t *module, int level, const char *fmt, ...) {
	char msgbuf[2048];
	va_list ap;

	if (!isc_log_wouldlog(isc_lctx, level)) {
		return;
	}

	va_start(ap, fmt);
	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
	va_end(ap);

	isc_log_write(isc_lctx, category, module, level, "sockmgr %p: %s",
		      sockmgr, msgbuf);
}

Ondřej Surý's avatar
Ondřej Surý committed
589 590 591
static void
thread_log(isc__socketthread_t *thread, isc_logcategory_t *category,
	   isc_logmodule_t *module, int level, const char *fmt, ...)
	ISC_FORMAT_PRINTF(5, 6);

/*%
 * Log a printf-style message on behalf of watcher thread 'thread'.
 *
 * Nothing is formatted unless isc_log_wouldlog() says 'level' would be
 * logged.  The message is formatted into a 2048-byte buffer (longer
 * output is truncated by vsnprintf()) and written prefixed with the
 * owning manager's address and the thread id.
 */
static void
thread_log(isc__socketthread_t *thread, isc_logcategory_t *category,
	   isc_logmodule_t *module, int level, const char *fmt, ...) {
	char msgbuf[2048];
	va_list ap;

	if (!isc_log_wouldlog(isc_lctx, level)) {
		return;
	}

	va_start(ap, fmt);
	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
	va_end(ap);

	isc_log_write(isc_lctx, category, module, level,
		      "sockmgr %p thread %d: %s", thread->manager,
		      thread->threadid, msgbuf);
}

Ondřej Surý's avatar
Ondřej Surý committed
612 613 614 615
static void
socket_log(isc__socket_t *sock, const isc_sockaddr_t *address,
	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
	   const char *fmt, ...) ISC_FORMAT_PRINTF(6, 7);

/*%
 * Log a printf-style message on behalf of socket 'sock'.
 *
 * Nothing is formatted unless isc_log_wouldlog() says 'level' would be
 * logged.  The message is formatted into a 2048-byte buffer (longer
 * output is truncated by vsnprintf()).  If 'address' is non-NULL its
 * textual form is included between the socket pointer and the message.
 */
static void
socket_log(isc__socket_t *sock, const isc_sockaddr_t *address,
	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
	   const char *fmt, ...) {
	char msgbuf[2048];
	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
	va_list ap;

	if (!isc_log_wouldlog(isc_lctx, level)) {
		return;
	}

	va_start(ap, fmt);
	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
	va_end(ap);

	if (address == NULL) {
		isc_log_write(isc_lctx, category, module, level,
			      "socket %p: %s", sock, msgbuf);
	} else {
		isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
		isc_log_write(isc_lctx, category, module, level,
			      "socket %p %s: %s", sock, peerbuf, msgbuf);
	}
}

642 643 644 645
/*%
 * Increment socket-related statistics counters.
 */
static inline void
Evan Hunt's avatar
Evan Hunt committed
646
inc_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
647 648
	REQUIRE(counterid != -1);

649
	if (stats != NULL) {
650
		isc_stats_increment(stats, counterid);
651
	}
652 653
}

654 655 656 657
/*%
 * Decrement socket-related statistics counters.
 */
static inline void
Evan Hunt's avatar
Evan Hunt committed
658
dec_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
659 660
	REQUIRE(counterid != -1);

661
	if (stats != NULL) {
662
		isc_stats_decrement(stats, counterid);
663
	}
664 665
}

666
static inline isc_result_t
Evan Hunt's avatar
Evan Hunt committed
667
watch_fd(isc__socketthread_t *thread, int fd, int msg) {
668 669 670 671 672 673
	isc_result_t result = ISC_R_SUCCESS;

#ifdef USE_KQUEUE
	struct kevent evchange;

	memset(&evchange, 0, sizeof(evchange));
674
	if (msg == SELECT_POKE_READ) {
675
		evchange.filter = EVFILT_READ;
676
	} else {
677
		evchange.filter = EVFILT_WRITE;
678
	}
679 680
	evchange.flags = EV_ADD;
	evchange.ident = fd;
681
	if (kevent(thread->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0) {
682
		result = isc__errno2result(errno);
683
	}
684 685 686 687

	return (result);
#elif defined(USE_EPOLL)
	struct epoll_event event;
Evan Hunt's avatar
Evan Hunt committed
688 689 690
	uint32_t oldevents;
	int ret;
	int op;
691

692
	oldevents = thread->epoll_events[fd];
693
	if (msg == SELECT_POKE_READ) {
694
		thread->epoll_events[fd] |= EPOLLIN;
695
	} else {
696
		thread->epoll_events[fd] |= EPOLLOUT;
697
	}
698

699
	event.events = thread->epoll_events[fd];
700
	memset(&event.data, 0, sizeof(event.data));
701
	event.data.fd = fd;
702 703

	op = (oldevents == 0U) ? EPOLL_CTL_ADD : EPOLL_CTL_MOD;
704
	ret = epoll_ctl(thread->epoll_fd, op, fd, &event);
705
	if (ret == -1) {
706
		if (errno == EEXIST) {
707 708
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "epoll_ctl(ADD/MOD) returned "
709 710
					 "EEXIST for fd %d",
					 fd);
711
		}
712 713 714 715 716 717
		result = isc__errno2result(errno);
	}

	return (result);
#elif defined(USE_DEVPOLL)
	struct pollfd pfd;
Evan Hunt's avatar
Evan Hunt committed
718
	int lockid = FDLOCK_ID(fd);
719 720

	memset(&pfd, 0, sizeof(pfd));
721
	if (msg == SELECT_POKE_READ) {
722
		pfd.events = POLLIN;
723
	} else {
724
		pfd.events = POLLOUT;
725
	}
726 727
	pfd.fd = fd;
	pfd.revents = 0;
728
	if (write(thread->devpoll_fd, &pfd, sizeof(pfd)) == -1) {
729
		result = isc__errno2result(errno);
730 731
	} else {
		if (msg == SELECT_POKE_READ) {
732
			thread->fdpollinfo[fd].want_read = 1;
733
		} else {
734
			thread->fdpollinfo[fd].want_write = 1;
735
		}
736 737 738 739
	}

	return (result);
#elif defined(USE_SELECT)
740
	LOCK(&thread->manager->lock);
741
	if (msg == SELECT_POKE_READ) {
742
		FD_SET(fd, thread->read_fds);
743 744
	}
	if (msg == SELECT_POKE_WRITE) {
745
		FD_SET(fd, thread->write_fds);
746
	}
747
	UNLOCK(&thread->manager->lock);
748 749

	return (result);
750
#endif /* ifdef USE_KQUEUE */
751 752 753
}

static inline isc_result_t
Evan Hunt's avatar
Evan Hunt committed
754
unwatch_fd(isc__socketthread_t *thread, int fd, int msg) {
755 756 757 758 759 760
	isc_result_t result = ISC_R_SUCCESS;

#ifdef USE_KQUEUE
	struct kevent evchange;

	memset(&evchange, 0, sizeof(evchange));
761
	if (msg == SELECT_POKE_READ) {
762
		evchange.filter = EVFILT_READ;
763
	} else {
764
		evchange.filter = EVFILT_WRITE;
765
	}
766 767
	evchange.flags = EV_DELETE;
	evchange.ident = fd;
768
	if (kevent(thread->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0) {
769
		result = isc__errno2result(errno);
770
	}
771 772 773 774

	return (result);
#elif defined(USE_EPOLL)
	struct epoll_event event;
Evan Hunt's avatar
Evan Hunt committed
775 776
	int ret;
	int op;
777

778 779 780 781 782
	if (msg == SELECT_POKE_READ) {
		thread->epoll_events[fd] &= ~(EPOLLIN);
	} else {
		thread->epoll_events[fd] &= ~(EPOLLOUT);
	}
783

784
	event.events = thread->epoll_events[fd];
785
	memset(&event.data, 0, sizeof(event.data));
786
	event.data.fd = fd;
787 788

	op = (event.events == 0U) ? EPOLL_CTL_DEL : EPOLL_CTL_MOD;
789
	ret = epoll_ctl(thread->epoll_fd, op, fd, &event);
790
	if (ret == -1 && errno != ENOENT) {
791
		char strbuf[ISC_STRERRORSIZE];
792
		strerror_r(errno, strbuf, sizeof(strbuf));
793 794
		UNEXPECTED_ERROR(__FILE__, __LINE__, "epoll_ctl(DEL), %d: %s",
				 fd, strbuf);
795 796 797 798 799
		result = ISC_R_UNEXPECTED;
	}
	return (result);
#elif defined(USE_DEVPOLL)
	struct pollfd pfds[2];
Evan Hunt's avatar
Evan Hunt committed
800 801
	size_t writelen = sizeof(pfds[0]);
	int lockid = FDLOCK_ID(fd);
802 803 804 805 806 807 808 809 810 811

	memset(pfds, 0, sizeof(pfds));
	pfds[0].events = POLLREMOVE;
	pfds[0].fd = fd;

	/*
	 * Canceling read or write polling via /dev/poll is tricky.  Since it
	 * only provides a way of canceling per FD, we may need to re-poll the
	 * socket for the other operation.
	 */
812
	if (msg == SELECT_POKE_READ && thread->fdpollinfo[fd].want_write == 1) {
813 814 815 816
		pfds[1].events = POLLOUT;
		pfds[1].fd = fd;
		writelen += sizeof(pfds[1]);
	}
817
	if (msg == SELECT_POKE_WRITE && thread->fdpollinfo[fd].want_read == 1) {
818 819 820 821 822
		pfds[1].events = POLLIN;
		pfds[1].fd = fd;
		writelen += sizeof(pfds[1]);
	}

823
	if (write(thread->devpoll_fd, pfds, writelen) == -1) {
824
		result = isc__errno2result(errno);
825 826
	} else {
		if (msg == SELECT_POKE_READ) {
827
			thread->fdpollinfo[fd].want_read = 0;
828
		} else {
829
			thread->fdpollinfo[fd].want_write = 0;
830
		}
831 832 833 834
	}

	return (result);
#elif defined(USE_SELECT)
835
	LOCK(&thread->manager->lock);
836
	if (msg == SELECT_POKE_READ) {
837
		FD_CLR(fd, thread->read_fds);
838
	} else if (msg == SELECT_POKE_WRITE) {
839
		FD_CLR(fd, thread->write_fds);
840
	}
841
	UNLOCK(&thread->manager->lock);
842 843

	return (result);
844
#endif /* ifdef USE_KQUEUE */
845 846
}

Witold Krecicki's avatar
Witold Krecicki committed
847 848 849 850
/*
 * A poke message was received, perform a proper watch/unwatch
 * on a fd provided
 */
851
static void
Evan Hunt's avatar
Evan Hunt committed
852
wakeup_socket(isc__socketthread_t *thread, int fd, int msg) {
853
	isc_result_t result;
Evan Hunt's avatar
Evan Hunt committed
854
	int lockid = FDLOCK_ID(fd);
855 856

	/*
857 858 859
	 * This is a wakeup on a socket.  If the socket is not in the
	 * process of being closed, start watching it for either reads
	 * or writes.
860
	 */
Andreas Gustafsson's avatar