socket.c 164 KB
Newer Older
Bob Halley's avatar
Bob Halley committed
1
/*
2
 * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
3
 *
4 5 6
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 8 9
 *
 * See the COPYRIGHT file distributed with this work for additional
 * information regarding copyright ownership.
Bob Halley's avatar
Bob Halley committed
10
 */
Bob Halley's avatar
Bob Halley committed
11

12
/*! \file */
David Lawrence's avatar
David Lawrence committed
13

Bob Halley's avatar
Bob Halley committed
14
#include <config.h>
15

16 17 18
#include <inttypes.h>
#include <stdbool.h>

19
#include <sys/param.h>
Michael Graff's avatar
Michael Graff committed
20
#include <sys/types.h>
Michael Graff's avatar
Michael Graff committed
21
#include <sys/socket.h>
22
#include <sys/stat.h>
23 24 25
#ifdef HAVE_SYS_SYSCTL_H
#include <sys/sysctl.h>
#endif
Michael Graff's avatar
Michael Graff committed
26
#include <sys/time.h>
Michael Graff's avatar
Michael Graff committed
27 28
#include <sys/uio.h>

Mark Andrews's avatar
Mark Andrews committed
29 30 31 32 33
#if defined(HAVE_LINUX_NETLINK_H) && defined(HAVE_LINUX_RTNETLINK_H)
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#endif

34
#include <errno.h>
Andreas Gustafsson's avatar
Andreas Gustafsson committed
35
#include <fcntl.h>
36 37 38 39 40
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

41
#include <isc/buffer.h>
42
#include <isc/bufferlist.h>
43
#include <isc/condition.h>
44
#include <isc/formatcheck.h>
45
#include <isc/json.h>
46
#include <isc/list.h>
Michael Graff's avatar
Michael Graff committed
47
#include <isc/log.h>
48
#include <isc/mem.h>
49
#include <isc/msgs.h>
50
#include <isc/mutex.h>
51
#include <isc/net.h>
52
#include <isc/once.h>
53
#include <isc/platform.h>
Michael Graff's avatar
Michael Graff committed
54
#include <isc/print.h>
55
#include <isc/region.h>
56
#include <isc/resource.h>
57
#include <isc/socket.h>
58
#include <isc/stats.h>
Evan Hunt's avatar
Evan Hunt committed
59
#include <isc/string.h>
60
#include <isc/task.h>
61
#include <isc/thread.h>
Michael Graff's avatar
Michael Graff committed
62
#include <isc/util.h>
63
#include <isc/xml.h>
Bob Halley's avatar
Bob Halley committed
64

65 66 67 68 69 70 71 72 73 74
#ifdef ISC_PLATFORM_HAVESYSUNH
#include <sys/un.h>
#endif
#ifdef ISC_PLATFORM_HAVEKQUEUE
#include <sys/event.h>
#endif
#ifdef ISC_PLATFORM_HAVEEPOLL
#include <sys/epoll.h>
#endif
#ifdef ISC_PLATFORM_HAVEDEVPOLL
75
#if defined(HAVE_SYS_DEVPOLL_H)
76
#include <sys/devpoll.h>
77 78 79
#elif defined(HAVE_DEVPOLL_H)
#include <devpoll.h>
#endif
80 81
#endif

82 83
#include <netinet/tcp.h>

84 85
#include "errno2result.h"

86 87 88 89
#if defined(SO_BSDCOMPAT) && defined(__linux__)
#include <sys/utsname.h>
#endif

90 91 92 93
#ifdef ISC_PLATFORM_HAVETFO
#include <netinet/tcp.h>
#endif

94
/*%
Automatic Updater's avatar
Automatic Updater committed
95
 * Choose the most preferable multiplex method.
96 97 98 99 100 101 102 103 104 105 106 107 108 109 110
 */
#ifdef ISC_PLATFORM_HAVEKQUEUE
#define USE_KQUEUE
#elif defined (ISC_PLATFORM_HAVEEPOLL)
#define USE_EPOLL
#elif defined (ISC_PLATFORM_HAVEDEVPOLL)
#define USE_DEVPOLL
/*%
 * Per-descriptor interest flags for the /dev/poll backend.  The manager
 * keeps one of these per fd (its 'fdpollinfo' array); watch_fd() sets
 * want_read or want_write after successfully registering the descriptor.
 */
typedef struct {
	unsigned int want_read : 1,
		want_write : 1;
} pollinfo_t;
#else
#define USE_SELECT
#endif	/* ISC_PLATFORM_HAVEKQUEUE */

Evan Hunt's avatar
Evan Hunt committed
111 112 113 114 115 116 117
/*
 * Set by the -T dscp option on the command line. If set to a value
 * other than -1, we check to make sure DSCP values match it, and
 * assert if not.
 */
int isc_dscp_check_value = -1;

118 119
/*%
 * Maximum number of allowable open sockets.  This is also the maximum
120 121 122 123 124 125
 * allowable socket file descriptor.
 *
 * Care should be taken before modifying this value for select():
 * The API standard doesn't ensure select() accept more than (the system default
 * of) FD_SETSIZE descriptors, and the default size should in fact be fine in
 * the vast majority of cases.  This constant should therefore be increased only
Automatic Updater's avatar
Automatic Updater committed
126
 * when absolutely necessary and possible, i.e., the server is exhausting all
127 128 129 130 131 132 133 134 135
 * available file descriptors (up to FD_SETSIZE) and the select() function
 * and FD_xxx macros support larger values than FD_SETSIZE (which may not
 * always be true, but we keep using some of them to ensure as much
 * portability as possible).  Note also that overall server performance
 * may be rather worsened with a larger value of this constant due to
 * inherent scalability problems of select().
 *
 * As a special note, this value shouldn't have to be touched if
 * this is a build for an authoritative only DNS server.
136 137
 */
#ifndef ISC_SOCKET_MAXSOCKETS
138
#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
139 140 141
#ifdef TUNE_LARGE
#define ISC_SOCKET_MAXSOCKETS 21000
#else
142
#define ISC_SOCKET_MAXSOCKETS 4096
143
#endif /* TUNE_LARGE */
144 145 146 147 148 149 150
#elif defined(USE_SELECT)
#define ISC_SOCKET_MAXSOCKETS FD_SETSIZE
#endif	/* USE_KQUEUE... */
#endif	/* ISC_SOCKET_MAXSOCKETS */

#ifdef USE_SELECT
/*%
151 152
 * Mac OS X needs a special definition to support larger values in select().
 * We always define this because a larger value can be specified run-time.
153 154 155 156 157 158
 */
#ifdef __APPLE__
#define _DARWIN_UNLIMITED_SELECT
#endif	/* __APPLE__ */
#endif	/* USE_SELECT */

159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187
#ifdef ISC_SOCKET_USE_POLLWATCH
/*%
 * If this macro is defined, enable workaround for a Solaris /dev/poll kernel
 * bug: DP_POLL ioctl could keep sleeping even if socket I/O is possible for
 * some of the specified FD.  The idea is based on the observation that it's
 * likely for a busy server to keep receiving packets.  It specifically works
 * as follows: the socket watcher is first initialized with the state of
 * "poll_idle".  While it's in the idle state it keeps sleeping until a socket
 * event occurs.  When it wakes up for a socket I/O event, it moves to the
 * poll_active state, and sets the poll timeout to a short period
 * (ISC_SOCKET_POLLWATCH_TIMEOUT msec).  If timeout occurs in this state, the
 * watcher goes to the poll_checking state with the same timeout period.
 * In this state, the watcher tries to detect whether this is a break
 * during intermittent events or the kernel bug is triggered.  If the next
 * polling reports an event within the short period, the previous timeout is
 * likely to be a kernel bug, and so the watcher goes back to the active state.
 * Otherwise, it moves to the idle state again.
 *
 * It's not clear whether this is a thread-related bug, but since we've only
 * seen this with threads, this workaround is used only when enabling threads.
 */

typedef enum { poll_idle, poll_active, poll_checking } pollstate_t;

#ifndef ISC_SOCKET_POLLWATCH_TIMEOUT
#define ISC_SOCKET_POLLWATCH_TIMEOUT 10
#endif	/* ISC_SOCKET_POLLWATCH_TIMEOUT */
#endif	/* ISC_SOCKET_USE_POLLWATCH */

188 189 190 191 192 193 194 195 196 197 198 199
/*%
 * Size of per-FD lock buckets.
 */
#define FDLOCK_COUNT		1024
#define FDLOCK_ID(fd)		((fd) % FDLOCK_COUNT)

/*%
 * Maximum number of events communicated with the kernel.  There should normally
 * be no need for having a large number.
 */
#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
#ifndef ISC_SOCKET_MAXEVENTS
200 201 202
#ifdef TUNE_LARGE
#define ISC_SOCKET_MAXEVENTS	2048
#else
203
#define ISC_SOCKET_MAXEVENTS	64
204
#endif /* TUNE_LARGE */
205 206 207
#endif
#endif

208
/*%
209
 * Some systems define the socket length argument as an int, some as size_t,
210
 * some as socklen_t.  This is here so it can be easily changed if needed.
211
 */
212
#ifndef ISC_SOCKADDR_LEN_T
213
#define ISC_SOCKADDR_LEN_T unsigned int
214
#endif
215

216
/*%
217 218
 * Define what the possible "soft" errors can be.  These are non-fatal returns
 * of various network related functions, like recv() and so on.
219 220 221 222
 *
 * For some reason, BSDI (and perhaps others) will sometimes return <0
 * from recv() but will have errno==0.  This is broken, but we have to
 * work around it here.
223
 */
224 225
/*
 * True when 'e' is one of the non-fatal errno values (or 0).  The argument
 * is evaluated more than once, so pass a plain value such as errno.
 */
#define SOFT_ERROR(e)	((e) == EAGAIN || \
			 (e) == EWOULDBLOCK || \
			 (e) == ENOBUFS || \
			 (e) == EINTR || \
			 (e) == 0)
229

Michael Graff's avatar
Michael Graff committed
230
#define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)
231

232
/*!<
Michael Graff's avatar
Michael Graff committed
233 234 235 236 237 238
 * DLVL(90)  --  Function entry/exit and other tracing.
 * DLVL(70)  --  Socket "correctness" -- including returning of events, etc.
 * DLVL(60)  --  Socket data send/receive
 * DLVL(50)  --  Event tracing, including receiving/sending completion events.
 * DLVL(20)  --  Socket creation/destruction.
 */
239 240 241 242 243 244 245 246 247 248 249
#define TRACE_LEVEL		90
#define CORRECTNESS_LEVEL	70
#define IOEVENT_LEVEL		60
#define EVENT_LEVEL		50
#define CREATION_LEVEL		20

#define TRACE		DLVL(TRACE_LEVEL)
#define CORRECTNESS	DLVL(CORRECTNESS_LEVEL)
#define IOEVENT		DLVL(IOEVENT_LEVEL)
#define EVENT		DLVL(EVENT_LEVEL)
#define CREATION	DLVL(CREATION_LEVEL)
250

251
typedef isc_event_t intev_t;
Michael Graff's avatar
Michael Graff committed
252

253
#define SOCKET_MAGIC		ISC_MAGIC('I', 'O', 'i', 'o')
254
#define VALID_SOCKET(s)		ISC_MAGIC_VALID(s, SOCKET_MAGIC)
Michael Graff's avatar
Michael Graff committed
255

256
/*!
Michael Graff's avatar
Michael Graff committed
257 258 259 260 261 262 263 264
 * IPv6 control information.  If the socket is an IPv6 socket we want
 * to collect the destination address and interface so the client can
 * set them on outgoing packets.
 */
#ifndef USE_CMSG
#define USE_CMSG	1
#endif

265
/*%
266
 * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
Michael Graff's avatar
Michael Graff committed
267 268 269 270 271 272 273 274 275
 * a setsockopt() like interface to request timestamps, and if the OS
 * doesn't do it for us, call gettimeofday() on every UDP receive?
 */
#ifdef SO_TIMESTAMP
#ifndef USE_CMSG
#define USE_CMSG	1
#endif
#endif

276
/*%
Francis Dupont's avatar
Francis Dupont committed
277
 * The size to raise the receive buffer to (from BIND 8).
278
 */
279
/*
 * RCVBUFSIZE: 32 KiB by default; with TUNE_LARGE it is 16 MiB, except
 * when 'sun' is defined, where it is 1 MiB.
 */
#ifdef TUNE_LARGE
#ifdef sun
#define RCVBUFSIZE (1*1024*1024)
#else
#define RCVBUFSIZE (16*1024*1024)
#endif
#else
#define RCVBUFSIZE (32*1024)
#endif /* TUNE_LARGE */
288

289 290 291 292 293 294
/*%
 * Instead of calculating the cmsgbuf lengths every time we take
 * a rule of thumb approach - sizes are taken from x86_64 linux,
 * multiplied by 2, everything should fit. Those sizes are not
 * large enough to cause any concern.
 */
295
/* Space reserved for an IPv6 packet-info control message. */
#if defined(USE_CMSG)
#define CMSG_SP_IN6PKT 40
#else
#define CMSG_SP_IN6PKT 0
#endif

/* Space reserved for a timestamp control message. */
#if defined(USE_CMSG) && defined(SO_TIMESTAMP)
#define CMSG_SP_TIMESTAMP 32
#else
#define CMSG_SP_TIMESTAMP 0
#endif

/* Space reserved for a traffic-class / TOS control message. */
#if defined(USE_CMSG) && (defined(IPV6_TCLASS) || defined(IP_TOS))
#define CMSG_SP_TCTOS 24
#else
#define CMSG_SP_TCTOS 0
#endif

/* Space reserved for a control message carrying a single int. */
#define CMSG_SP_INT 24

/* Total control-buffer sizes: twice the summed estimates, plus one. */
#define RECVCMSGBUFLEN (2*(CMSG_SP_IN6PKT + CMSG_SP_TIMESTAMP + CMSG_SP_TCTOS)+1)
#define SENDCMSGBUFLEN (2*(CMSG_SP_IN6PKT + CMSG_SP_INT + CMSG_SP_TCTOS)+1)

318
/*%
319 320 321 322
 * The number of times a send operation is repeated if the result is EINTR.
 */
#define NRETRIES 10

323 324 325 326 327 328
typedef struct isc__socket isc__socket_t;
typedef struct isc__socketmgr isc__socketmgr_t;

#define NEWCONNSOCK(ev) ((isc__socket_t *)(ev)->newsocket)

struct isc__socket {
329
	/* Not locked. */
330 331
	isc_socket_t		common;
	isc__socketmgr_t	*manager;
332 333
	isc_mutex_t		lock;
	isc_sockettype_t	type;
334
	const isc_statscounter_t	*statsindex;
Michael Graff's avatar
Michael Graff committed
335

336
	/* Locked by socket lock. */
337
	ISC_LINK(isc__socket_t)	link;
338 339
	unsigned int		references;
	int			fd;
340
	int			pf;
341 342 343
	char				name[16];
	void *				tag;

344
	ISC_LIST(isc_socketevent_t)		send_list;
345
	ISC_LIST(isc_socketevent_t)		recv_list;
346
	ISC_LIST(isc_socket_newconnev_t)	accept_list;
347
	ISC_LIST(isc_socket_connev_t)		connect_list;
348 349 350 351 352 353

	/*
	 * Internal events.  Posted when a descriptor is readable or
	 * writable.  These are statically allocated and never freed.
	 * They will be set to non-purgable before use.
	 */
354 355
	intev_t			readable_ev;
	intev_t			writable_ev;
356

357
	isc_sockaddr_t		peer_address;       /* remote address */
358

359 360 361
	unsigned int		pending_recv : 1,
				pending_send : 1,
				pending_accept : 1,
362
				listener : 1,       /* listener socket */
363
				connected : 1,
364 365 366
				connecting : 1,     /* connect pending */
				bound : 1,          /* bound to local addr */
				dupped : 1,
367 368
				active : 1,         /* currently active */
				pktdscp : 1;	    /* per packet dscp */
369

370
#ifdef ISC_PLATFORM_RECVOVERFLOW
371
	unsigned char		overflow; /* used for MSG_TRUNC fake */
372
#endif
373

374 375 376 377
	void			*fdwatcharg;
	isc_sockfdwatch_t	fdwatchcb;
	int			fdwatchflags;
	isc_task_t		*fdwatchtask;
378
	unsigned int		dscp;
379 380
};

381 382 383
#define SOCKET_MANAGER_MAGIC	ISC_MAGIC('I', 'O', 'm', 'g')
#define VALID_MANAGER(m)	ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)

384
struct isc__socketmgr {
385
	/* Not locked. */
386
	isc_socketmgr_t		common;
387 388
	isc_mem_t	       *mctx;
	isc_mutex_t		lock;
389
	isc_mutex_t		*fdlock;
390
	isc_stats_t		*stats;
391 392 393 394 395 396 397 398 399 400 401 402
#ifdef USE_KQUEUE
	int			kqueue_fd;
	int			nevents;
	struct kevent		*events;
#endif	/* USE_KQUEUE */
#ifdef USE_EPOLL
	int			epoll_fd;
	int			nevents;
	struct epoll_event	*events;
#endif	/* USE_EPOLL */
#ifdef USE_DEVPOLL
	int			devpoll_fd;
403 404
	isc_resourcevalue_t	open_max;
	unsigned int		calls;
405 406 407
	int			nevents;
	struct pollfd		*events;
#endif	/* USE_DEVPOLL */
408 409 410
#ifdef USE_SELECT
	int			fd_bufsize;
#endif	/* USE_SELECT */
411 412 413 414
	unsigned int		maxsocks;
	int			pipe_fds[2];

	/* Locked by fdlock. */
415
	isc__socket_t	       **fds;
416
	int			*fdstate;
417 418 419
#if defined(USE_EPOLL)
	uint32_t		*epoll_events;
#endif
420 421 422 423
#ifdef USE_DEVPOLL
	pollinfo_t		*fdpollinfo;
#endif

424
	/* Locked by manager lock. */
425
	ISC_LIST(isc__socket_t)	socklist;
426
#ifdef USE_SELECT
427 428 429 430
	fd_set			*read_fds;
	fd_set			*read_fds_copy;
	fd_set			*write_fds;
	fd_set			*write_fds_copy;
431
	int			maxfd;
432
#endif	/* USE_SELECT */
433
	int			reserved;	/* unlocked */
434 435
	isc_thread_t		watcher;
	isc_condition_t		shutdown_ok;
436
	int			maxudp;
437 438
};

439 440 441
#define CLOSED			0	/* this one must be zero */
#define MANAGED			1
#define CLOSE_PENDING		2
Michael Graff's avatar
Michael Graff committed
442

443 444 445 446
/*
 * send() and recv() iovec counts
 */
#define MAXSCATTERGATHER_SEND	(ISC_SOCKET_MAXSCATTERGATHER)
447
#ifdef ISC_PLATFORM_RECVOVERFLOW
448 449 450 451 452
# define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER + 1)
#else
# define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER)
#endif

453 454 455 456
static isc_result_t socket_create(isc_socketmgr_t *manager0, int pf,
				  isc_sockettype_t type,
				  isc_socket_t **socketp,
				  isc_socket_t *dup_socket);
457 458
static void send_recvdone_event(isc__socket_t *, isc_socketevent_t **);
static void send_senddone_event(isc__socket_t *, isc_socketevent_t **);
459
static void send_connectdone_event(isc__socket_t *, isc_socket_connev_t **);
460 461 462 463
static void free_socket(isc__socket_t **);
static isc_result_t allocate_socket(isc__socketmgr_t *, isc_sockettype_t,
				    isc__socket_t **);
static void destroy(isc__socket_t **);
464 465 466 467
static void internal_accept(isc_task_t *, isc_event_t *);
static void internal_connect(isc_task_t *, isc_event_t *);
static void internal_recv(isc_task_t *, isc_event_t *);
static void internal_send(isc_task_t *, isc_event_t *);
468 469
static void internal_fdwatch_write(isc_task_t *, isc_event_t *);
static void internal_fdwatch_read(isc_task_t *, isc_event_t *);
470
static void process_cmsg(isc__socket_t *, struct msghdr *, isc_socketevent_t *);
Witold Krecicki's avatar
Witold Krecicki committed
471
static void build_msghdr_send(isc__socket_t *, char *, isc_socketevent_t *,
472
			      struct msghdr *, struct iovec *, size_t *);
Witold Krecicki's avatar
Witold Krecicki committed
473
static void build_msghdr_recv(isc__socket_t *, char *, isc_socketevent_t *,
474
			      struct msghdr *, struct iovec *, size_t *);
475
static bool process_ctlfd(isc__socketmgr_t *manager);
Evan Hunt's avatar
Evan Hunt committed
476
static void setdscp(isc__socket_t *sock, isc_dscp_t dscp);
477 478

/*%
479 480 481
 * The following are intended for internal use (indicated by "isc__"
 * prefix) but are not declared as static, allowing direct access from
 * unit tests etc.
482 483
 */

484 485 486 487 488
isc_result_t
isc__socket_open(isc_socket_t *sock0);
isc_result_t
isc__socket_close(isc_socket_t *sock0);
isc_result_t
489 490
isc__socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
		   isc_socket_t **socketp);
491
void
492
isc__socket_attach(isc_socket_t *sock, isc_socket_t **socketp);
493
void
494
isc__socket_detach(isc_socket_t **socketp);
495
isc_result_t
496 497
isc__socket_recvv(isc_socket_t *sock, isc_bufferlist_t *buflist,
		 unsigned int minimum, isc_task_t *task,
Mark Andrews's avatar
Mark Andrews committed
498
		  isc_taskaction_t action, void *arg);
499
isc_result_t
500 501
isc__socket_recv(isc_socket_t *sock, isc_region_t *region,
		 unsigned int minimum, isc_task_t *task,
Mark Andrews's avatar
Mark Andrews committed
502
		 isc_taskaction_t action, void *arg);
503
isc_result_t
504 505 506
isc__socket_recv2(isc_socket_t *sock, isc_region_t *region,
		  unsigned int minimum, isc_task_t *task,
		  isc_socketevent_t *event, unsigned int flags);
507
isc_result_t
508
isc__socket_send(isc_socket_t *sock, isc_region_t *region,
Mark Andrews's avatar
Mark Andrews committed
509
		 isc_task_t *task, isc_taskaction_t action, void *arg);
510
isc_result_t
511
isc__socket_sendto(isc_socket_t *sock, isc_region_t *region,
Mark Andrews's avatar
Mark Andrews committed
512
		   isc_task_t *task, isc_taskaction_t action, void *arg,
513
		   const isc_sockaddr_t *address, struct in6_pktinfo *pktinfo);
514
isc_result_t
515
isc__socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
Mark Andrews's avatar
Mark Andrews committed
516
		  isc_task_t *task, isc_taskaction_t action, void *arg);
517
isc_result_t
518
isc__socket_sendtov(isc_socket_t *sock, isc_bufferlist_t *buflist,
Mark Andrews's avatar
Mark Andrews committed
519
		    isc_task_t *task, isc_taskaction_t action, void *arg,
520
		    const isc_sockaddr_t *address, struct in6_pktinfo *pktinfo);
521
isc_result_t
522
isc__socket_sendtov2(isc_socket_t *sock, isc_bufferlist_t *buflist,
Mark Andrews's avatar
Mark Andrews committed
523
		     isc_task_t *task, isc_taskaction_t action, void *arg,
524
		     const isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
525 526
		     unsigned int flags);
isc_result_t
527 528
isc__socket_sendto2(isc_socket_t *sock, isc_region_t *region,
		    isc_task_t *task,
529
		    const isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
530
		    isc_socketevent_t *event, unsigned int flags);
Evan Hunt's avatar
Evan Hunt committed
531 532 533
isc_socketevent_t *
isc_socket_socketevent(isc_mem_t *mctx, void *sender,
		       isc_eventtype_t eventtype, isc_taskaction_t action,
Mark Andrews's avatar
Mark Andrews committed
534
		       void *arg);
Evan Hunt's avatar
Evan Hunt committed
535

536
void
537
isc__socket_cleanunix(const isc_sockaddr_t *sockaddr, bool active);
538
isc_result_t
539 540
isc__socket_permunix(const isc_sockaddr_t *sockaddr, uint32_t perm,
		     uint32_t owner, uint32_t group);
541
isc_result_t
542
isc__socket_bind(isc_socket_t *sock, const isc_sockaddr_t *sockaddr,
543
		 unsigned int options);
544
isc_result_t
545
isc__socket_filter(isc_socket_t *sock, const char *filter);
546
isc_result_t
547
isc__socket_listen(isc_socket_t *sock, unsigned int backlog);
548
isc_result_t
549
isc__socket_accept(isc_socket_t *sock,
Mark Andrews's avatar
Mark Andrews committed
550
		   isc_task_t *task, isc_taskaction_t action, void *arg);
551
isc_result_t
552
isc__socket_connect(isc_socket_t *sock, const isc_sockaddr_t *addr,
553
		    isc_task_t *task, isc_taskaction_t action,
Mark Andrews's avatar
Mark Andrews committed
554
		    void *arg);
555
isc_result_t
556
isc__socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp);
557
isc_result_t
558
isc__socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp);
559
void
560
isc__socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how);
561
isc_sockettype_t
562
isc__socket_gettype(isc_socket_t *sock);
563
bool
564
isc__socket_isbound(isc_socket_t *sock);
565
void
566
isc__socket_ipv6only(isc_socket_t *sock, bool yes);
567
void
Evan Hunt's avatar
Evan Hunt committed
568
isc__socket_dscp(isc_socket_t *sock, isc_dscp_t dscp);
569
isc_result_t
570 571 572
isc__socket_fdwatchcreate(isc_socketmgr_t *manager, int fd, int flags,
			  isc_sockfdwatch_t callback, void *cbarg,
			  isc_task_t *task, isc_socket_t **socketp);
573
isc_result_t
574
isc__socket_fdwatchpoke(isc_socket_t *sock, int flags);
575
isc_result_t
576
isc__socket_dup(isc_socket_t *sock, isc_socket_t **socketp);
577
int
Mark Andrews's avatar
Mark Andrews committed
578
isc__socket_getfd(isc_socket_t *sock);
579

580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606
isc_result_t
isc__socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp);
isc_result_t
isc__socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp,
		       unsigned int maxsocks);
isc_result_t
isc_socketmgr_getmaxsockets(isc_socketmgr_t *manager0, unsigned int *nsockp);
void
isc_socketmgr_setstats(isc_socketmgr_t *manager0, isc_stats_t *stats);
void
isc__socketmgr_destroy(isc_socketmgr_t **managerp);
void
isc__socket_setname(isc_socket_t *socket0, const char *name, void *tag);
const char *
isc__socket_getname(isc_socket_t *socket0);
void *
isc__socket_gettag(isc_socket_t *socket0);

#ifdef HAVE_LIBXML2
void
isc__socketmgr_renderxml(isc_socketmgr_t *mgr0, xmlTextWriterPtr writer);
#endif
#ifdef HAVE_JSON
isc_result_t
isc__socketmgr_renderjson(isc_socketmgr_t *mgr0, json_object *stats);
#endif

607 608 609 610 611 612 613
static struct {
	isc_socketmethods_t methods;

	/*%
	 * The following are defined just for avoiding unused static functions.
	 */
	void *recvv, *send, *sendv, *sendto2, *cleanunix, *permunix, *filter,
614
	     *listen, *accept, *getpeername, *isbound;
615 616 617 618 619 620
} socketmethods = {
	{
		isc__socket_attach,
		isc__socket_detach,
		isc__socket_bind,
		isc__socket_sendto,
621
		isc__socket_sendto2,
622 623
		isc__socket_connect,
		isc__socket_recv,
624
		isc__socket_recv2,
625 626 627
		isc__socket_cancel,
		isc__socket_getsockname,
		isc__socket_gettype,
628
		isc__socket_ipv6only,
629
		isc__socket_fdwatchpoke,
Mark Andrews's avatar
Mark Andrews committed
630
		isc__socket_dup,
Evan Hunt's avatar
Evan Hunt committed
631 632
		isc__socket_getfd,
		isc__socket_dscp
633
	},
634 635 636 637 638 639
	(void *)isc__socket_recvv, (void *)isc__socket_send,
	(void *)isc__socket_sendv, (void *)isc__socket_sendto2,
	(void *)isc__socket_cleanunix, (void *)isc__socket_permunix,
	(void *)isc__socket_filter, (void *)isc__socket_listen,
	(void *)isc__socket_accept, (void *)isc__socket_getpeername,
	(void *)isc__socket_isbound
640 641 642 643
};

static isc_socketmgrmethods_t socketmgrmethods = {
	isc__socketmgr_destroy,
644 645
	isc__socket_create,
	isc__socket_fdwatchcreate
646 647
};

Michael Graff's avatar
Michael Graff committed
648 649
/*
 * Message codes passed as the 'msg' argument of watch_fd()/unwatch_fd().
 * All are negative.  ACCEPT shares a value with READ, and CONNECT with
 * WRITE.
 */
#define SELECT_POKE_SHUTDOWN		(-1)
#define SELECT_POKE_NOTHING		(-2)
#define SELECT_POKE_READ		(-3)
#define SELECT_POKE_ACCEPT		(-3) /*%< Same as _READ */
#define SELECT_POKE_WRITE		(-4)
#define SELECT_POKE_CONNECT		(-4) /*%< Same as _WRITE */
#define SELECT_POKE_CLOSE		(-5)
655

656 657
#define SOCK_DEAD(s)			((s)->references == 0)

658 659 660 661 662 663 664 665 666 667 668 669 670
/*%
 * Shortcut index arrays to get access to statistics counters.
 */
/* Row positions within each per-socket-type *statsindex[] table. */
enum {
	STATID_OPEN = 0,
	STATID_OPENFAIL = 1,
	STATID_CLOSE = 2,
	STATID_BINDFAIL = 3,
	STATID_CONNECTFAIL = 4,
	STATID_CONNECT = 5,
	STATID_ACCEPTFAIL = 6,
	STATID_ACCEPT = 7,
	STATID_SENDFAIL = 8,
	STATID_RECVFAIL = 9,
	STATID_ACTIVE = 10
};
Mark Andrews's avatar
Mark Andrews committed
674
static const isc_statscounter_t udp4statsindex[] = {
675 676 677 678 679 680 681 682 683
	isc_sockstatscounter_udp4open,
	isc_sockstatscounter_udp4openfail,
	isc_sockstatscounter_udp4close,
	isc_sockstatscounter_udp4bindfail,
	isc_sockstatscounter_udp4connectfail,
	isc_sockstatscounter_udp4connect,
	-1,
	-1,
	isc_sockstatscounter_udp4sendfail,
684 685
	isc_sockstatscounter_udp4recvfail,
	isc_sockstatscounter_udp4active
686
};
Mark Andrews's avatar
Mark Andrews committed
687
static const isc_statscounter_t udp6statsindex[] = {
688 689 690 691 692 693 694 695 696
	isc_sockstatscounter_udp6open,
	isc_sockstatscounter_udp6openfail,
	isc_sockstatscounter_udp6close,
	isc_sockstatscounter_udp6bindfail,
	isc_sockstatscounter_udp6connectfail,
	isc_sockstatscounter_udp6connect,
	-1,
	-1,
	isc_sockstatscounter_udp6sendfail,
697 698
	isc_sockstatscounter_udp6recvfail,
	isc_sockstatscounter_udp6active
699 700 701 702 703 704 705 706 707 708 709
};
static const isc_statscounter_t tcp4statsindex[] = {
	isc_sockstatscounter_tcp4open,
	isc_sockstatscounter_tcp4openfail,
	isc_sockstatscounter_tcp4close,
	isc_sockstatscounter_tcp4bindfail,
	isc_sockstatscounter_tcp4connectfail,
	isc_sockstatscounter_tcp4connect,
	isc_sockstatscounter_tcp4acceptfail,
	isc_sockstatscounter_tcp4accept,
	isc_sockstatscounter_tcp4sendfail,
710 711
	isc_sockstatscounter_tcp4recvfail,
	isc_sockstatscounter_tcp4active
712 713 714 715 716 717 718 719 720 721 722
};
static const isc_statscounter_t tcp6statsindex[] = {
	isc_sockstatscounter_tcp6open,
	isc_sockstatscounter_tcp6openfail,
	isc_sockstatscounter_tcp6close,
	isc_sockstatscounter_tcp6bindfail,
	isc_sockstatscounter_tcp6connectfail,
	isc_sockstatscounter_tcp6connect,
	isc_sockstatscounter_tcp6acceptfail,
	isc_sockstatscounter_tcp6accept,
	isc_sockstatscounter_tcp6sendfail,
723 724
	isc_sockstatscounter_tcp6recvfail,
	isc_sockstatscounter_tcp6active
725 726 727 728 729 730 731 732 733 734 735
};
static const isc_statscounter_t unixstatsindex[] = {
	isc_sockstatscounter_unixopen,
	isc_sockstatscounter_unixopenfail,
	isc_sockstatscounter_unixclose,
	isc_sockstatscounter_unixbindfail,
	isc_sockstatscounter_unixconnectfail,
	isc_sockstatscounter_unixconnect,
	isc_sockstatscounter_unixacceptfail,
	isc_sockstatscounter_unixaccept,
	isc_sockstatscounter_unixsendfail,
736 737
	isc_sockstatscounter_unixrecvfail,
	isc_sockstatscounter_unixactive
738 739 740 741 742 743 744 745 746 747 748
};
static const isc_statscounter_t fdwatchstatsindex[] = {
	-1,
	-1,
	isc_sockstatscounter_fdwatchclose,
	isc_sockstatscounter_fdwatchbindfail,
	isc_sockstatscounter_fdwatchconnectfail,
	isc_sockstatscounter_fdwatchconnect,
	-1,
	-1,
	isc_sockstatscounter_fdwatchsendfail,
749 750
	isc_sockstatscounter_fdwatchrecvfail,
	-1
751
};
752 753 754 755 756 757 758 759 760 761 762 763 764
/* Raw socket counters in STATID_* order; -1 marks a slot with no counter. */
static const isc_statscounter_t rawstatsindex[] = {
	isc_sockstatscounter_rawopen,
	isc_sockstatscounter_rawopenfail,
	isc_sockstatscounter_rawclose,
	-1,	/* STATID_BINDFAIL */
	-1,	/* STATID_CONNECTFAIL */
	-1,	/* STATID_CONNECT */
	-1,	/* STATID_ACCEPTFAIL */
	-1,	/* STATID_ACCEPT */
	-1,	/* STATID_SENDFAIL */
	isc_sockstatscounter_rawrecvfail,
	isc_sockstatscounter_rawactive
};
765

766
static void
767
manager_log(isc__socketmgr_t *sockmgr,
768 769
	    isc_logcategory_t *category, isc_logmodule_t *module, int level,
	    const char *fmt, ...) ISC_FORMAT_PRINTF(5, 6);
Michael Graff's avatar
Michael Graff committed
770
static void
771
manager_log(isc__socketmgr_t *sockmgr,
Michael Graff's avatar
Michael Graff committed
772 773 774 775 776 777
	    isc_logcategory_t *category, isc_logmodule_t *module, int level,
	    const char *fmt, ...)
{
	char msgbuf[2048];
	va_list ap;

778 779 780
	if (! isc_log_wouldlog(isc_lctx, level))
		return;

Michael Graff's avatar
Michael Graff committed
781 782 783 784 785 786 787 788
	va_start(ap, fmt);
	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
	va_end(ap);

	isc_log_write(isc_lctx, category, module, level,
		      "sockmgr %p: %s", sockmgr, msgbuf);
}

789
static void
790
socket_log(isc__socket_t *sock, const isc_sockaddr_t *address,
791 792 793
	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
	   isc_msgcat_t *msgcat, int msgset, int message,
	   const char *fmt, ...) ISC_FORMAT_PRINTF(9, 10);
Michael Graff's avatar
Michael Graff committed
794
static void
795
socket_log(isc__socket_t *sock, const isc_sockaddr_t *address,
Michael Graff's avatar
Michael Graff committed
796
	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
797
	   isc_msgcat_t *msgcat, int msgset, int message,
Michael Graff's avatar
Michael Graff committed
798 799 800
	   const char *fmt, ...)
{
	char msgbuf[2048];
801
	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
Michael Graff's avatar
Michael Graff committed
802 803
	va_list ap;

804 805 806
	if (! isc_log_wouldlog(isc_lctx, level))
		return;

Michael Graff's avatar
Michael Graff committed
807 808 809 810 811
	va_start(ap, fmt);
	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
	va_end(ap);

	if (address == NULL) {
812 813 814
		isc_log_iwrite(isc_lctx, category, module, level,
			       msgcat, msgset, message,
			       "socket %p: %s", sock, msgbuf);
Michael Graff's avatar
Michael Graff committed
815
	} else {
Andreas Gustafsson's avatar
Andreas Gustafsson committed
816
		isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
817 818 819
		isc_log_iwrite(isc_lctx, category, module, level,
			       msgcat, msgset, message,
			       "socket %p %s: %s", sock, peerbuf, msgbuf);
Michael Graff's avatar
Michael Graff committed
820 821 822
	}
}

823 824 825 826 827 828 829 830 831 832 833
/*%
 * Bump the statistics counter 'counterid' by one.  'counterid' must not
 * be -1; a NULL 'stats' makes this a no-op.
 */
static inline void
inc_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
	REQUIRE(counterid != -1);

	if (stats == NULL) {
		return;
	}

	isc_stats_increment(stats, counterid);
}

834 835 836 837 838 839 840 841 842 843 844
/*%
 * Lower the statistics counter 'counterid' by one.  'counterid' must not
 * be -1; a NULL 'stats' makes this a no-op.
 */
static inline void
dec_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
	REQUIRE(counterid != -1);

	if (stats == NULL) {
		return;
	}

	isc_stats_decrement(stats, counterid);
}

845
static inline isc_result_t
846
watch_fd(isc__socketmgr_t *manager, int fd, int msg) {
847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864
	isc_result_t result = ISC_R_SUCCESS;

#ifdef USE_KQUEUE
	struct kevent evchange;

	memset(&evchange, 0, sizeof(evchange));
	if (msg == SELECT_POKE_READ)
		evchange.filter = EVFILT_READ;
	else
		evchange.filter = EVFILT_WRITE;
	evchange.flags = EV_ADD;
	evchange.ident = fd;
	if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
		result = isc__errno2result(errno);

	return (result);
#elif defined(USE_EPOLL)
	struct epoll_event event;
865 866 867
	uint32_t oldevents;
	int ret;
	int op;
868

869
	oldevents = manager->epoll_events[fd];
870
	if (msg == SELECT_POKE_READ)
871
		manager->epoll_events[fd] |= EPOLLIN;
872
	else
873 874 875
		manager->epoll_events[fd] |= EPOLLOUT;

	event.events = manager->epoll_events[fd];
876
	memset(&event.data, 0, sizeof(event.data));
877
	event.data.fd = fd;
878 879 880 881 882 883 884 885

	op = (oldevents == 0U) ? EPOLL_CTL_ADD : EPOLL_CTL_MOD;
	ret = epoll_ctl(manager->epoll_fd, op, fd, &event);
	if (ret == -1) {
		if (errno == EEXIST)
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "epoll_ctl(ADD/MOD) returned "
					 "EEXIST for fd %d", fd);
886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915
		result = isc__errno2result(errno);
	}

	return (result);
#elif defined(USE_DEVPOLL)
	struct pollfd pfd;
	int lockid = FDLOCK_ID(fd);

	memset(&pfd, 0, sizeof(pfd));
	if (msg == SELECT_POKE_READ)
		pfd.events = POLLIN;
	else
		pfd.events = POLLOUT;
	pfd.fd = fd;
	pfd.revents = 0;
	LOCK(&manager->fdlock[lockid]);
	if (write(manager->devpoll_fd, &pfd, sizeof(pfd)) == -1)
		result = isc__errno2result(errno);
	else {
		if (msg == SELECT_POKE_READ)
			manager->fdpollinfo[fd].want_read = 1;
		else
			manager->fdpollinfo[fd].want_write = 1;
	}
	UNLOCK(&manager->fdlock[lockid]);

	return (result);
#elif defined(USE_SELECT)
	LOCK(&manager->lock);
	if (msg == SELECT_POKE_READ)
916
		FD_SET(fd, manager->read_fds);
917
	if (msg == SELECT_POKE_WRITE)
918
		FD_SET(fd, manager->write_fds);
919 920 921 922 923 924 925
	UNLOCK(&manager->lock);

	return (result);
#endif
}

static inline isc_result_t
926
unwatch_fd(isc__socketmgr_t *manager, int fd, int msg) {
927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944
	isc_result_t result = ISC_R_SUCCESS;

#ifdef USE_KQUEUE
	struct kevent evchange;

	memset(&evchange, 0, sizeof(evchange));
	if (msg == SELECT_POKE_READ)
		evchange.filter = EVFILT_READ;
	else
		evchange.filter = EVFILT_WRITE;
	evchange.flags = EV_DELETE;
	evchange.ident = fd;
	if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
		result = isc__errno2result(errno);

	return (result);
#elif defined(USE_EPOLL)
	struct epoll_event event;
945 946
	int ret;
	int op;
947 948

	if (msg == SELECT_POKE_READ)
949
		manager->epoll_events[fd] &= ~(EPOLLIN);
950
	else
951 952 953
		manager->epoll_events[fd] &= ~(EPOLLOUT);

	event.events = manager->epoll_events[fd];
954
	memset(&event.data, 0, sizeof(event.data));
955
	event.data.fd = fd;
956 957 958 959

	op = (event.events == 0U) ? EPOLL_CTL_DEL : EPOLL_CTL_MOD;
	ret = epoll_ctl(manager->epoll_fd, op, fd, &event);
	if (ret == -1 && errno != ENOENT) {
960
		char strbuf[ISC_STRERRORSIZE];
961
		strerror_r(errno, strbuf, sizeof(strbuf));
962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "epoll_ctl(DEL), %d: %s", fd, strbuf);
		result = ISC_R_UNEXPECTED;
	}
	return (result);
#elif defined(USE_DEVPOLL)
	struct pollfd pfds[2];
	size_t writelen = sizeof(pfds[0]);
	int lockid = FDLOCK_ID(fd);

	memset(pfds, 0, sizeof(pfds));
	pfds[0].events = POLLREMOVE;
	pfds[0].fd = fd;

	/*
	 * Canceling read or write polling via /dev/poll is tricky.  Since it
	 * only provides a way of canceling per FD, we may need to re-poll the
	 * socket for the other operation.
	 */
	LOCK(&manager->fdlock[lockid]);
	if (msg == SELECT_POKE_READ &&
	    manager->fdpollinfo[fd].want_write == 1) {
		pfds[1].events = POLLOUT;
		pfds[1].fd = fd;
		writelen += sizeof(pfds[1]);
	}
	if (msg == SELECT_POKE_WRITE &&
	    manager->fdpollinfo[fd].want_read == 1) {
		pfds[1].events = POLLIN;
		pfds[1].fd = fd;
		writelen += sizeof(pfds[1]);
	}

Automatic Updater's avatar
Automatic Updater committed
995
	if (write(manager->devpoll_fd, pfds, writelen) == -1)
996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008
		result = isc__errno2result(errno);
	else {
		if (msg == SELECT_POKE_READ)
			manager->fdpollinfo[fd].want_read = 0;
		else
			manager->fdpollinfo[fd].want_write = 0;
	}
	UNLOCK(&manager->fdlock[lockid]);

	return (result);
#elif defined(USE_SELECT)
	LOCK(&manager->lock);
	if (msg == SELECT_POKE_READ)
1009
		FD_CLR(fd, manager->read_fds);
1010
	else if (msg == SELECT_POKE_WRITE)
1011
		FD_CLR(fd, manager->write_fds);
1012 1013 1014 1015 1016 1017
	UNLOCK(&manager->lock);

	return (result);
#endif
}

1018
static void
1019
wakeup_socket(isc__socketmgr_t *manager, int fd, int msg) {
1020 1021
	isc_result_t result;
	int lockid = FDLOCK_ID(fd);
1022 1023

	/*
1024 1025 1026
	 * This is a wakeup on a socket.  If the socket is not in the
	 * process of being closed, start watching it for either reads
	 * or writes.
1027
	 */
Andreas Gustafsson's avatar
 
Andreas Gustafsson committed
1028

Tatuya JINMEI 神明達哉's avatar