socket.c 174 KB
Newer Older
Bob Halley's avatar
Bob Halley committed
1
/*
2
 * Copyright (C) 1998-2016  Internet Systems Consortium, Inc. ("ISC")
3
 *
4 5 6
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
Bob Halley's avatar
Bob Halley committed
7
 */
Bob Halley's avatar
Bob Halley committed
8

9
/*! \file */
David Lawrence's avatar
David Lawrence committed
10

Bob Halley's avatar
Bob Halley committed
11
#include <config.h>
12

13
#include <sys/param.h>
Michael Graff's avatar
Michael Graff committed
14
#include <sys/types.h>
Michael Graff's avatar
Michael Graff committed
15
#include <sys/socket.h>
16
#include <sys/stat.h>
Michael Graff's avatar
Michael Graff committed
17
#include <sys/time.h>
Michael Graff's avatar
Michael Graff committed
18 19
#include <sys/uio.h>

Mark Andrews's avatar
Mark Andrews committed
20 21 22 23 24
#if defined(HAVE_LINUX_NETLINK_H) && defined(HAVE_LINUX_RTNETLINK_H)
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#endif

25
#include <errno.h>
Andreas Gustafsson's avatar
Andreas Gustafsson committed
26
#include <fcntl.h>
27 28 29 30
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
Mark Andrews's avatar
Mark Andrews committed
31 32 33
#ifdef HAVE_INTTYPES_H
#include <inttypes.h> /* uintptr_t */
#endif
34

35
#include <isc/buffer.h>
36
#include <isc/bufferlist.h>
37
#include <isc/condition.h>
38
#include <isc/formatcheck.h>
39
#include <isc/json.h>
40
#include <isc/list.h>
Michael Graff's avatar
Michael Graff committed
41
#include <isc/log.h>
42
#include <isc/mem.h>
43
#include <isc/msgs.h>
44
#include <isc/mutex.h>
45
#include <isc/net.h>
46
#include <isc/once.h>
47
#include <isc/platform.h>
Michael Graff's avatar
Michael Graff committed
48
#include <isc/print.h>
49
#include <isc/region.h>
50
#include <isc/resource.h>
51
#include <isc/socket.h>
52
#include <isc/stats.h>
53
#include <isc/strerror.h>
54
#include <isc/task.h>
55
#include <isc/thread.h>
Michael Graff's avatar
Michael Graff committed
56
#include <isc/util.h>
57
#include <isc/xml.h>
Bob Halley's avatar
Bob Halley committed
58

59 60 61 62 63 64 65 66 67 68
#ifdef ISC_PLATFORM_HAVESYSUNH
#include <sys/un.h>
#endif
#ifdef ISC_PLATFORM_HAVEKQUEUE
#include <sys/event.h>
#endif
#ifdef ISC_PLATFORM_HAVEEPOLL
#include <sys/epoll.h>
#endif
#ifdef ISC_PLATFORM_HAVEDEVPOLL
69
#if defined(HAVE_SYS_DEVPOLL_H)
70
#include <sys/devpoll.h>
71 72 73
#elif defined(HAVE_DEVPOLL_H)
#include <devpoll.h>
#endif
74 75
#endif

76 77
#include <netinet/tcp.h>

78 79
#include "errno2result.h"

80 81 82 83 84 85 86 87
/* See task.c about the following definition: */
#ifdef ISC_PLATFORM_USETHREADS
#define USE_WATCHER_THREAD
#else
#define USE_SHARED_MANAGER
#endif	/* ISC_PLATFORM_USETHREADS */

#ifndef USE_WATCHER_THREAD
88
#include "socket_p.h"
89
#include "../task_p.h"
90
#endif /* USE_WATCHER_THREAD */
91

92 93 94 95
#if defined(SO_BSDCOMPAT) && defined(__linux__)
#include <sys/utsname.h>
#endif

96 97 98 99
#ifdef ISC_PLATFORM_HAVETFO
#include <netinet/tcp.h>
#endif

100
/*%
Automatic Updater's avatar
Automatic Updater committed
101
 * Choose the most preferable multiplex method.
102 103 104 105 106 107 108 109 110 111 112 113 114 115 116
 */
#ifdef ISC_PLATFORM_HAVEKQUEUE
#define USE_KQUEUE
#elif defined (ISC_PLATFORM_HAVEEPOLL)
#define USE_EPOLL
#elif defined (ISC_PLATFORM_HAVEDEVPOLL)
#define USE_DEVPOLL
typedef struct {
	unsigned int want_read : 1,
		want_write : 1;
} pollinfo_t;
#else
#define USE_SELECT
#endif	/* ISC_PLATFORM_HAVEKQUEUE */

117
#ifndef USE_WATCHER_THREAD
118 119 120 121 122 123
#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
struct isc_socketwait {
	int nevents;
};
#elif defined (USE_SELECT)
struct isc_socketwait {
124 125
	fd_set *readset;
	fd_set *writeset;
126 127 128 129
	int nfds;
	int maxfd;
};
#endif	/* USE_KQUEUE */
130
#endif /* !USE_WATCHER_THREAD */
131

Evan Hunt's avatar
Evan Hunt committed
132 133 134 135 136 137 138
/*
 * Set by the -T dscp option on the command line. If set to a value
 * other than -1, we check to make sure DSCP values match it, and
 * assert if not.
 */
int isc_dscp_check_value = -1;

139 140
/*%
 * Maximum number of allowable open sockets.  This is also the maximum
141 142 143 144 145 146
 * allowable socket file descriptor.
 *
 * Care should be taken before modifying this value for select():
 * The API standard doesn't ensure that select() accepts more than (the system default
 * of) FD_SETSIZE descriptors, and the default size should in fact be fine in
 * the vast majority of cases.  This constant should therefore be increased only
Automatic Updater's avatar
Automatic Updater committed
147
 * when absolutely necessary and possible, i.e., the server is exhausting all
148 149 150 151 152 153 154 155 156
 * available file descriptors (up to FD_SETSIZE) and the select() function
 * and FD_xxx macros support larger values than FD_SETSIZE (which may not
 * always be true, but we keep using some of them to ensure as much
 * portability as possible).  Note also that overall server performance
 * may be rather worsened with a larger value of this constant due to
 * inherent scalability problems of select().
 *
 * As a special note, this value shouldn't have to be touched if
 * this is a build for an authoritative only DNS server.
157 158
 */
#ifndef ISC_SOCKET_MAXSOCKETS
159
#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
160 161 162
#ifdef TUNE_LARGE
#define ISC_SOCKET_MAXSOCKETS 21000
#else
163
#define ISC_SOCKET_MAXSOCKETS 4096
164
#endif /* TUNE_LARGE */
165 166 167 168 169 170 171
#elif defined(USE_SELECT)
#define ISC_SOCKET_MAXSOCKETS FD_SETSIZE
#endif	/* USE_KQUEUE... */
#endif	/* ISC_SOCKET_MAXSOCKETS */

#ifdef USE_SELECT
/*%
172 173
 * Mac OS X needs a special definition to support larger values in select().
 * We always define this because a larger value can be specified run-time.
174 175 176 177 178 179
 */
#ifdef __APPLE__
#define _DARWIN_UNLIMITED_SELECT
#endif	/* __APPLE__ */
#endif	/* USE_SELECT */

180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208
#ifdef ISC_SOCKET_USE_POLLWATCH
/*%
 * If this macro is defined, enable workaround for a Solaris /dev/poll kernel
 * bug: DP_POLL ioctl could keep sleeping even if socket I/O is possible for
 * some of the specified FD.  The idea is based on the observation that it's
 * likely for a busy server to keep receiving packets.  It specifically works
 * as follows: the socket watcher is first initialized with the state of
 * "poll_idle".  While it's in the idle state it keeps sleeping until a socket
 * event occurs.  When it wakes up for a socket I/O event, it moves to the
 * poll_active state, and sets the poll timeout to a short period
 * (ISC_SOCKET_POLLWATCH_TIMEOUT msec).  If timeout occurs in this state, the
 * watcher goes to the poll_checking state with the same timeout period.
 * In this state, the watcher tries to detect whether this is a break
 * during intermittent events or the kernel bug is triggered.  If the next
 * polling reports an event within the short period, the previous timeout is
 * likely to be a kernel bug, and so the watcher goes back to the active state.
 * Otherwise, it moves to the idle state again.
 *
 * It's not clear whether this is a thread-related bug, but since we've only
 * seen this with threads, this workaround is used only when enabling threads.
 */

typedef enum { poll_idle, poll_active, poll_checking } pollstate_t;

#ifndef ISC_SOCKET_POLLWATCH_TIMEOUT
#define ISC_SOCKET_POLLWATCH_TIMEOUT 10
#endif	/* ISC_SOCKET_POLLWATCH_TIMEOUT */
#endif	/* ISC_SOCKET_USE_POLLWATCH */

209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225
/*%
 * Size of per-FD lock buckets.
 */
#ifdef ISC_PLATFORM_USETHREADS
#define FDLOCK_COUNT		1024
#define FDLOCK_ID(fd)		((fd) % FDLOCK_COUNT)
#else
#define FDLOCK_COUNT		1
#define FDLOCK_ID(fd)		0
#endif	/* ISC_PLATFORM_USETHREADS */

/*%
 * Maximum number of events communicated with the kernel.  There should normally
 * be no need for having a large number.
 */
#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
#ifndef ISC_SOCKET_MAXEVENTS
226 227 228
#ifdef TUNE_LARGE
#define ISC_SOCKET_MAXEVENTS	2048
#else
229
#define ISC_SOCKET_MAXEVENTS	64
230
#endif /* TUNE_LARGE */
231 232 233
#endif
#endif

234
/*%
235
 * Some systems define the socket length argument as an int, some as size_t,
236
 * some as socklen_t.  This is here so it can be easily changed if needed.
237
 */
238
#ifndef ISC_SOCKADDR_LEN_T
239
#define ISC_SOCKADDR_LEN_T unsigned int
240
#endif
241

242
/*%
243 244
 * Define what the possible "soft" errors can be.  These are non-fatal returns
 * of various network related functions, like recv() and so on.
245 246 247 248
 *
 * For some reason, BSDI (and perhaps others) will sometimes return <0
 * from recv() but will have errno==0.  This is broken, but we have to
 * work around it here.
249
 */
250 251 252 253
#define SOFT_ERROR(e)	((e) == EAGAIN || \
			 (e) == EWOULDBLOCK || \
			 (e) == EINTR || \
			 (e) == 0)
254

Michael Graff's avatar
Michael Graff committed
255
#define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)
256

257
/*!<
Michael Graff's avatar
Michael Graff committed
258 259 260 261 262 263
 * DLVL(90)  --  Function entry/exit and other tracing.
 * DLVL(70)  --  Socket "correctness" -- including returning of events, etc.
 * DLVL(60)  --  Socket data send/receive
 * DLVL(50)  --  Event tracing, including receiving/sending completion events.
 * DLVL(20)  --  Socket creation/destruction.
 */
264 265 266 267 268 269 270 271 272 273 274
#define TRACE_LEVEL		90
#define CORRECTNESS_LEVEL	70
#define IOEVENT_LEVEL		60
#define EVENT_LEVEL		50
#define CREATION_LEVEL		20

#define TRACE		DLVL(TRACE_LEVEL)
#define CORRECTNESS	DLVL(CORRECTNESS_LEVEL)
#define IOEVENT		DLVL(IOEVENT_LEVEL)
#define EVENT		DLVL(EVENT_LEVEL)
#define CREATION	DLVL(CREATION_LEVEL)
275

276
typedef isc_event_t intev_t;
Michael Graff's avatar
Michael Graff committed
277

278
#define SOCKET_MAGIC		ISC_MAGIC('I', 'O', 'i', 'o')
279
#define VALID_SOCKET(s)		ISC_MAGIC_VALID(s, SOCKET_MAGIC)
Michael Graff's avatar
Michael Graff committed
280

281
/*!
Michael Graff's avatar
Michael Graff committed
282 283 284 285
 * IPv6 control information.  If the socket is an IPv6 socket we want
 * to collect the destination address and interface so the client can
 * set them on outgoing packets.
 */
286
#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
Michael Graff's avatar
Michael Graff committed
287 288 289 290 291
#ifndef USE_CMSG
#define USE_CMSG	1
#endif
#endif

292
/*%
293
 * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
Michael Graff's avatar
Michael Graff committed
294 295 296 297 298 299 300 301 302
 * a setsockopt() like interface to request timestamps, and if the OS
 * doesn't do it for us, call gettimeofday() on every UDP receive?
 */
#ifdef SO_TIMESTAMP
#ifndef USE_CMSG
#define USE_CMSG	1
#endif
#endif

303
/*%
Francis Dupont's avatar
Francis Dupont committed
304
 * The size to raise the receive buffer to (from BIND 8).
305
 */
306
#ifdef TUNE_LARGE
307 308 309
#ifdef sun
#define RCVBUFSIZE (1*1024*1024)
#else
310
#define RCVBUFSIZE (16*1024*1024)
311
#endif
312
#else
313
#define RCVBUFSIZE (32*1024)
314
#endif /* TUNE_LARGE */
315

316
/*%
317 318 319 320
 * The number of times a send operation is repeated if the result is EINTR.
 */
#define NRETRIES 10

321 322 323 324 325 326
typedef struct isc__socket isc__socket_t;
typedef struct isc__socketmgr isc__socketmgr_t;

#define NEWCONNSOCK(ev) ((isc__socket_t *)(ev)->newsocket)

struct isc__socket {
327
	/* Not locked. */
328 329
	isc_socket_t		common;
	isc__socketmgr_t	*manager;
330 331
	isc_mutex_t		lock;
	isc_sockettype_t	type;
332
	const isc_statscounter_t	*statsindex;
Michael Graff's avatar
Michael Graff committed
333

334
	/* Locked by socket lock. */
335
	ISC_LINK(isc__socket_t)	link;
336 337
	unsigned int		references;
	int			fd;
338
	int			pf;
339 340 341
	char				name[16];
	void *				tag;

342
	ISC_LIST(isc_socketevent_t)		send_list;
343
	ISC_LIST(isc_socketevent_t)		recv_list;
344
	ISC_LIST(isc_socket_newconnev_t)	accept_list;
345
	ISC_LIST(isc_socket_connev_t)		connect_list;
346 347 348 349 350 351

	/*
	 * Internal events.  Posted when a descriptor is readable or
	 * writable.  These are statically allocated and never freed.
	 * They will be set to non-purgable before use.
	 */
352 353
	intev_t			readable_ev;
	intev_t			writable_ev;
354

355
	isc_sockaddr_t		peer_address;       /* remote address */
356

357 358 359
	unsigned int		pending_recv : 1,
				pending_send : 1,
				pending_accept : 1,
360
				listener : 1,       /* listener socket */
361
				connected : 1,
362 363 364
				connecting : 1,     /* connect pending */
				bound : 1,          /* bound to local addr */
				dupped : 1,
365 366
				active : 1,         /* currently active */
				pktdscp : 1;	    /* per packet dscp */
367

368
#ifdef ISC_NET_RECVOVERFLOW
369
	unsigned char		overflow; /* used for MSG_TRUNC fake */
370
#endif
371 372 373 374 375

	char			*recvcmsgbuf;
	ISC_SOCKADDR_LEN_T	recvcmsgbuflen;
	char			*sendcmsgbuf;
	ISC_SOCKADDR_LEN_T	sendcmsgbuflen;
376 377 378 379 380

	void			*fdwatcharg;
	isc_sockfdwatch_t	fdwatchcb;
	int			fdwatchflags;
	isc_task_t		*fdwatchtask;
381
	unsigned int		dscp;
382 383
};

384 385 386
#define SOCKET_MANAGER_MAGIC	ISC_MAGIC('I', 'O', 'm', 'g')
#define VALID_MANAGER(m)	ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)

387
struct isc__socketmgr {
388
	/* Not locked. */
389
	isc_socketmgr_t		common;
390 391
	isc_mem_t	       *mctx;
	isc_mutex_t		lock;
392
	isc_mutex_t		*fdlock;
393
	isc_stats_t		*stats;
394 395 396 397 398 399 400 401 402 403 404 405
#ifdef USE_KQUEUE
	int			kqueue_fd;
	int			nevents;
	struct kevent		*events;
#endif	/* USE_KQUEUE */
#ifdef USE_EPOLL
	int			epoll_fd;
	int			nevents;
	struct epoll_event	*events;
#endif	/* USE_EPOLL */
#ifdef USE_DEVPOLL
	int			devpoll_fd;
406 407
	isc_resourcevalue_t	open_max;
	unsigned int		calls;
408 409 410
	int			nevents;
	struct pollfd		*events;
#endif	/* USE_DEVPOLL */
411 412 413
#ifdef USE_SELECT
	int			fd_bufsize;
#endif	/* USE_SELECT */
414 415 416 417 418 419
	unsigned int		maxsocks;
#ifdef ISC_PLATFORM_USETHREADS
	int			pipe_fds[2];
#endif

	/* Locked by fdlock. */
420
	isc__socket_t	       **fds;
421
	int			*fdstate;
422 423 424
#if defined(USE_EPOLL)
	uint32_t		*epoll_events;
#endif
425 426 427 428
#ifdef USE_DEVPOLL
	pollinfo_t		*fdpollinfo;
#endif

429
	/* Locked by manager lock. */
430
	ISC_LIST(isc__socket_t)	socklist;
431
#ifdef USE_SELECT
432 433 434 435
	fd_set			*read_fds;
	fd_set			*read_fds_copy;
	fd_set			*write_fds;
	fd_set			*write_fds_copy;
436
	int			maxfd;
437
#endif	/* USE_SELECT */
438
	int			reserved;	/* unlocked */
439
#ifdef USE_WATCHER_THREAD
440 441
	isc_thread_t		watcher;
	isc_condition_t		shutdown_ok;
442
#else /* USE_WATCHER_THREAD */
443
	unsigned int		refs;
444
#endif /* USE_WATCHER_THREAD */
445
	int			maxudp;
446 447
};

448 449 450
#ifdef USE_SHARED_MANAGER
static isc__socketmgr_t *socketmgr = NULL;
#endif /* USE_SHARED_MANAGER */
451

452 453 454
#define CLOSED			0	/* this one must be zero */
#define MANAGED			1
#define CLOSE_PENDING		2
Michael Graff's avatar
Michael Graff committed
455

456 457 458 459 460 461 462 463 464 465
/*
 * send() and recv() iovec counts
 */
#define MAXSCATTERGATHER_SEND	(ISC_SOCKET_MAXSCATTERGATHER)
#ifdef ISC_NET_RECVOVERFLOW
# define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER + 1)
#else
# define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER)
#endif

466 467 468 469
static isc_result_t socket_create(isc_socketmgr_t *manager0, int pf,
				  isc_sockettype_t type,
				  isc_socket_t **socketp,
				  isc_socket_t *dup_socket);
470 471
static void send_recvdone_event(isc__socket_t *, isc_socketevent_t **);
static void send_senddone_event(isc__socket_t *, isc_socketevent_t **);
472
static void send_connectdone_event(isc__socket_t *, isc_socket_connev_t **);
473 474 475 476
static void free_socket(isc__socket_t **);
static isc_result_t allocate_socket(isc__socketmgr_t *, isc_sockettype_t,
				    isc__socket_t **);
static void destroy(isc__socket_t **);
477 478 479 480
static void internal_accept(isc_task_t *, isc_event_t *);
static void internal_connect(isc_task_t *, isc_event_t *);
static void internal_recv(isc_task_t *, isc_event_t *);
static void internal_send(isc_task_t *, isc_event_t *);
481 482
static void internal_fdwatch_write(isc_task_t *, isc_event_t *);
static void internal_fdwatch_read(isc_task_t *, isc_event_t *);
483 484
static void process_cmsg(isc__socket_t *, struct msghdr *, isc_socketevent_t *);
static void build_msghdr_send(isc__socket_t *, isc_socketevent_t *,
485
			      struct msghdr *, struct iovec *, size_t *);
486
static void build_msghdr_recv(isc__socket_t *, isc_socketevent_t *,
487
			      struct msghdr *, struct iovec *, size_t *);
488 489 490
#ifdef USE_WATCHER_THREAD
static isc_boolean_t process_ctlfd(isc__socketmgr_t *manager);
#endif
Evan Hunt's avatar
Evan Hunt committed
491
static void setdscp(isc__socket_t *sock, isc_dscp_t dscp);
492 493

/*%
494 495 496
 * The following are intended for internal use (indicated by "isc__"
 * prefix) but are not declared as static, allowing direct access from
 * unit tests etc.
497 498
 */

499 500 501 502 503
isc_result_t
isc__socket_open(isc_socket_t *sock0);
isc_result_t
isc__socket_close(isc_socket_t *sock0);
isc_result_t
504 505
isc__socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
		   isc_socket_t **socketp);
506
void
507
isc__socket_attach(isc_socket_t *sock, isc_socket_t **socketp);
508
void
509
isc__socket_detach(isc_socket_t **socketp);
510
isc_result_t
511 512
isc__socket_recvv(isc_socket_t *sock, isc_bufferlist_t *buflist,
		 unsigned int minimum, isc_task_t *task,
Mark Andrews's avatar
Mark Andrews committed
513
		  isc_taskaction_t action, void *arg);
514
isc_result_t
515 516
isc__socket_recv(isc_socket_t *sock, isc_region_t *region,
		 unsigned int minimum, isc_task_t *task,
Mark Andrews's avatar
Mark Andrews committed
517
		 isc_taskaction_t action, void *arg);
518
isc_result_t
519 520 521
isc__socket_recv2(isc_socket_t *sock, isc_region_t *region,
		  unsigned int minimum, isc_task_t *task,
		  isc_socketevent_t *event, unsigned int flags);
522
isc_result_t
523
isc__socket_send(isc_socket_t *sock, isc_region_t *region,
Mark Andrews's avatar
Mark Andrews committed
524
		 isc_task_t *task, isc_taskaction_t action, void *arg);
525
isc_result_t
526
isc__socket_sendto(isc_socket_t *sock, isc_region_t *region,
Mark Andrews's avatar
Mark Andrews committed
527
		   isc_task_t *task, isc_taskaction_t action, void *arg,
528
		   const isc_sockaddr_t *address, struct in6_pktinfo *pktinfo);
529
isc_result_t
530
isc__socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
Mark Andrews's avatar
Mark Andrews committed
531
		  isc_task_t *task, isc_taskaction_t action, void *arg);
532
isc_result_t
533
isc__socket_sendtov(isc_socket_t *sock, isc_bufferlist_t *buflist,
Mark Andrews's avatar
Mark Andrews committed
534
		    isc_task_t *task, isc_taskaction_t action, void *arg,
535
		    const isc_sockaddr_t *address, struct in6_pktinfo *pktinfo);
536
isc_result_t
537
isc__socket_sendtov2(isc_socket_t *sock, isc_bufferlist_t *buflist,
Mark Andrews's avatar
Mark Andrews committed
538
		     isc_task_t *task, isc_taskaction_t action, void *arg,
539
		     const isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
540 541
		     unsigned int flags);
isc_result_t
542 543
isc__socket_sendto2(isc_socket_t *sock, isc_region_t *region,
		    isc_task_t *task,
544
		    const isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
545
		    isc_socketevent_t *event, unsigned int flags);
Evan Hunt's avatar
Evan Hunt committed
546 547 548
isc_socketevent_t *
isc_socket_socketevent(isc_mem_t *mctx, void *sender,
		       isc_eventtype_t eventtype, isc_taskaction_t action,
Mark Andrews's avatar
Mark Andrews committed
549
		       void *arg);
Evan Hunt's avatar
Evan Hunt committed
550

551
void
552
isc__socket_cleanunix(const isc_sockaddr_t *sockaddr, isc_boolean_t active);
553
isc_result_t
554
isc__socket_permunix(const isc_sockaddr_t *sockaddr, isc_uint32_t perm,
555
		     isc_uint32_t owner, isc_uint32_t group);
556
isc_result_t
557
isc__socket_bind(isc_socket_t *sock, const isc_sockaddr_t *sockaddr,
558
		 unsigned int options);
559
isc_result_t
560
isc__socket_filter(isc_socket_t *sock, const char *filter);
561
isc_result_t
562
isc__socket_listen(isc_socket_t *sock, unsigned int backlog);
563
isc_result_t
564
isc__socket_accept(isc_socket_t *sock,
Mark Andrews's avatar
Mark Andrews committed
565
		   isc_task_t *task, isc_taskaction_t action, void *arg);
566
isc_result_t
567
isc__socket_connect(isc_socket_t *sock, const isc_sockaddr_t *addr,
568
		    isc_task_t *task, isc_taskaction_t action,
Mark Andrews's avatar
Mark Andrews committed
569
		    void *arg);
570
isc_result_t
571
isc__socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp);
572
isc_result_t
573
isc__socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp);
574
void
575
isc__socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how);
576
isc_sockettype_t
577
isc__socket_gettype(isc_socket_t *sock);
578
isc_boolean_t
579
isc__socket_isbound(isc_socket_t *sock);
580
void
581
isc__socket_ipv6only(isc_socket_t *sock, isc_boolean_t yes);
582
void
Evan Hunt's avatar
Evan Hunt committed
583
isc__socket_dscp(isc_socket_t *sock, isc_dscp_t dscp);
584
isc_result_t
585 586 587
isc__socket_fdwatchcreate(isc_socketmgr_t *manager, int fd, int flags,
			  isc_sockfdwatch_t callback, void *cbarg,
			  isc_task_t *task, isc_socket_t **socketp);
588
isc_result_t
589
isc__socket_fdwatchpoke(isc_socket_t *sock, int flags);
590
isc_result_t
591
isc__socket_dup(isc_socket_t *sock, isc_socket_t **socketp);
592
int
Mark Andrews's avatar
Mark Andrews committed
593
isc__socket_getfd(isc_socket_t *sock);
594

595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621
isc_result_t
isc__socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp);
isc_result_t
isc__socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp,
		       unsigned int maxsocks);
isc_result_t
isc_socketmgr_getmaxsockets(isc_socketmgr_t *manager0, unsigned int *nsockp);
void
isc_socketmgr_setstats(isc_socketmgr_t *manager0, isc_stats_t *stats);
void
isc__socketmgr_destroy(isc_socketmgr_t **managerp);
void
isc__socket_setname(isc_socket_t *socket0, const char *name, void *tag);
const char *
isc__socket_getname(isc_socket_t *socket0);
void *
isc__socket_gettag(isc_socket_t *socket0);

#ifdef HAVE_LIBXML2
void
isc__socketmgr_renderxml(isc_socketmgr_t *mgr0, xmlTextWriterPtr writer);
#endif
#ifdef HAVE_JSON
isc_result_t
isc__socketmgr_renderjson(isc_socketmgr_t *mgr0, json_object *stats);
#endif

622 623 624 625 626 627 628
static struct {
	isc_socketmethods_t methods;

	/*%
	 * The following are defined just for avoiding unused static functions.
	 */
	void *recvv, *send, *sendv, *sendto2, *cleanunix, *permunix, *filter,
629
	     *listen, *accept, *getpeername, *isbound;
630 631 632 633 634 635
} socketmethods = {
	{
		isc__socket_attach,
		isc__socket_detach,
		isc__socket_bind,
		isc__socket_sendto,
636
		isc__socket_sendto2,
637 638
		isc__socket_connect,
		isc__socket_recv,
639
		isc__socket_recv2,
640 641 642
		isc__socket_cancel,
		isc__socket_getsockname,
		isc__socket_gettype,
643
		isc__socket_ipv6only,
644
		isc__socket_fdwatchpoke,
Mark Andrews's avatar
Mark Andrews committed
645
		isc__socket_dup,
Evan Hunt's avatar
Evan Hunt committed
646 647
		isc__socket_getfd,
		isc__socket_dscp
648
	},
649 650 651 652 653 654
	(void *)isc__socket_recvv, (void *)isc__socket_send,
	(void *)isc__socket_sendv, (void *)isc__socket_sendto2,
	(void *)isc__socket_cleanunix, (void *)isc__socket_permunix,
	(void *)isc__socket_filter, (void *)isc__socket_listen,
	(void *)isc__socket_accept, (void *)isc__socket_getpeername,
	(void *)isc__socket_isbound
655 656 657 658
};

static isc_socketmgrmethods_t socketmgrmethods = {
	isc__socketmgr_destroy,
659 660
	isc__socket_create,
	isc__socket_fdwatchcreate
661 662
};

Michael Graff's avatar
Michael Graff committed
663 664
#define SELECT_POKE_SHUTDOWN		(-1)
#define SELECT_POKE_NOTHING		(-2)
665
#define SELECT_POKE_READ		(-3)
666
#define SELECT_POKE_ACCEPT		(-3) /*%< Same as _READ */
667
#define SELECT_POKE_WRITE		(-4)
668
#define SELECT_POKE_CONNECT		(-4) /*%< Same as _WRITE */
669
#define SELECT_POKE_CLOSE		(-5)
670

671 672
#define SOCK_DEAD(s)			((s)->references == 0)

673 674 675 676 677 678 679 680 681 682 683 684 685
/*%
 * Shortcut index arrays to get access to statistics counters.
 */
enum {
	STATID_OPEN = 0,
	STATID_OPENFAIL = 1,
	STATID_CLOSE = 2,
	STATID_BINDFAIL = 3,
	STATID_CONNECTFAIL = 4,
	STATID_CONNECT = 5,
	STATID_ACCEPTFAIL = 6,
	STATID_ACCEPT = 7,
	STATID_SENDFAIL = 8,
686 687
	STATID_RECVFAIL = 9,
	STATID_ACTIVE = 10
688
};
Mark Andrews's avatar
Mark Andrews committed
689
static const isc_statscounter_t udp4statsindex[] = {
690 691 692 693 694 695 696 697 698
	isc_sockstatscounter_udp4open,
	isc_sockstatscounter_udp4openfail,
	isc_sockstatscounter_udp4close,
	isc_sockstatscounter_udp4bindfail,
	isc_sockstatscounter_udp4connectfail,
	isc_sockstatscounter_udp4connect,
	-1,
	-1,
	isc_sockstatscounter_udp4sendfail,
699 700
	isc_sockstatscounter_udp4recvfail,
	isc_sockstatscounter_udp4active
701
};
Mark Andrews's avatar
Mark Andrews committed
702
static const isc_statscounter_t udp6statsindex[] = {
703 704 705 706 707 708 709 710 711
	isc_sockstatscounter_udp6open,
	isc_sockstatscounter_udp6openfail,
	isc_sockstatscounter_udp6close,
	isc_sockstatscounter_udp6bindfail,
	isc_sockstatscounter_udp6connectfail,
	isc_sockstatscounter_udp6connect,
	-1,
	-1,
	isc_sockstatscounter_udp6sendfail,
712 713
	isc_sockstatscounter_udp6recvfail,
	isc_sockstatscounter_udp6active
714 715 716 717 718 719 720 721 722 723 724
};
static const isc_statscounter_t tcp4statsindex[] = {
	isc_sockstatscounter_tcp4open,
	isc_sockstatscounter_tcp4openfail,
	isc_sockstatscounter_tcp4close,
	isc_sockstatscounter_tcp4bindfail,
	isc_sockstatscounter_tcp4connectfail,
	isc_sockstatscounter_tcp4connect,
	isc_sockstatscounter_tcp4acceptfail,
	isc_sockstatscounter_tcp4accept,
	isc_sockstatscounter_tcp4sendfail,
725 726
	isc_sockstatscounter_tcp4recvfail,
	isc_sockstatscounter_tcp4active
727 728 729 730 731 732 733 734 735 736 737
};
static const isc_statscounter_t tcp6statsindex[] = {
	isc_sockstatscounter_tcp6open,
	isc_sockstatscounter_tcp6openfail,
	isc_sockstatscounter_tcp6close,
	isc_sockstatscounter_tcp6bindfail,
	isc_sockstatscounter_tcp6connectfail,
	isc_sockstatscounter_tcp6connect,
	isc_sockstatscounter_tcp6acceptfail,
	isc_sockstatscounter_tcp6accept,
	isc_sockstatscounter_tcp6sendfail,
738 739
	isc_sockstatscounter_tcp6recvfail,
	isc_sockstatscounter_tcp6active
740 741 742 743 744 745 746 747 748 749 750
};
static const isc_statscounter_t unixstatsindex[] = {
	isc_sockstatscounter_unixopen,
	isc_sockstatscounter_unixopenfail,
	isc_sockstatscounter_unixclose,
	isc_sockstatscounter_unixbindfail,
	isc_sockstatscounter_unixconnectfail,
	isc_sockstatscounter_unixconnect,
	isc_sockstatscounter_unixacceptfail,
	isc_sockstatscounter_unixaccept,
	isc_sockstatscounter_unixsendfail,
751 752
	isc_sockstatscounter_unixrecvfail,
	isc_sockstatscounter_unixactive
753 754 755 756 757 758 759 760 761 762 763
};
static const isc_statscounter_t fdwatchstatsindex[] = {
	-1,
	-1,
	isc_sockstatscounter_fdwatchclose,
	isc_sockstatscounter_fdwatchbindfail,
	isc_sockstatscounter_fdwatchconnectfail,
	isc_sockstatscounter_fdwatchconnect,
	-1,
	-1,
	isc_sockstatscounter_fdwatchsendfail,
764 765
	isc_sockstatscounter_fdwatchrecvfail,
	-1
766
};
767 768 769 770 771 772 773 774 775 776 777 778 779
static const isc_statscounter_t rawstatsindex[] = {
	isc_sockstatscounter_rawopen,
	isc_sockstatscounter_rawopenfail,
	isc_sockstatscounter_rawclose,
	-1,
	-1,
	-1,
	-1,
	-1,
	-1,
	isc_sockstatscounter_rawrecvfail,
	isc_sockstatscounter_rawactive
};
780

781 782
#if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) || \
    defined(USE_WATCHER_THREAD)
783
static void
784
manager_log(isc__socketmgr_t *sockmgr,
785 786
	    isc_logcategory_t *category, isc_logmodule_t *module, int level,
	    const char *fmt, ...) ISC_FORMAT_PRINTF(5, 6);
Michael Graff's avatar
Michael Graff committed
787
static void
788
manager_log(isc__socketmgr_t *sockmgr,
Michael Graff's avatar
Michael Graff committed
789 790 791 792 793 794
	    isc_logcategory_t *category, isc_logmodule_t *module, int level,
	    const char *fmt, ...)
{
	char msgbuf[2048];
	va_list ap;

795 796 797
	if (! isc_log_wouldlog(isc_lctx, level))
		return;

Michael Graff's avatar
Michael Graff committed
798 799 800 801 802 803 804
	va_start(ap, fmt);
	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
	va_end(ap);

	isc_log_write(isc_lctx, category, module, level,
		      "sockmgr %p: %s", sockmgr, msgbuf);
}
805
#endif
Michael Graff's avatar
Michael Graff committed
806

807
static void
808
socket_log(isc__socket_t *sock, const isc_sockaddr_t *address,
809 810 811
	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
	   isc_msgcat_t *msgcat, int msgset, int message,
	   const char *fmt, ...) ISC_FORMAT_PRINTF(9, 10);
Michael Graff's avatar
Michael Graff committed
812
static void
813
socket_log(isc__socket_t *sock, const isc_sockaddr_t *address,
Michael Graff's avatar
Michael Graff committed
814
	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
815
	   isc_msgcat_t *msgcat, int msgset, int message,
Michael Graff's avatar
Michael Graff committed
816 817 818
	   const char *fmt, ...)
{
	char msgbuf[2048];
819
	char peerbuf[ISC_SOCKADDR_FORMATSIZE];
Michael Graff's avatar
Michael Graff committed
820 821
	va_list ap;

822 823 824
	if (! isc_log_wouldlog(isc_lctx, level))
		return;

Michael Graff's avatar
Michael Graff committed
825 826 827 828 829
	va_start(ap, fmt);
	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
	va_end(ap);

	if (address == NULL) {
830 831 832
		isc_log_iwrite(isc_lctx, category, module, level,
			       msgcat, msgset, message,
			       "socket %p: %s", sock, msgbuf);
Michael Graff's avatar
Michael Graff committed
833
	} else {
Andreas Gustafsson's avatar
Andreas Gustafsson committed
834
		isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
835 836 837
		isc_log_iwrite(isc_lctx, category, module, level,
			       msgcat, msgset, message,
			       "socket %p %s: %s", sock, peerbuf, msgbuf);
Michael Graff's avatar
Michael Graff committed
838 839 840
	}
}

841 842 843 844 845 846 847
#if defined(_AIX) && defined(ISC_NET_BSD44MSGHDR) && \
    defined(USE_CMSG) && defined(IPV6_RECVPKTINFO)
/*
 * AIX has a kernel bug where IPV6_RECVPKTINFO gets cleared by
 * setting IPV6_V6ONLY.  Turn the option back on for IPv6 UDP sockets;
 * any other socket is left alone.  A setsockopt() failure is reported
 * as an unexpected error but is otherwise not propagated.
 */
static void
FIX_IPV6_RECVPKTINFO(isc__socket_t *sock)
{
	char errtext[ISC_STRERRORSIZE];
	int enable = 1;

	if (sock->pf == AF_INET6 && sock->type == isc_sockettype_udp &&
	    setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
		       (void *)&enable, sizeof(enable)) < 0)
	{
		isc__strerror(errno, errtext, sizeof(errtext));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "setsockopt(%d, IPV6_RECVPKTINFO) "
				 "%s: %s", sock->fd,
				 isc_msgcat_get(isc_msgcat,
						ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED,
						"failed"),
				 errtext);
	}
}
#else
#define FIX_IPV6_RECVPKTINFO(sock) (void)0
#endif

874 875 876 877 878 879 880 881 882 883 884
/*%
 * Bump the socket statistics counter 'counterid' by one.
 *
 * 'counterid' must not be -1 (the "no such counter" placeholder used in
 * the statistics index tables).  A NULL 'stats' makes this a no-op.
 */
static inline void
inc_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
	REQUIRE(counterid != -1);

	if (stats == NULL)
		return;

	isc_stats_increment(stats, counterid);
}

885 886 887 888 889 890 891 892 893 894 895
/*%
 * Lower the socket statistics counter 'counterid' by one.
 *
 * 'counterid' must not be -1 (the "no such counter" placeholder used in
 * the statistics index tables).  A NULL 'stats' makes this a no-op.
 */
static inline void
dec_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
	REQUIRE(counterid != -1);

	if (stats == NULL)
		return;

	isc_stats_decrement(stats, counterid);
}

896
static inline isc_result_t
897
watch_fd(isc__socketmgr_t *manager, int fd, int msg) {
898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915
	isc_result_t result = ISC_R_SUCCESS;

#ifdef USE_KQUEUE
	struct kevent evchange;

	memset(&evchange, 0, sizeof(evchange));
	if (msg == SELECT_POKE_READ)
		evchange.filter = EVFILT_READ;
	else
		evchange.filter = EVFILT_WRITE;
	evchange.flags = EV_ADD;
	evchange.ident = fd;
	if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
		result = isc__errno2result(errno);

	return (result);
#elif defined(USE_EPOLL)
	struct epoll_event event;
916 917 918
	uint32_t oldevents;
	int ret;
	int op;
919

920
	oldevents = manager->epoll_events[fd];
921
	if (msg == SELECT_POKE_READ)
922
		manager->epoll_events[fd] |= EPOLLIN;
923
	else
924 925 926
		manager->epoll_events[fd] |= EPOLLOUT;

	event.events = manager->epoll_events[fd];
927
	memset(&event.data, 0, sizeof(event.data));
928
	event.data.fd = fd;
929 930 931 932 933 934 935 936

	op = (oldevents == 0U) ? EPOLL_CTL_ADD : EPOLL_CTL_MOD;
	ret = epoll_ctl(manager->epoll_fd, op, fd, &event);
	if (ret == -1) {
		if (errno == EEXIST)
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "epoll_ctl(ADD/MOD) returned "
					 "EEXIST for fd %d", fd);
937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966
		result = isc__errno2result(errno);
	}

	return (result);
#elif defined(USE_DEVPOLL)
	struct pollfd pfd;
	int lockid = FDLOCK_ID(fd);

	memset(&pfd, 0, sizeof(pfd));
	if (msg == SELECT_POKE_READ)
		pfd.events = POLLIN;
	else
		pfd.events = POLLOUT;
	pfd.fd = fd;
	pfd.revents = 0;
	LOCK(&manager->fdlock[lockid]);
	if (write(manager->devpoll_fd, &pfd, sizeof(pfd)) == -1)
		result = isc__errno2result(errno);
	else {
		if (msg == SELECT_POKE_READ)
			manager->fdpollinfo[fd].want_read = 1;
		else
			manager->fdpollinfo[fd].want_write = 1;
	}
	UNLOCK(&manager->fdlock[lockid]);

	return (result);
#elif defined(USE_SELECT)
	LOCK(&manager->lock);
	if (msg == SELECT_POKE_READ)
967
		FD_SET(fd, manager->read_fds);
968
	if (msg == SELECT_POKE_WRITE)
969
		FD_SET(fd, manager->write_fds);
970 971 972 973 974 975 976
	UNLOCK(&manager->lock);

	return (result);
#endif
}

static inline isc_result_t
977
unwatch_fd(isc__socketmgr_t *manager, int fd, int msg) {
Tatuya JINMEI 神明達哉's avatar