/*
 * Copyright (C) 1998-2002  Internet Software Consortium.
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND INTERNET SOFTWARE CONSORTIUM
 * DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL
 * INTERNET SOFTWARE CONSORTIUM BE LIABLE FOR ANY SPECIAL, DIRECT,
 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING
 * FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
 * NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
 * WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
Bob Halley's avatar
Bob Halley committed
17

18
/* $Id: socket.c,v 1.234 2004/01/29 04:39:19 marka Exp $ */
David Lawrence's avatar
David Lawrence committed
19

Bob Halley's avatar
Bob Halley committed
20
#include <config.h>
21

22
#include <sys/param.h>
Michael Graff's avatar
Michael Graff committed
23
#include <sys/types.h>
Michael Graff's avatar
Michael Graff committed
24 25
#include <sys/socket.h>
#include <sys/time.h>
Michael Graff's avatar
Michael Graff committed
26 27
#include <sys/uio.h>

28
#include <errno.h>
Andreas Gustafsson's avatar
Andreas Gustafsson committed
29
#include <fcntl.h>
30 31 32 33 34
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

35
#include <isc/buffer.h>
36
#include <isc/bufferlist.h>
37
#include <isc/condition.h>
38
#include <isc/formatcheck.h>
39
#include <isc/list.h>
Michael Graff's avatar
Michael Graff committed
40
#include <isc/log.h>
41
#include <isc/mem.h>
42
#include <isc/msgs.h>
43
#include <isc/mutex.h>
44
#include <isc/net.h>
45
#include <isc/platform.h>
Michael Graff's avatar
Michael Graff committed
46
#include <isc/print.h>
47
#include <isc/region.h>
48
#include <isc/socket.h>
49
#include <isc/strerror.h>
50
#include <isc/task.h>
51
#include <isc/thread.h>
Michael Graff's avatar
Michael Graff committed
52
#include <isc/util.h>
Bob Halley's avatar
Bob Halley committed
53

54 55
#include "errno2result.h"

56
#ifndef ISC_PLATFORM_USETHREADS
57
#include "socket_p.h"
58
#endif /* ISC_PLATFORM_USETHREADS */
59

60 61
/*
 * Some systems define the socket length argument as an int, some as size_t,
62
 * some as socklen_t.  This is here so it can be easily changed if needed.
63
 */
64
#ifndef ISC_SOCKADDR_LEN_T
65
#define ISC_SOCKADDR_LEN_T unsigned int
66
#endif
67

68 69 70
/*
 * Define what the possible "soft" errors can be.  These are non-fatal returns
 * of various network related functions, like recv() and so on.
71 72 73 74
 *
 * For some reason, BSDI (and perhaps others) will sometimes return <0
 * from recv() but will have errno==0.  This is broken, but we have to
 * work around it here.
75
 */
76 77 78 79
#define SOFT_ERROR(e)	((e) == EAGAIN || \
			 (e) == EWOULDBLOCK || \
			 (e) == EINTR || \
			 (e) == 0)
80

Michael Graff's avatar
Michael Graff committed
81
#define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)
82

Michael Graff's avatar
Michael Graff committed
83 84 85 86 87 88 89
/*
 * DLVL(90)  --  Function entry/exit and other tracing.
 * DLVL(70)  --  Socket "correctness" -- including returning of events, etc.
 * DLVL(60)  --  Socket data send/receive
 * DLVL(50)  --  Event tracing, including receiving/sending completion events.
 * DLVL(20)  --  Socket creation/destruction.
 */
90 91 92 93 94 95 96 97 98 99 100
#define TRACE_LEVEL		90
#define CORRECTNESS_LEVEL	70
#define IOEVENT_LEVEL		60
#define EVENT_LEVEL		50
#define CREATION_LEVEL		20

#define TRACE		DLVL(TRACE_LEVEL)
#define CORRECTNESS	DLVL(CORRECTNESS_LEVEL)
#define IOEVENT		DLVL(IOEVENT_LEVEL)
#define EVENT		DLVL(EVENT_LEVEL)
#define CREATION	DLVL(CREATION_LEVEL)
101

102
typedef isc_event_t intev_t;
Michael Graff's avatar
Michael Graff committed
103

104 105
#define SOCKET_MAGIC		ISC_MAGIC('I', 'O', 'i', 'o')
#define VALID_SOCKET(t)		ISC_MAGIC_VALID(t, SOCKET_MAGIC)
Michael Graff's avatar
Michael Graff committed
106

Michael Graff's avatar
Michael Graff committed
107 108 109 110 111 112 113 114 115 116 117 118
/*
 * IPv6 control information.  If the socket is an IPv6 socket we want
 * to collect the destination address and interface so the client can
 * set them on outgoing packets.
 */
#ifdef ISC_PLATFORM_HAVEIPV6
#ifndef USE_CMSG
#define USE_CMSG	1
#endif
#endif

/*
119
 * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
Michael Graff's avatar
Michael Graff committed
120 121 122 123 124 125 126 127 128
 * a setsockopt() like interface to request timestamps, and if the OS
 * doesn't do it for us, call gettimeofday() on every UDP receive?
 */
#ifdef SO_TIMESTAMP
#ifndef USE_CMSG
#define USE_CMSG	1
#endif
#endif

129 130 131 132 133
/*
 * The number of times a send operation is repeated if the result is EINTR.
 */
#define NRETRIES 10

134 135
struct isc_socket {
	/* Not locked. */
136 137 138 139
	unsigned int		magic;
	isc_socketmgr_t	       *manager;
	isc_mutex_t		lock;
	isc_sockettype_t	type;
Michael Graff's avatar
Michael Graff committed
140

141
	/* Locked by socket lock. */
142
	ISC_LINK(isc_socket_t)	link;
143 144
	unsigned int		references;
	int			fd;
145
	int			pf;
146

147
	ISC_LIST(isc_socketevent_t)		send_list;
148
	ISC_LIST(isc_socketevent_t)		recv_list;
149
	ISC_LIST(isc_socket_newconnev_t)	accept_list;
150 151 152 153 154 155 156
	isc_socket_connev_t		       *connect_ev;

	/*
	 * Internal events.  Posted when a descriptor is readable or
	 * writable.  These are statically allocated and never freed.
	 * They will be set to non-purgable before use.
	 */
157 158
	intev_t			readable_ev;
	intev_t			writable_ev;
159

160
	isc_sockaddr_t		address;  /* remote address */
161

162 163 164 165 166 167 168
	unsigned int		pending_recv : 1,
				pending_send : 1,
				pending_accept : 1,
				listener : 1, /* listener socket */
				connected : 1,
				connecting : 1, /* connect pending */
				bound : 1; /* bound to local addr */
169

170
#ifdef ISC_NET_RECVOVERFLOW
171
	unsigned char		overflow; /* used for MSG_TRUNC fake */
172
#endif
173 174 175 176 177

	char			*recvcmsgbuf;
	ISC_SOCKADDR_LEN_T	recvcmsgbuflen;
	char			*sendcmsgbuf;
	ISC_SOCKADDR_LEN_T	sendcmsgbuflen;
178 179
};

180 181 182
#define SOCKET_MANAGER_MAGIC	ISC_MAGIC('I', 'O', 'm', 'g')
#define VALID_MANAGER(m)	ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)

183 184
struct isc_socketmgr {
	/* Not locked. */
185 186 187
	unsigned int		magic;
	isc_mem_t	       *mctx;
	isc_mutex_t		lock;
188
	/* Locked by manager lock. */
189
	ISC_LIST(isc_socket_t)	socklist;
190 191 192 193 194
	fd_set			read_fds;
	fd_set			write_fds;
	isc_socket_t	       *fds[FD_SETSIZE];
	int			fdstate[FD_SETSIZE];
	int			maxfd;
195 196 197
#ifdef ISC_PLATFORM_USETHREADS
	isc_thread_t		watcher;
	isc_condition_t		shutdown_ok;
198
	int			pipe_fds[2];
199
#else /* ISC_PLATFORM_USETHREADS */
200
	unsigned int		refs;
201
#endif /* ISC_PLATFORM_USETHREADS */
202 203
};

204 205
#ifndef ISC_PLATFORM_USETHREADS
static isc_socketmgr_t *socketmgr = NULL;
206
#endif /* ISC_PLATFORM_USETHREADS */
207

Michael Graff's avatar
Michael Graff committed
208 209 210 211
#define CLOSED		0	/* this one must be zero */
#define MANAGED		1
#define CLOSE_PENDING	2

212 213 214 215 216 217 218 219 220 221
/*
 * send() and recv() iovec counts
 */
#define MAXSCATTERGATHER_SEND	(ISC_SOCKET_MAXSCATTERGATHER)
#ifdef ISC_NET_RECVOVERFLOW
# define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER + 1)
#else
# define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER)
#endif

222 223
static void send_recvdone_event(isc_socket_t *, isc_socketevent_t **);
static void send_senddone_event(isc_socket_t *, isc_socketevent_t **);
Bob Halley's avatar
Bob Halley committed
224 225 226 227
static void free_socket(isc_socket_t **);
static isc_result_t allocate_socket(isc_socketmgr_t *, isc_sockettype_t,
				    isc_socket_t **);
static void destroy(isc_socket_t **);
228 229 230 231
static void internal_accept(isc_task_t *, isc_event_t *);
static void internal_connect(isc_task_t *, isc_event_t *);
static void internal_recv(isc_task_t *, isc_event_t *);
static void internal_send(isc_task_t *, isc_event_t *);
232
static void process_cmsg(isc_socket_t *, struct msghdr *, isc_socketevent_t *);
Michael Graff's avatar
Michael Graff committed
233
static void build_msghdr_send(isc_socket_t *, isc_socketevent_t *,
234
			      struct msghdr *, struct iovec *, size_t *);
Michael Graff's avatar
Michael Graff committed
235
static void build_msghdr_recv(isc_socket_t *, isc_socketevent_t *,
236
			      struct msghdr *, struct iovec *, size_t *);
Michael Graff's avatar
Michael Graff committed
237 238 239

#define SELECT_POKE_SHUTDOWN		(-1)
#define SELECT_POKE_NOTHING		(-2)
240 241 242 243 244
#define SELECT_POKE_READ		(-3)
#define SELECT_POKE_ACCEPT		(-3) /* Same as _READ */
#define SELECT_POKE_WRITE		(-4)
#define SELECT_POKE_CONNECT		(-4) /* Same as _WRITE */
#define SELECT_POKE_CLOSE		(-5)
245

246 247
#define SOCK_DEAD(s)			((s)->references == 0)

248 249 250 251
static void
manager_log(isc_socketmgr_t *sockmgr,
	    isc_logcategory_t *category, isc_logmodule_t *module, int level,
	    const char *fmt, ...) ISC_FORMAT_PRINTF(5, 6);

/*
 * Log a printf-style message about a socket manager.  The message is
 * prefixed with "sockmgr <pointer>: ".  Nothing is formatted when the
 * log context would not log at 'level'.
 */
static void
manager_log(isc_socketmgr_t *sockmgr,
	    isc_logcategory_t *category, isc_logmodule_t *module, int level,
	    const char *fmt, ...)
{
	char msgbuf[2048];
	va_list ap;

	/* Skip the formatting work if the message would be discarded. */
	if (! isc_log_wouldlog(isc_lctx, level))
		return;

	va_start(ap, fmt);
	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
	va_end(ap);

	isc_log_write(isc_lctx, category, module, level,
		      "sockmgr %p: %s", sockmgr, msgbuf);
}

271 272 273 274 275
static void
socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
	   isc_msgcat_t *msgcat, int msgset, int message,
	   const char *fmt, ...) ISC_FORMAT_PRINTF(9, 10);

/*
 * Log a printf-style message about a socket through the message
 * catalog aware isc_log_iwrite().  The message is prefixed with
 * "socket <pointer>"; when 'address' is non-NULL its formatted text
 * is included as well.  Nothing is formatted when the log context
 * would not log at 'level'.
 */
static void
socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
	   isc_msgcat_t *msgcat, int msgset, int message,
	   const char *fmt, ...)
{
	char msgbuf[2048];
	char peerbuf[256];
	va_list ap;

	/* Skip the formatting work if the message would be discarded. */
	if (! isc_log_wouldlog(isc_lctx, level))
		return;

	va_start(ap, fmt);
	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
	va_end(ap);

	if (address == NULL) {
		isc_log_iwrite(isc_lctx, category, module, level,
			       msgcat, msgset, message,
			       "socket %p: %s", sock, msgbuf);
	} else {
		isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
		isc_log_iwrite(isc_lctx, category, module, level,
			       msgcat, msgset, message,
			       "socket %p %s: %s", sock, peerbuf, msgbuf);
	}
}

305
static void
306
wakeup_socket(isc_socketmgr_t *manager, int fd, int msg) {
307 308 309
	isc_socket_t *sock;

	/*
310 311 312
	 * This is a wakeup on a socket.  If the socket is not in the
	 * process of being closed, start watching it for either reads
	 * or writes.
313
	 */
Andreas Gustafsson's avatar
 
Andreas Gustafsson committed
314

Mark Andrews's avatar
Mark Andrews committed
315
	INSIST(fd >= 0 && fd < (int)FD_SETSIZE);
316 317 318 319 320

	if (manager->fdstate[fd] == CLOSE_PENDING) {
		manager->fdstate[fd] = CLOSED;
		FD_CLR(fd, &manager->read_fds);
		FD_CLR(fd, &manager->write_fds);
321
		(void)close(fd);
322 323 324 325 326 327 328 329
		return;
	}
	if (manager->fdstate[fd] != MANAGED)
		return;

	sock = manager->fds[fd];

	/*
Mark Andrews's avatar
Mark Andrews committed
330
	 * Set requested bit.
331
	 */
332
	if (msg == SELECT_POKE_READ)
333
		FD_SET(sock->fd, &manager->read_fds);
334
	if (msg == SELECT_POKE_WRITE)
335 336 337 338
		FD_SET(sock->fd, &manager->write_fds);
}

#ifdef ISC_PLATFORM_USETHREADS
339
/*
Michael Graff's avatar
Michael Graff committed
340
 * Poke the select loop when there is something for us to do.
341 342
 * The write is required (by POSIX) to complete.  That is, we
 * will not get partial writes.
343 344
 */
static void
345
select_poke(isc_socketmgr_t *mgr, int fd, int msg) {
Michael Graff's avatar
Michael Graff committed
346
	int cc;
347
	int buf[2];
348
	char strbuf[ISC_STRERRORSIZE];
349 350 351

	buf[0] = fd;
	buf[1] = msg;
Michael Graff's avatar
Michael Graff committed
352

353
	do {
354
		cc = write(mgr->pipe_fds[1], buf, sizeof(buf));
355 356 357 358 359 360 361 362 363 364
#ifdef ENOSR
		/*
		 * Treat ENOSR as EAGAIN but loop slowly as it is
		 * unlikely to clear fast.
		 */
		if (cc < 0 && errno == ENOSR) {
			sleep(1);
			errno = EAGAIN;
		}
#endif
365
	} while (cc < 0 && SOFT_ERROR(errno));
366
			        
367 368
	if (cc < 0) {
		isc__strerror(errno, strbuf, sizeof(strbuf));
Michael Graff's avatar
Michael Graff committed
369
		FATAL_ERROR(__FILE__, __LINE__,
370 371 372 373
			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
					   ISC_MSG_WRITEFAILED,
					   "write() failed "
					   "during watcher poke: %s"),
374 375
			    strbuf);
	}
376

377
	INSIST(cc == sizeof(buf));
378 379 380
}

/*
Andreas Gustafsson's avatar
Andreas Gustafsson committed
381
 * Read a message on the internal fd.
382
 */
383 384 385
static void
select_readmsg(isc_socketmgr_t *mgr, int *fd, int *msg) {
	int buf[2];
Michael Graff's avatar
Michael Graff committed
386
	int cc;
387
	char strbuf[ISC_STRERRORSIZE];
Michael Graff's avatar
Michael Graff committed
388

389
	cc = read(mgr->pipe_fds[0], buf, sizeof(buf));
Michael Graff's avatar
Michael Graff committed
390
	if (cc < 0) {
391
		*msg = SELECT_POKE_NOTHING;
Michael Graff's avatar
Michael Graff committed
392
		if (SOFT_ERROR(errno))
393
			return;
Michael Graff's avatar
Michael Graff committed
394

395
		isc__strerror(errno, strbuf, sizeof(strbuf));
Michael Graff's avatar
Michael Graff committed
396
		FATAL_ERROR(__FILE__, __LINE__,
397 398 399 400
			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
					   ISC_MSG_READFAILED,
					   "read() failed "
					   "during watcher poke: %s"),
401
			    strbuf);
402
		
403
		return;
Michael Graff's avatar
Michael Graff committed
404
	}
405
	INSIST(cc == sizeof(buf));
406

407 408
	*fd = buf[0];
	*msg = buf[1];
409
}
410
#else /* ISC_PLATFORM_USETHREADS */
411 412 413 414
/*
 * Update the state of the socketmgr when something changes.
 */
static void
415
select_poke(isc_socketmgr_t *manager, int fd, int msg) {
416 417
	if (msg == SELECT_POKE_SHUTDOWN)
		return;
418 419
	else if (fd >= 0)
		wakeup_socket(manager, fd, msg);
420 421
	return;
}
422
#endif /* ISC_PLATFORM_USETHREADS */
423 424

/*
Andreas Gustafsson's avatar
Andreas Gustafsson committed
425
 * Make a fd non-blocking.
426
 */
Michael Graff's avatar
Michael Graff committed
427
static isc_result_t
428
make_nonblock(int fd) {
Michael Graff's avatar
Michael Graff committed
429 430
	int ret;
	int flags;
431
	char strbuf[ISC_STRERRORSIZE];
432

Michael Graff's avatar
Michael Graff committed
433 434 435
	flags = fcntl(fd, F_GETFL, 0);
	flags |= O_NONBLOCK;
	ret = fcntl(fd, F_SETFL, flags);
436

Michael Graff's avatar
Michael Graff committed
437
	if (ret == -1) {
438
		isc__strerror(errno, strbuf, sizeof(strbuf));
Michael Graff's avatar
Michael Graff committed
439 440
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "fcntl(%d, F_SETFL, %d): %s",
441
				 fd, flags, strbuf);
Michael Graff's avatar
Michael Graff committed
442

Michael Graff's avatar
Michael Graff committed
443
		return (ISC_R_UNEXPECTED);
Michael Graff's avatar
Michael Graff committed
444 445
	}

Michael Graff's avatar
Michael Graff committed
446
	return (ISC_R_SUCCESS);
447 448
}

449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497
#ifdef USE_CMSG
/*
 * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE.
 * In order to ensure as much portability as possible, we provide wrapper
 * functions of these macros.
 * Note that cmsg_space() could run slow on OSes that do not have
 * CMSG_SPACE.
 */
/*
 * Portable CMSG_LEN(): length of a control message carrying 'len'
 * bytes of data, including its header.
 */
static inline ISC_SOCKADDR_LEN_T
cmsg_len(ISC_SOCKADDR_LEN_T len) {
#ifndef CMSG_LEN
	/*
	 * No CMSG_LEN on this platform: derive the header size from the
	 * data offset CMSG_DATA() yields for a header at address zero.
	 */
	ISC_SOCKADDR_LEN_T header_size = (ISC_SOCKADDR_LEN_T)CMSG_DATA(NULL); /* XXX */

	return (header_size + len);
#else
	return (CMSG_LEN(len));
#endif
}

/*
 * Portable CMSG_SPACE(): space occupied by a control message carrying
 * 'len' bytes of data, including header and trailing padding.
 *
 * Without CMSG_SPACE the value is measured by laying out one message in
 * a scratch buffer and asking CMSG_NXTHDR() where the next one would
 * start.  Returns 0 if CMSG_NXTHDR() reports that no further header fits.
 */
static inline ISC_SOCKADDR_LEN_T
cmsg_space(ISC_SOCKADDR_LEN_T len) {
#ifdef CMSG_SPACE
	return (CMSG_SPACE(len));
#else
	struct msghdr msg;
	struct cmsghdr *cmsgp;
	/*
	 * XXX: The buffer length is an ad-hoc value, but should be enough
	 * in a practical sense.
	 */
	char dummybuf[sizeof(struct cmsghdr) + 1024];

	memset(&msg, 0, sizeof(msg));
	msg.msg_control = dummybuf;
	msg.msg_controllen = sizeof(dummybuf);

	cmsgp = (struct cmsghdr *)dummybuf;
	cmsgp->cmsg_len = cmsg_len(len);

	cmsgp = CMSG_NXTHDR(&msg, cmsgp);
	if (cmsgp != NULL) {
		/*
		 * Subtract as char pointers: arithmetic on void pointers
		 * is a compiler extension, not ISO C.
		 */
		return ((ISC_SOCKADDR_LEN_T)((char *)cmsgp -
					     (char *)msg.msg_control));
	} else
		return (0);
#endif
}
#endif /* USE_CMSG */

498 499 500 501
/*
 * Process control messages received on a socket.
 */
static void
502
process_cmsg(isc_socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
Michael Graff's avatar
Michael Graff committed
503
#ifdef USE_CMSG
504
	struct cmsghdr *cmsgp;
Michael Graff's avatar
Michael Graff committed
505 506 507 508 509 510 511 512
#ifdef ISC_PLATFORM_HAVEIPV6
	struct in6_pktinfo *pktinfop;
#endif
#ifdef SO_TIMESTAMP
	struct timeval *timevalp;
#endif
#endif

513 514 515 516 517 518
	/*
	 * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined.
	 * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined.
	 * They are all here, outside of the CPP tests, because it is
	 * more consistent with the usual ISC coding style.
	 */
519
	UNUSED(sock);
520 521 522
	UNUSED(msg);
	UNUSED(dev);

523
#ifdef ISC_NET_BSD44MSGHDR
524

Bob Halley's avatar
Bob Halley committed
525
#ifdef MSG_TRUNC
526 527
	if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC)
		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
Bob Halley's avatar
Bob Halley committed
528
#endif
529

Bob Halley's avatar
Bob Halley committed
530
#ifdef MSG_CTRUNC
531 532
	if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC)
		dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
Bob Halley's avatar
Bob Halley committed
533
#endif
534

Michael Graff's avatar
Michael Graff committed
535 536 537
#ifndef USE_CMSG
	return;
#else
538 539
	if (msg->msg_controllen == 0 || msg->msg_control == NULL)
		return;
Michael Graff's avatar
Michael Graff committed
540 541 542 543 544 545 546 547 548 549

#ifdef SO_TIMESTAMP
	timevalp = NULL;
#endif
#ifdef ISC_PLATFORM_HAVEIPV6
	pktinfop = NULL;
#endif

	cmsgp = CMSG_FIRSTHDR(msg);
	while (cmsgp != NULL) {
550
		socket_log(sock, NULL, TRACE,
551 552
			   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PROCESSCMSG,
			   "processing cmsg %p", cmsgp);
Michael Graff's avatar
Michael Graff committed
553 554

#ifdef ISC_PLATFORM_HAVEIPV6
555 556
		if (cmsgp->cmsg_level == IPPROTO_IPV6
		    && cmsgp->cmsg_type == IPV6_PKTINFO) {
557

Michael Graff's avatar
Michael Graff committed
558
			pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
559 560
			memcpy(&dev->pktinfo, pktinfop,
			       sizeof(struct in6_pktinfo));
Michael Graff's avatar
Michael Graff committed
561
			dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
562
			socket_log(sock, NULL, TRACE,
563 564 565
				   isc_msgcat, ISC_MSGSET_SOCKET,
				   ISC_MSG_IFRECEIVED,
				   "interface received on ifindex %u",
David Lawrence's avatar
David Lawrence committed
566
				   dev->pktinfo.ipi6_ifindex);
567 568
			if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr))
				dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;				
Michael Graff's avatar
Michael Graff committed
569 570 571 572 573
			goto next;
		}
#endif

#ifdef SO_TIMESTAMP
574 575
		if (cmsgp->cmsg_level == SOL_SOCKET
		    && cmsgp->cmsg_type == SCM_TIMESTAMP) {
Michael Graff's avatar
Michael Graff committed
576 577 578 579 580 581 582 583 584 585 586 587 588
			timevalp = (struct timeval *)CMSG_DATA(cmsgp);
			dev->timestamp.seconds = timevalp->tv_sec;
			dev->timestamp.nanoseconds = timevalp->tv_usec * 1000;
			dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
			goto next;
		}
#endif

	next:
		cmsgp = CMSG_NXTHDR(msg, cmsgp);
	}
#endif /* USE_CMSG */

589
#endif /* ISC_NET_BSD44MSGHDR */
590 591
}

592
/*
593 594 595 596
 * Construct an iov array and attach it to the msghdr passed in.  This is
 * the SEND constructor, which will use the used region of the buffer
 * (if using a buffer list) or will use the internal region (if a single
 * buffer I/O is requested).
597 598 599
 *
 * Nothing can be NULL, and the done event must list at least one buffer
 * on the buffer linked list for this function to be meaningful.
Michael Graff's avatar
fix  
Michael Graff committed
600 601 602
 *
 * If write_countp != NULL, *write_countp will hold the number of bytes
 * this transaction can send.
603
 */
Michael Graff's avatar
Michael Graff committed
604
static void
605
build_msghdr_send(isc_socket_t *sock, isc_socketevent_t *dev,
606
		  struct msghdr *msg, struct iovec *iov, size_t *write_countp)
607 608 609 610
{
	unsigned int iovcount;
	isc_buffer_t *buffer;
	isc_region_t used;
611 612 613
	size_t write_count;
	size_t skip_count;

Andreas Gustafsson's avatar
Andreas Gustafsson committed
614
	memset(msg, 0, sizeof(*msg));
615 616 617 618

	if (sock->type == isc_sockettype_udp) {
		msg->msg_name = (void *)&dev->address.type.sa;
		msg->msg_namelen = dev->address.length;
Michael Graff's avatar
fix  
Michael Graff committed
619 620 621
	} else {
		msg->msg_name = NULL;
		msg->msg_namelen = 0;
622
	}
623 624

	buffer = ISC_LIST_HEAD(dev->bufferlist);
625
	write_count = 0;
Michael Graff's avatar
fix  
Michael Graff committed
626
	iovcount = 0;
627

628
	/*
629
	 * Single buffer I/O?  Skip what we've done so far in this region.
630 631
	 */
	if (buffer == NULL) {
632 633 634
		write_count = dev->region.length - dev->n;
		iov[0].iov_base = (void *)(dev->region.base + dev->n);
		iov[0].iov_len = write_count;
Michael Graff's avatar
fix  
Michael Graff committed
635
		iovcount = 1;
636

637 638 639 640 641 642 643
		goto config;
	}

	/*
	 * Multibuffer I/O.
	 * Skip the data in the buffer list that we have already written.
	 */
Michael Graff's avatar
fix  
Michael Graff committed
644
	skip_count = dev->n;
645
	while (buffer != NULL) {
646
		REQUIRE(ISC_BUFFER_VALID(buffer));
647
		if (skip_count < isc_buffer_usedlength(buffer))
648
			break;
649
		skip_count -= isc_buffer_usedlength(buffer);
650
		buffer = ISC_LIST_NEXT(buffer, link);
651 652 653
	}

	while (buffer != NULL) {
654
		INSIST(iovcount < MAXSCATTERGATHER_SEND);
655

656
		isc_buffer_usedregion(buffer, &used);
657

658
		if (used.length > 0) {
659 660 661 662 663
			iov[iovcount].iov_base = (void *)(used.base
							  + skip_count);
			iov[iovcount].iov_len = used.length - skip_count;
			write_count += (used.length - skip_count);
			skip_count = 0;
664 665 666 667 668
			iovcount++;
		}
		buffer = ISC_LIST_NEXT(buffer, link);
	}

669
	INSIST(skip_count == 0U);
Michael Graff's avatar
fix  
Michael Graff committed
670 671

 config:
672 673 674
	msg->msg_iov = iov;
	msg->msg_iovlen = iovcount;

675 676 677 678
#ifdef ISC_NET_BSD44MSGHDR
	msg->msg_control = NULL;
	msg->msg_controllen = 0;
	msg->msg_flags = 0;
679
#if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIPV6)
680 681 682
	if ((sock->type == isc_sockettype_udp)
	    && ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0)) {
		struct cmsghdr *cmsgp;
683 684
		struct in6_pktinfo *pktinfop;

685
		socket_log(sock, NULL, TRACE,
686 687
			   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_SENDTODATA,
			   "sendto pktinfo data, ifindex %u",
David Lawrence's avatar
David Lawrence committed
688
			   dev->pktinfo.ipi6_ifindex);
689

690 691 692
		msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
		INSIST(msg->msg_controllen <= sock->sendcmsgbuflen);
		msg->msg_control = (void *)sock->sendcmsgbuf;
693

694
		cmsgp = (struct cmsghdr *)sock->sendcmsgbuf;
695 696
		cmsgp->cmsg_level = IPPROTO_IPV6;
		cmsgp->cmsg_type = IPV6_PKTINFO;
697
		cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
698
		pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
699
		memcpy(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
700
	}
701
#endif /* USE_CMSG && ISC_PLATFORM_HAVEIPV6 */
702
#else /* ISC_NET_BSD44MSGHDR */
703 704
	msg->msg_accrights = NULL;
	msg->msg_accrightslen = 0;
705
#endif /* ISC_NET_BSD44MSGHDR */
706 707 708

	if (write_countp != NULL)
		*write_countp = write_count;
709 710
}

Michael Graff's avatar
fix  
Michael Graff committed
711
/*
712 713 714 715
 * Construct an iov array and attach it to the msghdr passed in.  This is
 * the RECV constructor, which will use the avialable region of the buffer
 * (if using a buffer list) or will use the internal region (if a single
 * buffer I/O is requested).
Michael Graff's avatar
fix  
Michael Graff committed
716 717 718 719 720 721 722
 *
 * Nothing can be NULL, and the done event must list at least one buffer
 * on the buffer linked list for this function to be meaningful.
 *
 * If read_countp != NULL, *read_countp will hold the number of bytes
 * this transaction can receive.
 */
Michael Graff's avatar
Michael Graff committed
723
static void
Michael Graff's avatar
fix  
Michael Graff committed
724
build_msghdr_recv(isc_socket_t *sock, isc_socketevent_t *dev,
725
		  struct msghdr *msg, struct iovec *iov, size_t *read_countp)
Michael Graff's avatar
fix  
Michael Graff committed
726 727 728 729 730 731
{
	unsigned int iovcount;
	isc_buffer_t *buffer;
	isc_region_t available;
	size_t read_count;

Andreas Gustafsson's avatar
Andreas Gustafsson committed
732
	memset(msg, 0, sizeof(struct msghdr));
Michael Graff's avatar
fix  
Michael Graff committed
733 734 735 736

	if (sock->type == isc_sockettype_udp) {
		memset(&dev->address, 0, sizeof(dev->address));
		msg->msg_name = (void *)&dev->address.type.sa;
Bob Halley's avatar
Bob Halley committed
737
		msg->msg_namelen = sizeof(dev->address.type);
738 739 740 741 742
#ifdef ISC_NET_RECVOVERFLOW
		/* If needed, steal one iovec for overflow detection. */
		maxiov--;
#endif
	} else { /* TCP */
Michael Graff's avatar
fix  
Michael Graff committed
743 744 745 746 747 748 749 750 751 752 753 754 755 756 757
		msg->msg_name = NULL;
		msg->msg_namelen = 0;
		dev->address = sock->address;
	}

	buffer = ISC_LIST_HEAD(dev->bufferlist);
	read_count = 0;

	/*
	 * Single buffer I/O?  Skip what we've done so far in this region.
	 */
	if (buffer == NULL) {
		read_count = dev->region.length - dev->n;
		iov[0].iov_base = (void *)(dev->region.base + dev->n);
		iov[0].iov_len = read_count;
758
		iovcount = 1;
Michael Graff's avatar
fix  
Michael Graff committed
759 760 761 762 763 764 765 766 767

		goto config;
	}

	/*
	 * Multibuffer I/O.
	 * Skip empty buffers.
	 */
	while (buffer != NULL) {
768
		REQUIRE(ISC_BUFFER_VALID(buffer));
769
		if (isc_buffer_availablelength(buffer) != 0)
Michael Graff's avatar
fix  
Michael Graff committed
770 771 772 773 774 775
			break;
		buffer = ISC_LIST_NEXT(buffer, link);
	}

	iovcount = 0;
	while (buffer != NULL) {
776
		INSIST(iovcount < MAXSCATTERGATHER_RECV);
Michael Graff's avatar
fix  
Michael Graff committed
777

778
		isc_buffer_availableregion(buffer, &available);
Michael Graff's avatar
fix  
Michael Graff committed
779 780 781 782 783 784 785 786 787 788

		if (available.length > 0) {
			iov[iovcount].iov_base = (void *)(available.base);
			iov[iovcount].iov_len = available.length;
			read_count += available.length;
			iovcount++;
		}
		buffer = ISC_LIST_NEXT(buffer, link);
	}

789 790 791 792 793 794 795 796 797 798 799 800 801 802 803
 config:

	/*
	 * If needed, set up to receive that one extra byte.  Note that
	 * we know there is at least one iov left, since we stole it
	 * at the top of this function.
	 */
#ifdef ISC_NET_RECVOVERFLOW
	if (sock->type == isc_sockettype_udp) {
		iov[iovcount].iov_base = (void *)(&sock->overflow);
		iov[iovcount].iov_len = 1;
		iovcount++;
	}
#endif

Michael Graff's avatar
fix  
Michael Graff committed
804 805 806 807 808 809
	msg->msg_iov = iov;
	msg->msg_iovlen = iovcount;

#ifdef ISC_NET_BSD44MSGHDR
	msg->msg_control = NULL;
	msg->msg_controllen = 0;
810 811
	msg->msg_flags = 0;
#if defined(USE_CMSG)
Michael Graff's avatar
Michael Graff committed
812
	if (sock->type == isc_sockettype_udp) {
813 814
		msg->msg_control = sock->recvcmsgbuf;
		msg->msg_controllen = sock->recvcmsgbuflen;
Michael Graff's avatar
Michael Graff committed
815
	}
816 817
#endif /* USE_CMSG */
#else /* ISC_NET_BSD44MSGHDR */
Michael Graff's avatar
fix  
Michael Graff committed
818 819
	msg->msg_accrights = NULL;
	msg->msg_accrightslen = 0;
820
#endif /* ISC_NET_BSD44MSGHDR */
Michael Graff's avatar
fix  
Michael Graff committed
821 822 823 824 825

	if (read_countp != NULL)
		*read_countp = read_count;
}

826 827 828 829 830 831 832 833 834 835 836 837 838 839 840
/*
 * Fill in dev->address for an outgoing request.
 *
 * UDP: use the caller-supplied 'address' when there is one, otherwise
 * the address stored in the socket.  TCP: 'address' must be NULL and
 * the socket's stored address is used.  Any other socket type leaves
 * dev->address untouched.
 */
static void
set_dev_address(isc_sockaddr_t *address, isc_socket_t *sock,
		isc_socketevent_t *dev)
{
	if (sock->type == isc_sockettype_udp) {
		dev->address = (address == NULL) ? sock->address : *address;
		return;
	}

	if (sock->type == isc_sockettype_tcp) {
		INSIST(address == NULL);
		dev->address = sock->address;
	}
}

841 842
static isc_socketevent_t *
allocate_socketevent(isc_socket_t *sock, isc_eventtype_t eventtype,
David Lawrence's avatar
David Lawrence committed
843
		     isc_taskaction_t action, const void *arg)
844 845 846 847 848 849
{
	isc_socketevent_t *ev;

	ev = (isc_socketevent_t *)isc_event_allocate(sock->manager->mctx,
						     sock, eventtype,
						     action, arg,
Andreas Gustafsson's avatar
Andreas Gustafsson committed
850
						     sizeof(*ev));
851 852 853 854 855

	if (ev == NULL)
		return (NULL);

	ev->result = ISC_R_UNEXPECTED;
856
	ISC_LINK_INIT(ev, ev_link);
857 858
	ISC_LIST_INIT(ev->bufferlist);
	ev->region.base = NULL;
Michael Graff's avatar
fix  
Michael Graff committed
859 860
	ev->n = 0;
	ev->offset = 0;
861
	ev->attributes = 0;
862 863 864 865

	return (ev);
}

866 867
#if defined(ISC_SOCKET_DEBUG)
/*
 * Debugging aid: print the contents of a msghdr to stdout -- the name
 * pointer and length, every iovec, and (on 4.4BSD style msghdrs) the
 * control buffer pointer and length.
 *
 * Length fields have platform-dependent integer types, so they are cast
 * to unsigned long to match the conversion specifiers.
 */
static void
dump_msg(struct msghdr *msg) {
	unsigned int i;

	printf("MSGHDR %p\n", (void *)msg);
	printf("\tname %p, namelen %lu\n", msg->msg_name,
	       (unsigned long)msg->msg_namelen);
	printf("\tiov %p, iovlen %lu\n", (void *)msg->msg_iov,
	       (unsigned long)msg->msg_iovlen);
	for (i = 0; i < (unsigned int)msg->msg_iovlen; i++)
		printf("\t\t%u\tbase %p, len %lu\n", i,
		       msg->msg_iov[i].iov_base,
		       (unsigned long)msg->msg_iov[i].iov_len);
#ifdef ISC_NET_BSD44MSGHDR
	printf("\tcontrol %p, controllen %lu\n", msg->msg_control,
	       (unsigned long)msg->msg_controllen);
#endif
}
#endif

Michael Graff's avatar
Michael Graff committed
885 886 887 888 889
#define DOIO_SUCCESS		0	/* i/o ok, event sent */
#define DOIO_SOFT		1	/* i/o ok, soft error, no event sent */
#define DOIO_HARD		2	/* i/o error, event sent */
#define DOIO_EOF		3	/* EOF, no event sent */

890
static int
891
doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
892
	int cc;
893
	struct iovec iov[MAXSCATTERGATHER_RECV];
894
	size_t read_count;
895
	size_t actual_count;
896
	struct msghdr msghdr;
897
	isc_buffer_t *buffer;
898
	int recv_errno;
899
	char strbuf[ISC_STRERRORSIZE];
900

901
	build_msghdr_recv(sock, dev, &msghdr, iov, &read_count);
902 903 904