socket.c 102 KB
Newer Older
Bob Halley's avatar
Bob Halley committed
1
/*
Mark Andrews's avatar
Mark Andrews committed
2
 * Copyright (C) 2004-2007  Internet Systems Consortium, Inc. ("ISC")
Mark Andrews's avatar
Mark Andrews committed
3
 * Copyright (C) 1998-2003  Internet Software Consortium.
4
 *
Automatic Updater's avatar
Automatic Updater committed
5
 * Permission to use, copy, modify, and/or distribute this software for any
Bob Halley's avatar
Bob Halley committed
6
7
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
8
 *
Mark Andrews's avatar
Mark Andrews committed
9
10
11
12
13
14
15
 * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
 * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
 * PERFORMANCE OF THIS SOFTWARE.
Bob Halley's avatar
Bob Halley committed
16
 */
Bob Halley's avatar
Bob Halley committed
17

18
/* $Id: socket.c,v 1.275 2007/12/14 03:52:40 marka Exp $ */
19
20

/*! \file */
David Lawrence's avatar
David Lawrence committed
21

Bob Halley's avatar
Bob Halley committed
22
#include <config.h>
23

24
#include <sys/param.h>
Michael Graff's avatar
Michael Graff committed
25
#include <sys/types.h>
Michael Graff's avatar
Michael Graff committed
26
#include <sys/socket.h>
27
28
29
30
#include <sys/stat.h>
#ifdef ISC_PLATFORM_HAVESYSUNH
#include <sys/un.h>
#endif
Michael Graff's avatar
Michael Graff committed
31
#include <sys/time.h>
Michael Graff's avatar
Michael Graff committed
32
33
#include <sys/uio.h>

34
#include <errno.h>
Andreas Gustafsson's avatar
Andreas Gustafsson committed
35
#include <fcntl.h>
36
37
38
39
40
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

41
#include <isc/buffer.h>
42
#include <isc/bufferlist.h>
43
#include <isc/condition.h>
44
#include <isc/formatcheck.h>
45
#include <isc/list.h>
Michael Graff's avatar
Michael Graff committed
46
#include <isc/log.h>
47
#include <isc/mem.h>
48
#include <isc/msgs.h>
49
#include <isc/mutex.h>
50
#include <isc/net.h>
51
#include <isc/once.h>
52
#include <isc/platform.h>
Michael Graff's avatar
Michael Graff committed
53
#include <isc/print.h>
54
#include <isc/region.h>
55
#include <isc/socket.h>
56
#include <isc/strerror.h>
57
#include <isc/task.h>
58
#include <isc/thread.h>
Michael Graff's avatar
Michael Graff committed
59
#include <isc/util.h>
60
#include <isc/xml.h>
Bob Halley's avatar
Bob Halley committed
61

62
63
#include "errno2result.h"

64
#ifndef ISC_PLATFORM_USETHREADS
65
#include "socket_p.h"
66
#endif /* ISC_PLATFORM_USETHREADS */
67

68
69
70
71
72
/*
 * Support names for sockets.
 */
#define ISC_SOCKET_NAMES 1

73
74
75
76
77

#if defined(SO_BSDCOMPAT) && defined(__linux__)
#include <sys/utsname.h>
#endif

78
/*%
79
 * Some systems define the socket length argument as an int, some as size_t,
80
 * some as socklen_t.  This is here so it can be easily changed if needed.
81
 */
82
#ifndef ISC_SOCKADDR_LEN_T
83
#define ISC_SOCKADDR_LEN_T unsigned int
84
#endif
85

86
/*%
87
88
 * Define what the possible "soft" errors can be.  These are non-fatal returns
 * of various network related functions, like recv() and so on.
89
90
91
92
 *
 * For some reason, BSDI (and perhaps others) will sometimes return <0
 * from recv() but will have errno==0.  This is broken, but we have to
 * work around it here.
93
 */
94
95
96
97
#define SOFT_ERROR(e)	((e) == EAGAIN || \
			 (e) == EWOULDBLOCK || \
			 (e) == EINTR || \
			 (e) == 0)
98

Michael Graff's avatar
Michael Graff committed
99
#define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)
100

101
/*!<
Michael Graff's avatar
Michael Graff committed
102
103
104
105
106
107
 * DLVL(90)  --  Function entry/exit and other tracing.
 * DLVL(70)  --  Socket "correctness" -- including returning of events, etc.
 * DLVL(60)  --  Socket data send/receive
 * DLVL(50)  --  Event tracing, including receiving/sending completion events.
 * DLVL(20)  --  Socket creation/destruction.
 */
108
109
110
111
112
113
114
115
116
117
118
#define TRACE_LEVEL		90
#define CORRECTNESS_LEVEL	70
#define IOEVENT_LEVEL		60
#define EVENT_LEVEL		50
#define CREATION_LEVEL		20

#define TRACE		DLVL(TRACE_LEVEL)
#define CORRECTNESS	DLVL(CORRECTNESS_LEVEL)
#define IOEVENT		DLVL(IOEVENT_LEVEL)
#define EVENT		DLVL(EVENT_LEVEL)
#define CREATION	DLVL(CREATION_LEVEL)
119

120
typedef isc_event_t intev_t;
Michael Graff's avatar
Michael Graff committed
121

122
123
#define SOCKET_MAGIC		ISC_MAGIC('I', 'O', 'i', 'o')
#define VALID_SOCKET(t)		ISC_MAGIC_VALID(t, SOCKET_MAGIC)
Michael Graff's avatar
Michael Graff committed
124

125
/*!
Michael Graff's avatar
Michael Graff committed
126
127
128
129
 * IPv6 control information.  If the socket is an IPv6 socket we want
 * to collect the destination address and interface so the client can
 * set them on outgoing packets.
 */
130
#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
Michael Graff's avatar
Michael Graff committed
131
132
133
134
135
#ifndef USE_CMSG
#define USE_CMSG	1
#endif
#endif

136
/*%
137
 * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
Michael Graff's avatar
Michael Graff committed
138
139
140
141
142
143
144
145
146
 * a setsockopt() like interface to request timestamps, and if the OS
 * doesn't do it for us, call gettimeofday() on every UDP receive?
 */
#ifdef SO_TIMESTAMP
#ifndef USE_CMSG
#define USE_CMSG	1
#endif
#endif

147
148
149
150
151
/*%
 * The size to raise the receive buffer to (from BIND 8).
 */
#define RCVBUFSIZE (32*1024)

152
/*%
153
154
155
156
 * The number of times a send operation is repeated if the result is EINTR.
 */
#define NRETRIES 10

157
158
struct isc_socket {
	/* Not locked. */
159
160
161
162
	unsigned int		magic;
	isc_socketmgr_t	       *manager;
	isc_mutex_t		lock;
	isc_sockettype_t	type;
Michael Graff's avatar
Michael Graff committed
163

164
	/* Locked by socket lock. */
165
	ISC_LINK(isc_socket_t)	link;
166
167
	unsigned int		references;
	int			fd;
168
	int			pf;
169

170
171
172
173
174
#ifdef ISC_SOCKET_NAMES
	char				name[16];
	void *				tag;
#endif

175
	ISC_LIST(isc_socketevent_t)		send_list;
176
	ISC_LIST(isc_socketevent_t)		recv_list;
177
	ISC_LIST(isc_socket_newconnev_t)	accept_list;
178
179
180
181
182
183
184
	isc_socket_connev_t		       *connect_ev;

	/*
	 * Internal events.  Posted when a descriptor is readable or
	 * writable.  These are statically allocated and never freed.
	 * They will be set to non-purgable before use.
	 */
185
186
	intev_t			readable_ev;
	intev_t			writable_ev;
187

188
	isc_sockaddr_t		peer_address;  /* remote address */
189

190
191
192
193
194
195
196
	unsigned int		pending_recv : 1,
				pending_send : 1,
				pending_accept : 1,
				listener : 1, /* listener socket */
				connected : 1,
				connecting : 1, /* connect pending */
				bound : 1; /* bound to local addr */
197

198
#ifdef ISC_NET_RECVOVERFLOW
199
	unsigned char		overflow; /* used for MSG_TRUNC fake */
200
#endif
201
202
203
204
205

	char			*recvcmsgbuf;
	ISC_SOCKADDR_LEN_T	recvcmsgbuflen;
	char			*sendcmsgbuf;
	ISC_SOCKADDR_LEN_T	sendcmsgbuflen;
206
207
208
209
210

	void			*fdwatcharg;
	isc_sockfdwatch_t	fdwatchcb;
	int			fdwatchflags;
	isc_task_t		*fdwatchtask;
211
212
};

213
214
215
#define SOCKET_MANAGER_MAGIC	ISC_MAGIC('I', 'O', 'm', 'g')
#define VALID_MANAGER(m)	ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)

216
217
struct isc_socketmgr {
	/* Not locked. */
218
219
220
	unsigned int		magic;
	isc_mem_t	       *mctx;
	isc_mutex_t		lock;
221
	/* Locked by manager lock. */
222
	ISC_LIST(isc_socket_t)	socklist;
223
224
225
226
227
	fd_set			read_fds;
	fd_set			write_fds;
	isc_socket_t	       *fds[FD_SETSIZE];
	int			fdstate[FD_SETSIZE];
	int			maxfd;
228
229
230
#ifdef ISC_PLATFORM_USETHREADS
	isc_thread_t		watcher;
	isc_condition_t		shutdown_ok;
231
	int			pipe_fds[2];
232
#else /* ISC_PLATFORM_USETHREADS */
233
	unsigned int		refs;
234
#endif /* ISC_PLATFORM_USETHREADS */
235
236
};

237
238
#ifndef ISC_PLATFORM_USETHREADS
static isc_socketmgr_t *socketmgr = NULL;
239
#endif /* ISC_PLATFORM_USETHREADS */
240

241
242
243
244
#define CLOSED			0	/* this one must be zero */
#define MANAGED			1
#define CLOSE_PENDING		2
#define MANAGER_CLOSE_PENDING	3
Michael Graff's avatar
Michael Graff committed
245

246
247
248
249
250
251
252
253
254
255
/*
 * send() and recv() iovec counts
 */
#define MAXSCATTERGATHER_SEND	(ISC_SOCKET_MAXSCATTERGATHER)
#ifdef ISC_NET_RECVOVERFLOW
# define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER + 1)
#else
# define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER)
#endif

256
257
static void send_recvdone_event(isc_socket_t *, isc_socketevent_t **);
static void send_senddone_event(isc_socket_t *, isc_socketevent_t **);
Bob Halley's avatar
Bob Halley committed
258
259
260
261
static void free_socket(isc_socket_t **);
static isc_result_t allocate_socket(isc_socketmgr_t *, isc_sockettype_t,
				    isc_socket_t **);
static void destroy(isc_socket_t **);
262
263
264
265
static void internal_accept(isc_task_t *, isc_event_t *);
static void internal_connect(isc_task_t *, isc_event_t *);
static void internal_recv(isc_task_t *, isc_event_t *);
static void internal_send(isc_task_t *, isc_event_t *);
266
267
static void internal_fdwatch_write(isc_task_t *, isc_event_t *);
static void internal_fdwatch_read(isc_task_t *, isc_event_t *);
268
static void process_cmsg(isc_socket_t *, struct msghdr *, isc_socketevent_t *);
Michael Graff's avatar
Michael Graff committed
269
static void build_msghdr_send(isc_socket_t *, isc_socketevent_t *,
270
			      struct msghdr *, struct iovec *, size_t *);
Michael Graff's avatar
Michael Graff committed
271
static void build_msghdr_recv(isc_socket_t *, isc_socketevent_t *,
272
			      struct msghdr *, struct iovec *, size_t *);
Michael Graff's avatar
Michael Graff committed
273
274
275

#define SELECT_POKE_SHUTDOWN		(-1)
#define SELECT_POKE_NOTHING		(-2)
276
#define SELECT_POKE_READ		(-3)
277
#define SELECT_POKE_ACCEPT		(-3) /*%< Same as _READ */
278
#define SELECT_POKE_WRITE		(-4)
279
#define SELECT_POKE_CONNECT		(-4) /*%< Same as _WRITE */
280
#define SELECT_POKE_CLOSE		(-5)
281

282
283
#define SOCK_DEAD(s)			((s)->references == 0)

284
285
286
287
/*
 * Write a printf-style message about a socket manager to the ISC log.
 * The formatted text is prefixed with the manager's address.  Nothing
 * is formatted when the log context would not log at 'level'.
 */
static void
manager_log(isc_socketmgr_t *sockmgr,
	    isc_logcategory_t *category, isc_logmodule_t *module, int level,
	    const char *fmt, ...) ISC_FORMAT_PRINTF(5, 6);

static void
manager_log(isc_socketmgr_t *sockmgr,
	    isc_logcategory_t *category, isc_logmodule_t *module, int level,
	    const char *fmt, ...)
{
	if (isc_log_wouldlog(isc_lctx, level)) {
		char text[2048];
		va_list args;

		/* Expand the caller's format; over-long text is truncated. */
		va_start(args, fmt);
		vsnprintf(text, sizeof(text), fmt, args);
		va_end(args);

		isc_log_write(isc_lctx, category, module, level,
			      "sockmgr %p: %s", sockmgr, text);
	}
}

307
308
309
310
311
/*
 * Write a printf-style message about a socket to the ISC log through
 * the message catalog interface.  The text is prefixed with the
 * socket's address and, when 'address' is non-NULL, with the formatted
 * socket address as well.  Nothing is formatted when the log context
 * would not log at 'level'.
 */
static void
socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
	   isc_msgcat_t *msgcat, int msgset, int message,
	   const char *fmt, ...) ISC_FORMAT_PRINTF(9, 10);

static void
socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
	   isc_msgcat_t *msgcat, int msgset, int message,
	   const char *fmt, ...)
{
	char text[2048];
	char addrtext[ISC_SOCKADDR_FORMATSIZE];
	va_list args;

	if (! isc_log_wouldlog(isc_lctx, level))
		return;

	va_start(args, fmt);
	vsnprintf(text, sizeof(text), fmt, args);
	va_end(args);

	if (address != NULL) {
		/* Include the textual form of the address in the prefix. */
		isc_sockaddr_format(address, addrtext, sizeof(addrtext));
		isc_log_iwrite(isc_lctx, category, module, level,
			       msgcat, msgset, message,
			       "socket %p %s: %s", sock, addrtext, text);
		return;
	}

	isc_log_iwrite(isc_lctx, category, module, level,
		       msgcat, msgset, message,
		       "socket %p: %s", sock, text);
}

341
static void
342
wakeup_socket(isc_socketmgr_t *manager, int fd, int msg) {
343
344
345
	isc_socket_t *sock;

	/*
346
347
348
	 * This is a wakeup on a socket.  If the socket is not in the
	 * process of being closed, start watching it for either reads
	 * or writes.
349
	 */
Andreas Gustafsson's avatar
   
Andreas Gustafsson committed
350

Mark Andrews's avatar
Mark Andrews committed
351
	INSIST(fd >= 0 && fd < (int)FD_SETSIZE);
352

353
354
	if (manager->fdstate[fd] == CLOSE_PENDING
	    || manager->fdstate[fd] == MANAGER_CLOSE_PENDING) {
355
356
		FD_CLR(fd, &manager->read_fds);
		FD_CLR(fd, &manager->write_fds);
357
358
		if (manager->fdstate[fd] == CLOSE_PENDING)
			(void)close(fd);
Michael Graff's avatar
Michael Graff committed
359
		manager->fdstate[fd] = CLOSED;
360
361
362
363
364
365
366
367
		return;
	}
	if (manager->fdstate[fd] != MANAGED)
		return;

	sock = manager->fds[fd];

	/*
Mark Andrews's avatar
Mark Andrews committed
368
	 * Set requested bit.
369
	 */
370
	if (msg == SELECT_POKE_READ)
371
		FD_SET(sock->fd, &manager->read_fds);
372
	if (msg == SELECT_POKE_WRITE)
373
374
375
376
		FD_SET(sock->fd, &manager->write_fds);
}

#ifdef ISC_PLATFORM_USETHREADS
377
/*
Michael Graff's avatar
Michael Graff committed
378
 * Poke the select loop when there is something for us to do.
379
380
 * The write is required (by POSIX) to complete.  That is, we
 * will not get partial writes.
381
382
 */
static void
383
select_poke(isc_socketmgr_t *mgr, int fd, int msg) {
Michael Graff's avatar
Michael Graff committed
384
	int cc;
385
	int buf[2];
386
	char strbuf[ISC_STRERRORSIZE];
387
388
389

	buf[0] = fd;
	buf[1] = msg;
Michael Graff's avatar
Michael Graff committed
390

391
	do {
392
		cc = write(mgr->pipe_fds[1], buf, sizeof(buf));
393
394
395
396
397
398
399
400
401
402
#ifdef ENOSR
		/*
		 * Treat ENOSR as EAGAIN but loop slowly as it is
		 * unlikely to clear fast.
		 */
		if (cc < 0 && errno == ENOSR) {
			sleep(1);
			errno = EAGAIN;
		}
#endif
403
	} while (cc < 0 && SOFT_ERROR(errno));
404

405
406
	if (cc < 0) {
		isc__strerror(errno, strbuf, sizeof(strbuf));
Michael Graff's avatar
Michael Graff committed
407
		FATAL_ERROR(__FILE__, __LINE__,
408
409
410
411
			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
					   ISC_MSG_WRITEFAILED,
					   "write() failed "
					   "during watcher poke: %s"),
412
413
			    strbuf);
	}
414

415
	INSIST(cc == sizeof(buf));
416
417
418
}

/*
Andreas Gustafsson's avatar
Andreas Gustafsson committed
419
 * Read a message on the internal fd.
420
 */
421
422
423
static void
select_readmsg(isc_socketmgr_t *mgr, int *fd, int *msg) {
	int buf[2];
Michael Graff's avatar
Michael Graff committed
424
	int cc;
425
	char strbuf[ISC_STRERRORSIZE];
Michael Graff's avatar
Michael Graff committed
426

427
	cc = read(mgr->pipe_fds[0], buf, sizeof(buf));
Michael Graff's avatar
Michael Graff committed
428
	if (cc < 0) {
429
		*msg = SELECT_POKE_NOTHING;
430
		*fd = -1;	/* Silence compiler. */
Michael Graff's avatar
Michael Graff committed
431
		if (SOFT_ERROR(errno))
432
			return;
Michael Graff's avatar
Michael Graff committed
433

434
		isc__strerror(errno, strbuf, sizeof(strbuf));
Michael Graff's avatar
Michael Graff committed
435
		FATAL_ERROR(__FILE__, __LINE__,
436
437
438
439
			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
					   ISC_MSG_READFAILED,
					   "read() failed "
					   "during watcher poke: %s"),
440
			    strbuf);
441
		
442
		return;
Michael Graff's avatar
Michael Graff committed
443
	}
444
	INSIST(cc == sizeof(buf));
445

446
447
	*fd = buf[0];
	*msg = buf[1];
448
}
449
#else /* ISC_PLATFORM_USETHREADS */
450
451
452
453
/*
 * Update the state of the socketmgr when something changes.
 */
static void
454
select_poke(isc_socketmgr_t *manager, int fd, int msg) {
455
456
	if (msg == SELECT_POKE_SHUTDOWN)
		return;
457
458
	else if (fd >= 0)
		wakeup_socket(manager, fd, msg);
459
460
	return;
}
461
#endif /* ISC_PLATFORM_USETHREADS */
462
463

/*
Andreas Gustafsson's avatar
Andreas Gustafsson committed
464
 * Make a fd non-blocking.
465
 */
Michael Graff's avatar
Michael Graff committed
466
static isc_result_t
467
make_nonblock(int fd) {
Michael Graff's avatar
Michael Graff committed
468
469
	int ret;
	int flags;
470
	char strbuf[ISC_STRERRORSIZE];
471
472
#ifdef USE_FIONBIO_IOCTL
	int on = 1;
473

474
475
	ret = ioctl(fd, FIONBIO, (char *)&on);
#else
Michael Graff's avatar
Michael Graff committed
476
	flags = fcntl(fd, F_GETFL, 0);
477
	flags |= PORT_NONBLOCK;
Michael Graff's avatar
Michael Graff committed
478
	ret = fcntl(fd, F_SETFL, flags);
479
#endif
480

Michael Graff's avatar
Michael Graff committed
481
	if (ret == -1) {
482
		isc__strerror(errno, strbuf, sizeof(strbuf));
Michael Graff's avatar
Michael Graff committed
483
		UNEXPECTED_ERROR(__FILE__, __LINE__,
484
485
486
487
488
489
#ifdef USE_FIONBIO_IOCTL
				 "ioctl(%d, FIONBIO, &on): %s", fd,
#else
				 "fcntl(%d, F_SETFL, %d): %s", fd, flags,
#endif
				 strbuf);
Michael Graff's avatar
Michael Graff committed
490

Michael Graff's avatar
Michael Graff committed
491
		return (ISC_R_UNEXPECTED);
Michael Graff's avatar
Michael Graff committed
492
493
	}

Michael Graff's avatar
Michael Graff committed
494
	return (ISC_R_SUCCESS);
495
496
}

497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
#ifdef USE_CMSG
/*
 * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE.
 * In order to ensure as much portability as possible, we provide wrapper
 * functions of these macros.
 * Note that cmsg_space() could run slow on OSes that do not have
 * CMSG_SPACE.
 */

/*
 * Return the value to store in cmsg_len for a control message carrying
 * 'len' bytes of data.
 */
static inline ISC_SOCKADDR_LEN_T
cmsg_len(ISC_SOCKADDR_LEN_T len) {
#ifdef CMSG_LEN
	return (CMSG_LEN(len));
#else
	/*
	 * Apply CMSG_DATA to a null cmsghdr pointer so that the result is
	 * the offset of the data from the start of the header.  Cast NULL
	 * so that any pointer arithmetic performed by CMSG_DATA is correct.
	 */
	ISC_SOCKADDR_LEN_T dataoffset =
		(ISC_SOCKADDR_LEN_T)CMSG_DATA(((struct cmsghdr *)NULL));

	return (dataoffset + len);
#endif
}

/*
 * Return the number of bytes a control message carrying 'len' bytes of
 * data occupies in a control buffer.  Without CMSG_SPACE this is
 * measured by laying out one header in a scratch buffer and asking
 * CMSG_NXTHDR where the next one would start; 0 is returned if
 * CMSG_NXTHDR reports no next header.
 */
static inline ISC_SOCKADDR_LEN_T
cmsg_space(ISC_SOCKADDR_LEN_T len) {
#ifdef CMSG_SPACE
	return (CMSG_SPACE(len));
#else
	/*
	 * XXX: The buffer length is an ad-hoc value, but should be enough
	 * in a practical sense.
	 */
	char scratch[sizeof(struct cmsghdr) + 1024];
	struct msghdr probe;
	struct cmsghdr *first;
	struct cmsghdr *second;

	memset(&probe, 0, sizeof(probe));
	probe.msg_control = scratch;
	probe.msg_controllen = sizeof(scratch);

	first = (struct cmsghdr *)scratch;
	first->cmsg_len = cmsg_len(len);

	second = CMSG_NXTHDR(&probe, first);
	if (second == NULL)
		return (0);

	return ((char *)second - (char *)probe.msg_control);
#endif
}
#endif /* USE_CMSG */

550
551
552
553
/*
 * Process control messages received on a socket.
 */
static void
554
process_cmsg(isc_socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
Michael Graff's avatar
Michael Graff committed
555
#ifdef USE_CMSG
556
	struct cmsghdr *cmsgp;
557
#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
Michael Graff's avatar
Michael Graff committed
558
559
560
561
562
563
564
	struct in6_pktinfo *pktinfop;
#endif
#ifdef SO_TIMESTAMP
	struct timeval *timevalp;
#endif
#endif

565
566
567
568
569
570
	/*
	 * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined.
	 * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined.
	 * They are all here, outside of the CPP tests, because it is
	 * more consistent with the usual ISC coding style.
	 */
571
	UNUSED(sock);
572
573
574
	UNUSED(msg);
	UNUSED(dev);

575
#ifdef ISC_NET_BSD44MSGHDR
576

Bob Halley's avatar
Bob Halley committed
577
#ifdef MSG_TRUNC
578
579
	if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC)
		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
Bob Halley's avatar
Bob Halley committed
580
#endif
581

Bob Halley's avatar
Bob Halley committed
582
#ifdef MSG_CTRUNC
583
584
	if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC)
		dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
Bob Halley's avatar
Bob Halley committed
585
#endif
586

Michael Graff's avatar
Michael Graff committed
587
588
589
#ifndef USE_CMSG
	return;
#else
Mark Andrews's avatar
Mark Andrews committed
590
	if (msg->msg_controllen == 0U || msg->msg_control == NULL)
591
		return;
Michael Graff's avatar
Michael Graff committed
592
593
594
595

#ifdef SO_TIMESTAMP
	timevalp = NULL;
#endif
596
#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
Michael Graff's avatar
Michael Graff committed
597
598
599
600
601
	pktinfop = NULL;
#endif

	cmsgp = CMSG_FIRSTHDR(msg);
	while (cmsgp != NULL) {
602
		socket_log(sock, NULL, TRACE,
603
604
			   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PROCESSCMSG,
			   "processing cmsg %p", cmsgp);
Michael Graff's avatar
Michael Graff committed
605

606
#ifdef ISC_PLATFORM_HAVEIN6PKTINFO
607
608
		if (cmsgp->cmsg_level == IPPROTO_IPV6
		    && cmsgp->cmsg_type == IPV6_PKTINFO) {
609

Michael Graff's avatar
Michael Graff committed
610
			pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
611
612
			memcpy(&dev->pktinfo, pktinfop,
			       sizeof(struct in6_pktinfo));
Michael Graff's avatar
Michael Graff committed
613
			dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
614
			socket_log(sock, NULL, TRACE,
615
616
617
				   isc_msgcat, ISC_MSGSET_SOCKET,
				   ISC_MSG_IFRECEIVED,
				   "interface received on ifindex %u",
David Lawrence's avatar
David Lawrence committed
618
				   dev->pktinfo.ipi6_ifindex);
619
620
			if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr))
				dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;				
Michael Graff's avatar
Michael Graff committed
621
622
623
624
625
			goto next;
		}
#endif

#ifdef SO_TIMESTAMP
626
627
		if (cmsgp->cmsg_level == SOL_SOCKET
		    && cmsgp->cmsg_type == SCM_TIMESTAMP) {
Michael Graff's avatar
Michael Graff committed
628
629
630
631
632
633
634
635
636
637
638
639
640
			timevalp = (struct timeval *)CMSG_DATA(cmsgp);
			dev->timestamp.seconds = timevalp->tv_sec;
			dev->timestamp.nanoseconds = timevalp->tv_usec * 1000;
			dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
			goto next;
		}
#endif

	next:
		cmsgp = CMSG_NXTHDR(msg, cmsgp);
	}
#endif /* USE_CMSG */

641
#endif /* ISC_NET_BSD44MSGHDR */
642
643
}

644
/*
645
646
647
648
 * Construct an iov array and attach it to the msghdr passed in.  This is
 * the SEND constructor, which will use the used region of the buffer
 * (if using a buffer list) or will use the internal region (if a single
 * buffer I/O is requested).
649
650
651
 *
 * Nothing can be NULL, and the done event must list at least one buffer
 * on the buffer linked list for this function to be meaningful.
Michael Graff's avatar
fix    
Michael Graff committed
652
653
654
 *
 * If write_countp != NULL, *write_countp will hold the number of bytes
 * this transaction can send.
655
 */
Michael Graff's avatar
Michael Graff committed
656
static void
657
build_msghdr_send(isc_socket_t *sock, isc_socketevent_t *dev,
658
		  struct msghdr *msg, struct iovec *iov, size_t *write_countp)
659
660
661
662
{
	unsigned int iovcount;
	isc_buffer_t *buffer;
	isc_region_t used;
663
664
665
	size_t write_count;
	size_t skip_count;

Andreas Gustafsson's avatar
Andreas Gustafsson committed
666
	memset(msg, 0, sizeof(*msg));
667
668
669
670

	if (sock->type == isc_sockettype_udp) {
		msg->msg_name = (void *)&dev->address.type.sa;
		msg->msg_namelen = dev->address.length;
Michael Graff's avatar
fix    
Michael Graff committed
671
672
673
	} else {
		msg->msg_name = NULL;
		msg->msg_namelen = 0;
674
	}
675
676

	buffer = ISC_LIST_HEAD(dev->bufferlist);
677
	write_count = 0;
Michael Graff's avatar
fix    
Michael Graff committed
678
	iovcount = 0;
679

680
	/*
681
	 * Single buffer I/O?  Skip what we've done so far in this region.
682
683
	 */
	if (buffer == NULL) {
684
685
686
		write_count = dev->region.length - dev->n;
		iov[0].iov_base = (void *)(dev->region.base + dev->n);
		iov[0].iov_len = write_count;
Michael Graff's avatar
fix    
Michael Graff committed
687
		iovcount = 1;
688

689
690
691
692
693
694
695
		goto config;
	}

	/*
	 * Multibuffer I/O.
	 * Skip the data in the buffer list that we have already written.
	 */
Michael Graff's avatar
fix    
Michael Graff committed
696
	skip_count = dev->n;
697
	while (buffer != NULL) {
698
		REQUIRE(ISC_BUFFER_VALID(buffer));
699
		if (skip_count < isc_buffer_usedlength(buffer))
700
			break;
701
		skip_count -= isc_buffer_usedlength(buffer);
702
		buffer = ISC_LIST_NEXT(buffer, link);
703
704
705
	}

	while (buffer != NULL) {
706
		INSIST(iovcount < MAXSCATTERGATHER_SEND);
707

708
		isc_buffer_usedregion(buffer, &used);
709

710
		if (used.length > 0) {
711
712
713
714
715
			iov[iovcount].iov_base = (void *)(used.base
							  + skip_count);
			iov[iovcount].iov_len = used.length - skip_count;
			write_count += (used.length - skip_count);
			skip_count = 0;
716
717
718
719
720
			iovcount++;
		}
		buffer = ISC_LIST_NEXT(buffer, link);
	}

721
	INSIST(skip_count == 0U);
Michael Graff's avatar
fix    
Michael Graff committed
722
723

 config:
724
725
726
	msg->msg_iov = iov;
	msg->msg_iovlen = iovcount;

727
728
729
730
#ifdef ISC_NET_BSD44MSGHDR
	msg->msg_control = NULL;
	msg->msg_controllen = 0;
	msg->msg_flags = 0;
731
#if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
732
733
734
	if ((sock->type == isc_sockettype_udp)
	    && ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0)) {
		struct cmsghdr *cmsgp;
735
736
		struct in6_pktinfo *pktinfop;

737
		socket_log(sock, NULL, TRACE,
738
739
			   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_SENDTODATA,
			   "sendto pktinfo data, ifindex %u",
David Lawrence's avatar
David Lawrence committed
740
			   dev->pktinfo.ipi6_ifindex);
741

742
743
744
		msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
		INSIST(msg->msg_controllen <= sock->sendcmsgbuflen);
		msg->msg_control = (void *)sock->sendcmsgbuf;
745

746
		cmsgp = (struct cmsghdr *)sock->sendcmsgbuf;
747
748
		cmsgp->cmsg_level = IPPROTO_IPV6;
		cmsgp->cmsg_type = IPV6_PKTINFO;
749
		cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
750
		pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
751
		memcpy(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
752
	}
753
#endif /* USE_CMSG && ISC_PLATFORM_HAVEIPV6 */
754
#else /* ISC_NET_BSD44MSGHDR */
755
756
	msg->msg_accrights = NULL;
	msg->msg_accrightslen = 0;
757
#endif /* ISC_NET_BSD44MSGHDR */
758
759
760

	if (write_countp != NULL)
		*write_countp = write_count;
761
762
}

Michael Graff's avatar
fix    
Michael Graff committed
763
/*
764
765
766
767
 * Construct an iov array and attach it to the msghdr passed in.  This is
 * the RECV constructor, which will use the avialable region of the buffer
 * (if using a buffer list) or will use the internal region (if a single
 * buffer I/O is requested).
Michael Graff's avatar
fix    
Michael Graff committed
768
769
770
771
772
773
774
 *
 * Nothing can be NULL, and the done event must list at least one buffer
 * on the buffer linked list for this function to be meaningful.
 *
 * If read_countp != NULL, *read_countp will hold the number of bytes
 * this transaction can receive.
 */
Michael Graff's avatar
Michael Graff committed
775
static void
Michael Graff's avatar
fix    
Michael Graff committed
776
build_msghdr_recv(isc_socket_t *sock, isc_socketevent_t *dev,
777
		  struct msghdr *msg, struct iovec *iov, size_t *read_countp)
Michael Graff's avatar
fix    
Michael Graff committed
778
779
780
781
782
783
{
	unsigned int iovcount;
	isc_buffer_t *buffer;
	isc_region_t available;
	size_t read_count;

Andreas Gustafsson's avatar
Andreas Gustafsson committed
784
	memset(msg, 0, sizeof(struct msghdr));
Michael Graff's avatar
fix    
Michael Graff committed
785
786
787

	if (sock->type == isc_sockettype_udp) {
		memset(&dev->address, 0, sizeof(dev->address));
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
#ifdef BROKEN_RECVMSG
		if (sock->pf == AF_INET) {
			msg->msg_name = (void *)&dev->address.type.sin;
			msg->msg_namelen = sizeof(dev->address.type.sin6);
		} else if (sock->pf == AF_INET6) {
			msg->msg_name = (void *)&dev->address.type.sin6;
			msg->msg_namelen = sizeof(dev->address.type.sin6);
#ifdef ISC_PLATFORM_HAVESYSUNH
		} else if (sock->pf == AF_UNIX) {
			msg->msg_name = (void *)&dev->address.type.sunix;
			msg->msg_namelen = sizeof(dev->address.type.sunix);
#endif
		} else {
			msg->msg_name = (void *)&dev->address.type.sa;
			msg->msg_namelen = sizeof(dev->address.type);
		}
#else
Michael Graff's avatar
fix    
Michael Graff committed
805
		msg->msg_name = (void *)&dev->address.type.sa;
Bob Halley's avatar
Bob Halley committed
806
		msg->msg_namelen = sizeof(dev->address.type);
807
#endif
808
809
810
811
812
#ifdef ISC_NET_RECVOVERFLOW
		/* If needed, steal one iovec for overflow detection. */
		maxiov--;
#endif
	} else { /* TCP */
Michael Graff's avatar
fix    
Michael Graff committed
813
814
		msg->msg_name = NULL;
		msg->msg_namelen = 0;
815
		dev->address = sock->peer_address;
Michael Graff's avatar
fix    
Michael Graff committed
816
817
818
819
820
821
822
823
824
825
826
827
	}

	buffer = ISC_LIST_HEAD(dev->bufferlist);
	read_count = 0;

	/*
	 * Single buffer I/O?  Skip what we've done so far in this region.
	 */
	if (buffer == NULL) {
		read_count = dev->region.length - dev->n;
		iov[0].iov_base = (void *)(dev->region.base + dev->n);
		iov[0].iov_len = read_count;
828
		iovcount = 1;
Michael Graff's avatar
fix    
Michael Graff committed
829
830
831
832
833
834
835
836
837

		goto config;
	}

	/*
	 * Multibuffer I/O.
	 * Skip empty buffers.
	 */
	while (buffer != NULL) {
838
		REQUIRE(ISC_BUFFER_VALID(buffer));
839
		if (isc_buffer_availablelength(buffer) != 0)
Michael Graff's avatar
fix    
Michael Graff committed
840
841
842
843
844
845
			break;
		buffer = ISC_LIST_NEXT(buffer, link);
	}

	iovcount = 0;
	while (buffer != NULL) {
846
		INSIST(iovcount < MAXSCATTERGATHER_RECV);
Michael Graff's avatar
fix    
Michael Graff committed
847

848
		isc_buffer_availableregion(buffer, &available);
Michael Graff's avatar
fix    
Michael Graff committed
849
850
851
852
853
854
855
856
857
858

		if (available.length > 0) {
			iov[iovcount].iov_base = (void *)(available.base);
			iov[iovcount].iov_len = available.length;
			read_count += available.length;
			iovcount++;
		}
		buffer = ISC_LIST_NEXT(buffer, link);
	}

859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
 config:

	/*
	 * If needed, set up to receive that one extra byte.  Note that
	 * we know there is at least one iov left, since we stole it
	 * at the top of this function.
	 */
#ifdef ISC_NET_RECVOVERFLOW
	if (sock->type == isc_sockettype_udp) {
		iov[iovcount].iov_base = (void *)(&sock->overflow);
		iov[iovcount].iov_len = 1;
		iovcount++;
	}
#endif

Michael Graff's avatar
fix    
Michael Graff committed
874
875
876
877
878
879
	msg->msg_iov = iov;
	msg->msg_iovlen = iovcount;

#ifdef ISC_NET_BSD44MSGHDR
	msg->msg_control = NULL;
	msg->msg_controllen = 0;
880
881
	msg->msg_flags = 0;
#if defined(USE_CMSG)
Michael Graff's avatar
Michael Graff committed
882
	if (sock->type == isc_sockettype_udp) {
883
884
		msg->msg_control = sock->recvcmsgbuf;
		msg->msg_controllen = sock->recvcmsgbuflen;
Michael Graff's avatar
Michael Graff committed
885
	}
886
887
#endif /* USE_CMSG */
#else /* ISC_NET_BSD44MSGHDR */
Michael Graff's avatar
fix    
Michael Graff committed
888
889
	msg->msg_accrights = NULL;
	msg->msg_accrightslen = 0;
890
#endif /* ISC_NET_BSD44MSGHDR */
Michael Graff's avatar
fix    
Michael Graff committed
891
892
893
894
895

	if (read_countp != NULL)
		*read_countp = read_count;
}

896
897
898
899
900
901
902
903
/*
 * Fill in dev->address for an event on 'sock'.
 *
 * UDP sockets: use the caller-supplied 'address' when one is given,
 * otherwise fall back to the socket's recorded peer address.
 *
 * TCP sockets: the caller must pass a NULL 'address'; the socket's
 * peer address is always used.
 *
 * For any other socket type dev->address is left untouched.
 */
static void
set_dev_address(isc_sockaddr_t *address, isc_socket_t *sock,
		isc_socketevent_t *dev)
{
	isc_sockaddr_t *source;

	switch (sock->type) {
	case isc_sockettype_udp:
		source = (address != NULL) ? address : &sock->peer_address;
		dev->address = *source;
		break;
	case isc_sockettype_tcp:
		INSIST(address == NULL);
		dev->address = sock->peer_address;
		break;
	default:
		/* Other socket types: nothing to record. */
		break;
	}
}

911
912
913
914
915
916
917
918
919
/*
 * Destructor hook installed on socket events by allocate_socketevent().
 *
 * Asserts that the event no longer has any buffers attached to it and
 * then hands the event to the destructor that was saved in the
 * socket event's 'destroy' member.
 */
static void
destroy_socketevent(isc_event_t *event) {
	isc_socketevent_t *sev;

	sev = (isc_socketevent_t *)event;

	/* The buffer list must be empty by the time the event is destroyed. */
	INSIST(ISC_LIST_EMPTY(sev->bufferlist));

	sev->destroy(event);
}

920
921
static isc_socketevent_t *
allocate_socketevent(isc_socket_t *sock, isc_eventtype_t eventtype,
David Lawrence's avatar
David Lawrence committed
922
		     isc_taskaction_t action, const void *arg)
923
924
925
926
927
928
{
	isc_socketevent_t *ev;

	ev = (isc_socketevent_t *)isc_event_allocate(sock->manager->mctx,
						     sock, eventtype,
						     action, arg,
Andreas Gustafsson's avatar
Andreas Gustafsson committed
929
						     sizeof(*ev));
930
931
932
933
934

	if (ev == NULL)
		return (NULL);

	ev->result = ISC_R_UNEXPECTED;
935
	ISC_LINK_INIT(ev, ev_link);
936
937
	ISC_LIST_INIT(ev->bufferlist);
	ev->region.base = NULL;
Michael Graff's avatar
fix    
Michael Graff committed
938
939
	ev->n = 0;
	ev->offset = 0;
940
	ev->attributes = 0;
941
942
	ev->destroy = ev->ev_destroy;
	ev->ev_destroy = destroy_socketevent;
943
944
945
946

	return (ev);
}

947
948
#if defined(ISC_SOCKET_DEBUG)
/*
 * Debugging aid: print the contents of 'msg' to stdout.
 *
 * Prints the msghdr's own address, its name pointer and length, its
 * iovec array pointer and count, the base and length of every iovec
 * entry, and (when ISC_NET_BSD44MSGHDR is defined) the control buffer
 * pointer and length.
 *
 * The length members have platform-dependent integer types, so each
 * one is cast to unsigned long and printed with %lu.  Pointers are
 * cast to void * because that is what %p requires.
 */
static void
dump_msg(struct msghdr *msg) {
	unsigned int i;

	printf("MSGHDR %p\n", (void *)msg);
	printf("\tname %p, namelen %lu\n", (void *)msg->msg_name,
	       (unsigned long)msg->msg_namelen);
	printf("\tiov %p, iovlen %lu\n", (void *)msg->msg_iov,
	       (unsigned long)msg->msg_iovlen);
	for (i = 0; i < (unsigned int)msg->msg_iovlen; i++)
		printf("\t\t%u\tbase %p, len %lu\n", i,
		       (void *)msg->msg_iov[i].iov_base,
		       (unsigned long)msg->msg_iov[i].iov_len);
#ifdef ISC_NET_BSD44MSGHDR
	printf("\tcontrol %p, controllen %lu\n", (void *)msg->msg_control,
	       (unsigned long)msg->msg_controllen);
#endif
}
#endif

Michael Graff's avatar
Michael Graff committed
966
967
968
969
970
/*
 * Status codes returned by doio_recv() (and presumably by the other
 * doio_* I/O routines in this file -- confirm against their
 * definitions).  Each code says how the I/O attempt ended and whether
 * a completion event was sent as part of it.
 */
#define DOIO_SUCCESS		0	/* i/o ok, event sent */
#define DOIO_SOFT		1	/* i/o ok, soft error, no event sent */
#define DOIO_HARD		2	/* i/o error, event sent */
#define DOIO_EOF		3	/* EOF, no event sent */

971
static int
972
doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
973
	int cc;
974
	struct iovec iov[MAXSCATTERGATHER_RECV];
975
	size_t read_count;
976
	size_t actual_count;
977
	struct msghdr msghdr;
978
	isc_buffer_t *buffer;
979
	int recv_errno;
980
	char strbuf[ISC_STRERRORSIZE];
981

982
	build_msghdr_recv(sock, dev, &msghdr, iov, &read_count);
983
984
985
986

#if defined(ISC_SOCKET_DEBUG)
	dump_msg(&msghdr);
#endif
987
988

	cc = recvmsg(sock->fd, &msghdr, 0);
989
	recv_errno = errno;
990

991
992
993
994
#if defined(ISC_SOCKET_DEBUG)
	dump_msg(&msghdr);
#endif

995
	if (cc < 0) {
996
		if (SOFT_ERROR(recv_errno))
997
998
			return (DOIO_SOFT);

999
		if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
1000
			isc__strerror(recv_errno, strbuf, sizeof(strbuf));
1001
			socket_log(sock, NULL, IOEVENT,
1002
1003
1004
				   isc_msgcat, ISC_MSGSET_SOCKET,
				   ISC_MSG_DOIORECV, 
				  "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
1005
				   sock->fd, cc, recv_errno, strbuf);
1006
		}
1007

1008
#define SOFT_OR_HARD(_system, _isc) \
1009
	if (recv_errno == _system) { \
Michael Graff's avatar