socket.c 74.7 KB
Newer Older
Bob Halley's avatar
Bob Halley committed
1
/*
David Lawrence's avatar
David Lawrence committed
2
 * Copyright (C) 1998-2000  Internet Software Consortium.
Bob Halley's avatar
Bob Halley committed
3
4
5
6
7
8
9
10
11
12
13
14
15
16
 * 
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS" AND INTERNET SOFTWARE CONSORTIUM DISCLAIMS
 * ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL INTERNET SOFTWARE
 * CONSORTIUM BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
 * DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
 * PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 */
Bob Halley's avatar
Bob Halley committed
17

18
/* $Id: socket.c,v 1.149 2000/07/13 01:13:53 bwelling Exp $ */
David Lawrence's avatar
David Lawrence committed
19

Bob Halley's avatar
Bob Halley committed
20
#include <config.h>
21

22
#include <sys/param.h>
Michael Graff's avatar
Michael Graff committed
23
#include <sys/types.h>
Michael Graff's avatar
Michael Graff committed
24
25
#include <sys/socket.h>
#include <sys/time.h>
Michael Graff's avatar
Michael Graff committed
26
27
#include <sys/uio.h>

28
29
30
31
32
#include <errno.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
Michael Graff's avatar
Michael Graff committed
33
#include <fcntl.h>
34

35
#include <isc/buffer.h>
36
#include <isc/bufferlist.h>
37
38
#include <isc/condition.h>
#include <isc/list.h>
Michael Graff's avatar
Michael Graff committed
39
#include <isc/log.h>
40
#include <isc/mem.h>
41
#include <isc/net.h>
Michael Graff's avatar
Michael Graff committed
42
#include <isc/print.h>
43
#include <isc/region.h>
44
#include <isc/socket.h>
45
#include <isc/task.h>
46
#include <isc/thread.h>
Michael Graff's avatar
Michael Graff committed
47
#include <isc/util.h>
Bob Halley's avatar
Bob Halley committed
48

49
50
/*
 * The type of the socket-length argument differs between systems (int,
 * size_t, socklen_t).  It is named here so that it can be overridden
 * from outside when necessary.
 */
#if !defined(ISC_SOCKADDR_LEN_T)
#define ISC_SOCKADDR_LEN_T unsigned int
#endif

/*
 * "Soft" errors: non-fatal returns of the various network functions
 * (recv() and friends).
 *
 * BSDI (and perhaps others) will sometimes return <0 from recv() while
 * leaving errno == 0.  That is broken, but it is treated as a soft
 * error here to work around it.
 */
#define SOFT_ERROR(e) \
	((e) == EAGAIN || (e) == EWOULDBLOCK || (e) == EINTR || (e) == 0)
69

Michael Graff's avatar
Michael Graff committed
70
#define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)
71

Michael Graff's avatar
Michael Graff committed
72
73
74
75
76
77
78
/*
 * DLVL(90)  --  Function entry/exit and other tracing.
 * DLVL(70)  --  Socket "correctness" -- including returning of events, etc.
 * DLVL(60)  --  Socket data send/receive
 * DLVL(50)  --  Event tracing, including receiving/sending completion events.
 * DLVL(20)  --  Socket creation/destruction.
 */
79
80
81
82
83
84
85
86
87
88
89
#define TRACE_LEVEL		90
#define CORRECTNESS_LEVEL	70
#define IOEVENT_LEVEL		60
#define EVENT_LEVEL		50
#define CREATION_LEVEL		20

#define TRACE		DLVL(TRACE_LEVEL)
#define CORRECTNESS	DLVL(CORRECTNESS_LEVEL)
#define IOEVENT		DLVL(IOEVENT_LEVEL)
#define EVENT		DLVL(EVENT_LEVEL)
#define CREATION	DLVL(CREATION_LEVEL)
90

91
typedef isc_event_t intev_t;
Michael Graff's avatar
Michael Graff committed
92
93
94

#define SOCKET_MAGIC		0x494f696fU	/* IOio */
#define VALID_SOCKET(t)		((t) != NULL && (t)->magic == SOCKET_MAGIC)
Michael Graff's avatar
Michael Graff committed
95

Michael Graff's avatar
Michael Graff committed
96
97
98
99
100
101
102
103
104
105
106
107
/*
 * IPv6 control information.  If the socket is an IPv6 socket we want
 * to collect the destination address and interface so the client can
 * set them on outgoing packets.
 */
#if defined(ISC_PLATFORM_HAVEIPV6) && !defined(USE_CMSG)
#define USE_CMSG	1
#endif

/*
 * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
 * a setsockopt() like interface to request timestamps, and if the OS
 * doesn't do it for us, call gettimeofday() on every UDP receive?
 */
#if defined(SO_TIMESTAMP) && !defined(USE_CMSG)
#define USE_CMSG	1
#endif

/*
 * Check to see if we have even basic support for cracking messages from
 * the control data returned from/sent via recvmsg()/sendmsg().  Without
 * both CMSG_LEN and CMSG_SPACE, control messages are not used at all.
 */
#if defined(USE_CMSG) && !(defined(CMSG_LEN) && defined(CMSG_SPACE))
#undef USE_CMSG
#endif

126
127
struct isc_socket {
	/* Not locked. */
128
129
130
131
	unsigned int		magic;
	isc_socketmgr_t	       *manager;
	isc_mutex_t		lock;
	isc_sockettype_t	type;
Michael Graff's avatar
Michael Graff committed
132

133
	/* Locked by socket lock. */
134
	ISC_LINK(isc_socket_t)	link;
135
136
137
138
	unsigned int		references;
	int			fd;
	isc_result_t		recv_result;
	isc_result_t		send_result;
139

140
	ISC_LIST(isc_socketevent_t)		send_list;
141
	ISC_LIST(isc_socketevent_t)		recv_list;
142
	ISC_LIST(isc_socket_newconnev_t)	accept_list;
143
144
145
146
147
148
149
	isc_socket_connev_t		       *connect_ev;

	/*
	 * Internal events.  Posted when a descriptor is readable or
	 * writable.  These are statically allocated and never freed.
	 * They will be set to non-purgable before use.
	 */
150
151
	intev_t			readable_ev;
	intev_t			writable_ev;
152

153
	isc_sockaddr_t		address;  /* remote address */
154

155
156
157
158
159
160
161
	unsigned int		pending_recv : 1,
				pending_send : 1,
				pending_accept : 1,
				listener : 1, /* listener socket */
				connected : 1,
				connecting : 1, /* connect pending */
				bound : 1; /* bound to local addr */
162

163
#ifdef ISC_NET_RECVOVERFLOW
164
	unsigned char		overflow; /* used for MSG_TRUNC fake */
165
#endif
Michael Graff's avatar
Michael Graff committed
166
#ifdef USE_CMSG
167
168
	unsigned char	       *cmsg;
	unsigned int		cmsglen;
169
#endif
170
171
};

172
173
174
#define SOCKET_MANAGER_MAGIC	0x494f6d67U	/* IOmg */
#define VALID_MANAGER(m)	((m) != NULL && \
				 (m)->magic == SOCKET_MANAGER_MAGIC)
175
176
struct isc_socketmgr {
	/* Not locked. */
177
178
179
	unsigned int		magic;
	isc_mem_t	       *mctx;
	isc_mutex_t		lock;
180
	/* Locked by manager lock. */
181
	ISC_LIST(isc_socket_t)	socklist;
182
183
184
185
186
187
188
189
	isc_thread_t		watcher;
	isc_condition_t		shutdown_ok;
	fd_set			read_fds;
	fd_set			write_fds;
	isc_socket_t	       *fds[FD_SETSIZE];
	int			fdstate[FD_SETSIZE];
	int			maxfd;
	int			pipe_fds[2];
190
191
};

Michael Graff's avatar
Michael Graff committed
192
193
194
195
#define CLOSED		0	/* this one must be zero */
#define MANAGED		1
#define CLOSE_PENDING	2

196
197
198
199
200
201
202
203
204
205
/*
 * send() and recv() iovec counts
 */
#define MAXSCATTERGATHER_SEND	(ISC_SOCKET_MAXSCATTERGATHER)
#ifdef ISC_NET_RECVOVERFLOW
# define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER + 1)
#else
# define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER)
#endif

206
207
static void send_recvdone_event(isc_socket_t *, isc_socketevent_t **,
				isc_result_t);
Michael Graff's avatar
Michael Graff committed
208
209
static void send_senddone_event(isc_socket_t *, isc_socketevent_t **,
				isc_result_t);
Bob Halley's avatar
Bob Halley committed
210
211
212
213
static void free_socket(isc_socket_t **);
static isc_result_t allocate_socket(isc_socketmgr_t *, isc_sockettype_t,
				    isc_socket_t **);
static void destroy(isc_socket_t **);
214
215
216
217
static void internal_accept(isc_task_t *, isc_event_t *);
static void internal_connect(isc_task_t *, isc_event_t *);
static void internal_recv(isc_task_t *, isc_event_t *);
static void internal_send(isc_task_t *, isc_event_t *);
218
static void process_cmsg(isc_socket_t *, struct msghdr *, isc_socketevent_t *);
Michael Graff's avatar
Michael Graff committed
219
static void build_msghdr_send(isc_socket_t *, isc_socketevent_t *,
220
			      struct msghdr *, struct iovec *, size_t *);
Michael Graff's avatar
Michael Graff committed
221
static void build_msghdr_recv(isc_socket_t *, isc_socketevent_t *,
222
			      struct msghdr *, struct iovec *, size_t *);
Michael Graff's avatar
Michael Graff committed
223
224
225

#define SELECT_POKE_SHUTDOWN		(-1)
#define SELECT_POKE_NOTHING		(-2)
226

227
228
#define SOCK_DEAD(s)			((s)->references == 0)

Michael Graff's avatar
Michael Graff committed
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
/*
 * Log a printf-style message about socket manager 'sockmgr' to the
 * global log context, prefixed with the manager's address.
 *
 * As in socket_log(), nothing is formatted unless isc_log_wouldlog()
 * says that 'level' is currently being logged.
 */
static void
manager_log(isc_socketmgr_t *sockmgr,
	    isc_logcategory_t *category, isc_logmodule_t *module, int level,
	    const char *fmt, ...)
{
	char msgbuf[2048];
	va_list ap;

	/* Skip the formatting work when the message would be discarded. */
	if (! isc_log_wouldlog(isc_lctx, level))
		return;

	va_start(ap, fmt);
	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
	va_end(ap);

	isc_log_write(isc_lctx, category, module, level,
		      "sockmgr %p: %s", sockmgr, msgbuf);
}

/*
 * Log a printf-style message about socket 'sock' to the global log
 * context.  When 'address' is non-NULL its formatted text is included
 * after the socket pointer.  Nothing is done unless 'level' would
 * actually be logged.
 */
static void
socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
	   const char *fmt, ...)
{
	char text[2048];
	char peer[256];
	va_list args;

	if (! isc_log_wouldlog(isc_lctx, level))
		return;

	va_start(args, fmt);
	vsnprintf(text, sizeof(text), fmt, args);
	va_end(args);

	if (address != NULL) {
		isc_sockaddr_format(address, peer, sizeof(peer));
		isc_log_write(isc_lctx, category, module, level,
			      "socket %p %s: %s", sock, peer, text);
		return;
	}

	isc_log_write(isc_lctx, category, module, level,
		      "socket %p: %s", sock, text);
}

271
/*
Michael Graff's avatar
Michael Graff committed
272
273
274
 * Poke the select loop when there is something for us to do.
 * We assume that if a write completes here, it will be inserted into the
 * queue fully.  That is, we will not get partial writes.
275
276
 */
static void
277
select_poke(isc_socketmgr_t *mgr, int msg) {
Michael Graff's avatar
Michael Graff committed
278
279
	int cc;

280
281
282
283
284
	do {
		cc = write(mgr->pipe_fds[1], &msg, sizeof(int));
	} while (cc < 0 && SOFT_ERROR(errno));

	if (cc < 0)
Michael Graff's avatar
Michael Graff committed
285
286
287
		FATAL_ERROR(__FILE__, __LINE__,
			    "write() failed during watcher poke: %s",
			    strerror(errno));
288
289

	INSIST(cc == sizeof(int));
290
291
292
293
294
}

/*
 * read a message on the internal fd.
 */
Michael Graff's avatar
Michael Graff committed
295
static int
296
select_readmsg(isc_socketmgr_t *mgr) {
Michael Graff's avatar
Michael Graff committed
297
	int msg;
Michael Graff's avatar
Michael Graff committed
298
299
	int cc;

Michael Graff's avatar
Michael Graff committed
300
	cc = read(mgr->pipe_fds[0], &msg, sizeof(int));
Michael Graff's avatar
Michael Graff committed
301
	if (cc < 0) {
Michael Graff's avatar
Michael Graff committed
302
		if (SOFT_ERROR(errno))
Michael Graff's avatar
Michael Graff committed
303
			return (SELECT_POKE_NOTHING);
Michael Graff's avatar
Michael Graff committed
304

Michael Graff's avatar
Michael Graff committed
305
306
307
308
		FATAL_ERROR(__FILE__, __LINE__,
			    "read() failed during watcher poke: %s",
			    strerror(errno));

Michael Graff's avatar
Michael Graff committed
309
		return (SELECT_POKE_NOTHING);
Michael Graff's avatar
Michael Graff committed
310
	}
311

Michael Graff's avatar
Michael Graff committed
312
	return (msg);
313
314
315
}

/*
Michael Graff's avatar
Michael Graff committed
316
 * Make a fd non-blocking
317
 */
Michael Graff's avatar
Michael Graff committed
318
static isc_result_t
319
make_nonblock(int fd) {
Michael Graff's avatar
Michael Graff committed
320
321
	int ret;
	int flags;
322

Michael Graff's avatar
Michael Graff committed
323
324
325
	flags = fcntl(fd, F_GETFL, 0);
	flags |= O_NONBLOCK;
	ret = fcntl(fd, F_SETFL, flags);
326

Michael Graff's avatar
Michael Graff committed
327
328
329
330
	if (ret == -1) {
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "fcntl(%d, F_SETFL, %d): %s",
				 fd, flags, strerror(errno));
Michael Graff's avatar
Michael Graff committed
331

Michael Graff's avatar
Michael Graff committed
332
		return (ISC_R_UNEXPECTED);
Michael Graff's avatar
Michael Graff committed
333
334
	}

Michael Graff's avatar
Michael Graff committed
335
	return (ISC_R_SUCCESS);
336
337
}

338
339
340
341
/*
 * Process control messages received on a socket.
 */
static void
342
process_cmsg(isc_socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
Michael Graff's avatar
Michael Graff committed
343
#ifdef USE_CMSG
344
	struct cmsghdr *cmsgp;
Michael Graff's avatar
Michael Graff committed
345
346
347
348
349
350
351
352
#ifdef ISC_PLATFORM_HAVEIPV6
	struct in6_pktinfo *pktinfop;
#endif
#ifdef SO_TIMESTAMP
	struct timeval *timevalp;
#endif
#endif

353
354
355
356
357
358
	/*
	 * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined.
	 * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined.
	 * They are all here, outside of the CPP tests, because it is
	 * more consistent with the usual ISC coding style.
	 */
359
	UNUSED(sock);
360
361
362
363
364
365
366
	UNUSED(msg);
	UNUSED(dev);

#ifndef ISC_NET_BSD44MSGHDR
	return;

#else  /* defined ISC_NET_BSD44MSGHDR */
367

Bob Halley's avatar
Bob Halley committed
368
#ifdef MSG_TRUNC
369
370
	if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC)
		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
Bob Halley's avatar
Bob Halley committed
371
#endif
372

Bob Halley's avatar
Bob Halley committed
373
#ifdef MSG_CTRUNC
374
375
	if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC)
		dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
Bob Halley's avatar
Bob Halley committed
376
#endif
377

378
379
380
381
382
383
	/*
	 * Check for multicast.
	 */
	if (isc_sockaddr_ismulticast(&dev->address))
		dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;

Michael Graff's avatar
Michael Graff committed
384
385
386
#ifndef USE_CMSG
	return;
#else
387
388
	if (msg->msg_controllen == 0 || msg->msg_control == NULL)
		return;
Michael Graff's avatar
Michael Graff committed
389
390
391
392
393
394
395
396
397
398

#ifdef SO_TIMESTAMP
	timevalp = NULL;
#endif
#ifdef ISC_PLATFORM_HAVEIPV6
	pktinfop = NULL;
#endif

	cmsgp = CMSG_FIRSTHDR(msg);
	while (cmsgp != NULL) {
399
		socket_log(sock, NULL, TRACE, "processing cmsg %p", cmsgp);
Michael Graff's avatar
Michael Graff committed
400
401

#ifdef ISC_PLATFORM_HAVEIPV6
402
403
		if (cmsgp->cmsg_level == IPPROTO_IPV6
		    && cmsgp->cmsg_type == IPV6_PKTINFO) {
404

Michael Graff's avatar
Michael Graff committed
405
			pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
406
407
			memcpy(&dev->pktinfo, pktinfop,
			       sizeof(struct in6_pktinfo));
Michael Graff's avatar
Michael Graff committed
408
			dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
409
			socket_log(sock, NULL, TRACE,
David Lawrence's avatar
David Lawrence committed
410
411
				   "interface received on ifindex %u",
				   dev->pktinfo.ipi6_ifindex);
Michael Graff's avatar
Michael Graff committed
412
413
414
415
416
			goto next;
		}
#endif

#ifdef SO_TIMESTAMP
417
418
		if (cmsgp->cmsg_level == SOL_SOCKET
		    && cmsgp->cmsg_type == SCM_TIMESTAMP) {
Michael Graff's avatar
Michael Graff committed
419
420
421
422
423
424
425
426
427
428
429
430
431
			timevalp = (struct timeval *)CMSG_DATA(cmsgp);
			dev->timestamp.seconds = timevalp->tv_sec;
			dev->timestamp.nanoseconds = timevalp->tv_usec * 1000;
			dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
			goto next;
		}
#endif

	next:
		cmsgp = CMSG_NXTHDR(msg, cmsgp);
	}
#endif /* USE_CMSG */

432
#endif /* ISC_NET_BSD44MSGHDR */
433
434
435

}

436
437
438
/*
 * Construct an iov array and attach it to the msghdr passed in.  Return
 * 0 on success, non-zero on failure.  This is the SEND constructor, which
439
440
 * will used the used region of the buffer (if using a buffer list) or
 * will use the internal region (if a single buffer I/O is requested).
441
442
443
 *
 * Nothing can be NULL, and the done event must list at least one buffer
 * on the buffer linked list for this function to be meaningful.
Michael Graff's avatar
fix    
Michael Graff committed
444
445
446
 *
 * If write_countp != NULL, *write_countp will hold the number of bytes
 * this transaction can send.
447
 */
Michael Graff's avatar
Michael Graff committed
448
static void
449
build_msghdr_send(isc_socket_t *sock, isc_socketevent_t *dev,
450
		  struct msghdr *msg, struct iovec *iov, size_t *write_countp)
451
452
453
454
{
	unsigned int iovcount;
	isc_buffer_t *buffer;
	isc_region_t used;
455
456
457
	size_t write_count;
	size_t skip_count;

Michael Graff's avatar
fix    
Michael Graff committed
458
	memset(msg, 0, sizeof (*msg));
459
460
461
462

	if (sock->type == isc_sockettype_udp) {
		msg->msg_name = (void *)&dev->address.type.sa;
		msg->msg_namelen = dev->address.length;
Michael Graff's avatar
fix    
Michael Graff committed
463
464
465
	} else {
		msg->msg_name = NULL;
		msg->msg_namelen = 0;
466
	}
467
468

	buffer = ISC_LIST_HEAD(dev->bufferlist);
469
	write_count = 0;
Michael Graff's avatar
fix    
Michael Graff committed
470
	iovcount = 0;
471

472
	/*
473
	 * Single buffer I/O?  Skip what we've done so far in this region.
474
475
	 */
	if (buffer == NULL) {
476
477
478
		write_count = dev->region.length - dev->n;
		iov[0].iov_base = (void *)(dev->region.base + dev->n);
		iov[0].iov_len = write_count;
Michael Graff's avatar
fix    
Michael Graff committed
479
		iovcount = 1;
480

481
482
483
484
485
486
487
		goto config;
	}

	/*
	 * Multibuffer I/O.
	 * Skip the data in the buffer list that we have already written.
	 */
Michael Graff's avatar
fix    
Michael Graff committed
488
	skip_count = dev->n;
489
	while (buffer != NULL) {
490
		REQUIRE(ISC_BUFFER_VALID(buffer));
491
		if (skip_count < isc_buffer_usedlength(buffer))
492
			break;
493
		skip_count -= isc_buffer_usedlength(buffer);
494
		buffer = ISC_LIST_NEXT(buffer, link);
495
496
497
	}

	while (buffer != NULL) {
498
		INSIST(iovcount < MAXSCATTERGATHER_SEND);
499

500
		isc_buffer_usedregion(buffer, &used);
501

502
		if (used.length > 0) {
503
504
505
506
507
			iov[iovcount].iov_base = (void *)(used.base
							  + skip_count);
			iov[iovcount].iov_len = used.length - skip_count;
			write_count += (used.length - skip_count);
			skip_count = 0;
508
509
510
511
512
			iovcount++;
		}
		buffer = ISC_LIST_NEXT(buffer, link);
	}

Michael Graff's avatar
fix    
Michael Graff committed
513
514
515
	INSIST(skip_count == 0);

 config:
516
517
518
	msg->msg_iov = iov;
	msg->msg_iovlen = iovcount;

519
520
521
522
#ifdef ISC_NET_BSD44MSGHDR
	msg->msg_control = NULL;
	msg->msg_controllen = 0;
	msg->msg_flags = 0;
523
524
525
526
#if defined(USE_CMSG)
	if ((sock->type == isc_sockettype_udp)
	    && ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0)) {
		struct cmsghdr *cmsgp;
527
528
		struct in6_pktinfo *pktinfop;

529
		socket_log(sock, NULL, TRACE,
David Lawrence's avatar
David Lawrence committed
530
531
			   "sendto pktinfo data, ifindex %u",
			   dev->pktinfo.ipi6_ifindex);
532

533
534
535
536
537
538
539
540
		msg->msg_controllen = CMSG_SPACE(sizeof(struct in6_pktinfo));
		msg->msg_control = (void *)sock->cmsg;

		cmsgp = (struct cmsghdr *)sock->cmsg;
		cmsgp->cmsg_level = IPPROTO_IPV6;
		cmsgp->cmsg_type = IPV6_PKTINFO;
		cmsgp->cmsg_len = CMSG_LEN(sizeof(struct in6_pktinfo));
		pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
541
		memcpy(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
542
543
544
	}
#endif /* USE_CMSG */
#else /* ISC_NET_BSD44MSGHDR */
545
546
	msg->msg_accrights = NULL;
	msg->msg_accrightslen = 0;
547
#endif /* ISC_NET_BSD44MSGHDR */
548
549
550

	if (write_countp != NULL)
		*write_countp = write_count;
551
552
}

Michael Graff's avatar
fix    
Michael Graff committed
553
554
555
556
557
558
559
560
561
562
563
564
/*
 * Construct an iov array and attach it to the msghdr passed in.  Return
 * 0 on success, non-zero on failure.  This is the RECV constructor, which
 * will use the avialable region of the buffer (if using a buffer list) or
 * will use the internal region (if a single buffer I/O is requested).
 *
 * Nothing can be NULL, and the done event must list at least one buffer
 * on the buffer linked list for this function to be meaningful.
 *
 * If read_countp != NULL, *read_countp will hold the number of bytes
 * this transaction can receive.
 */
Michael Graff's avatar
Michael Graff committed
565
static void
Michael Graff's avatar
fix    
Michael Graff committed
566
build_msghdr_recv(isc_socket_t *sock, isc_socketevent_t *dev,
567
		  struct msghdr *msg, struct iovec *iov, size_t *read_countp)
Michael Graff's avatar
fix    
Michael Graff committed
568
569
570
571
572
573
574
575
576
577
578
{
	unsigned int iovcount;
	isc_buffer_t *buffer;
	isc_region_t available;
	size_t read_count;

	memset(msg, 0, sizeof (struct msghdr));

	if (sock->type == isc_sockettype_udp) {
		memset(&dev->address, 0, sizeof(dev->address));
		msg->msg_name = (void *)&dev->address.type.sa;
Bob Halley's avatar
Bob Halley committed
579
		msg->msg_namelen = sizeof(dev->address.type);
580
581
582
583
584
#ifdef ISC_NET_RECVOVERFLOW
		/* If needed, steal one iovec for overflow detection. */
		maxiov--;
#endif
	} else { /* TCP */
Michael Graff's avatar
fix    
Michael Graff committed
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
		msg->msg_name = NULL;
		msg->msg_namelen = 0;
		dev->address = sock->address;
	}

	buffer = ISC_LIST_HEAD(dev->bufferlist);
	read_count = 0;

	/*
	 * Single buffer I/O?  Skip what we've done so far in this region.
	 */
	if (buffer == NULL) {
		read_count = dev->region.length - dev->n;
		iov[0].iov_base = (void *)(dev->region.base + dev->n);
		iov[0].iov_len = read_count;
600
		iovcount = 1;
Michael Graff's avatar
fix    
Michael Graff committed
601
602
603
604
605
606
607
608
609

		goto config;
	}

	/*
	 * Multibuffer I/O.
	 * Skip empty buffers.
	 */
	while (buffer != NULL) {
610
		REQUIRE(ISC_BUFFER_VALID(buffer));
611
		if (isc_buffer_availablelength(buffer) != 0)
Michael Graff's avatar
fix    
Michael Graff committed
612
613
614
615
616
617
			break;
		buffer = ISC_LIST_NEXT(buffer, link);
	}

	iovcount = 0;
	while (buffer != NULL) {
618
		INSIST(iovcount < MAXSCATTERGATHER_RECV);
Michael Graff's avatar
fix    
Michael Graff committed
619

620
		isc_buffer_availableregion(buffer, &available);
Michael Graff's avatar
fix    
Michael Graff committed
621
622
623
624
625
626
627
628
629
630

		if (available.length > 0) {
			iov[iovcount].iov_base = (void *)(available.base);
			iov[iovcount].iov_len = available.length;
			read_count += available.length;
			iovcount++;
		}
		buffer = ISC_LIST_NEXT(buffer, link);
	}

631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
 config:

	/*
	 * If needed, set up to receive that one extra byte.  Note that
	 * we know there is at least one iov left, since we stole it
	 * at the top of this function.
	 */
#ifdef ISC_NET_RECVOVERFLOW
	if (sock->type == isc_sockettype_udp) {
		iov[iovcount].iov_base = (void *)(&sock->overflow);
		iov[iovcount].iov_len = 1;
		iovcount++;
	}
#endif

Michael Graff's avatar
fix    
Michael Graff committed
646
647
648
649
650
651
	msg->msg_iov = iov;
	msg->msg_iovlen = iovcount;

#ifdef ISC_NET_BSD44MSGHDR
	msg->msg_control = NULL;
	msg->msg_controllen = 0;
652
653
	msg->msg_flags = 0;
#if defined(USE_CMSG)
Michael Graff's avatar
Michael Graff committed
654
	if (sock->type == isc_sockettype_udp) {
655
656
		msg->msg_control = (void *)sock->cmsg;
		msg->msg_controllen = sock->cmsglen;
Michael Graff's avatar
Michael Graff committed
657
	}
658
659
#endif /* USE_CMSG */
#else /* ISC_NET_BSD44MSGHDR */
Michael Graff's avatar
fix    
Michael Graff committed
660
661
	msg->msg_accrights = NULL;
	msg->msg_accrightslen = 0;
662
#endif /* ISC_NET_BSD44MSGHDR */
Michael Graff's avatar
fix    
Michael Graff committed
663
664
665
666
667

	if (read_countp != NULL)
		*read_countp = read_count;
}

668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
/*
 * Fill in dev->address for an event on 'sock'.
 *
 * TCP: 'address' must be NULL; the socket's own address is used.
 * UDP: 'address' is used if given, otherwise the socket's address.
 * Any other socket type leaves dev->address untouched.
 */
static void
set_dev_address(isc_sockaddr_t *address, isc_socket_t *sock,
		isc_socketevent_t *dev)
{
	if (sock->type == isc_sockettype_tcp) {
		INSIST(address == NULL);
		dev->address = sock->address;
		return;
	}

	if (sock->type != isc_sockettype_udp)
		return;

	dev->address = (address != NULL) ? *address : sock->address;
}

683
684
static isc_socketevent_t *
allocate_socketevent(isc_socket_t *sock, isc_eventtype_t eventtype,
David Lawrence's avatar
David Lawrence committed
685
		     isc_taskaction_t action, const void *arg)
686
687
688
689
690
691
692
693
694
695
696
697
{
	isc_socketevent_t *ev;

	ev = (isc_socketevent_t *)isc_event_allocate(sock->manager->mctx,
						     sock, eventtype,
						     action, arg,
						     sizeof (*ev));

	if (ev == NULL)
		return (NULL);

	ev->result = ISC_R_UNEXPECTED;
698
	ISC_LINK_INIT(ev, ev_link);
699
700
	ISC_LIST_INIT(ev->bufferlist);
	ev->region.base = NULL;
Michael Graff's avatar
fix    
Michael Graff committed
701
702
	ev->n = 0;
	ev->offset = 0;
703
	ev->attributes = 0;
704
705
706
707

	return (ev);
}

708
709
#if defined(ISC_SOCKET_DEBUG)
/*
 * Debugging aid: print the contents of 'msg' (name, iovec array and,
 * where available, control data) to stdout.
 *
 * The types of the length fields vary between platforms, so each is
 * cast to unsigned long and printed with a matching conversion.
 */
static void
dump_msg(struct msghdr *msg) {
	unsigned int i;

	printf("MSGHDR %p\n", (void *)msg);
	printf("\tname %p, namelen %lu\n", (void *)msg->msg_name,
	       (unsigned long)msg->msg_namelen);
	printf("\tiov %p, iovlen %lu\n", (void *)msg->msg_iov,
	       (unsigned long)msg->msg_iovlen);
	for (i = 0 ; i < (unsigned int)msg->msg_iovlen ; i++)
		printf("\t\t%u\tbase %p, len %lu\n", i,
		       (void *)msg->msg_iov[i].iov_base,
		       (unsigned long)msg->msg_iov[i].iov_len);
#ifdef ISC_NET_BSD44MSGHDR
	printf("\tcontrol %p, controllen %lu\n", (void *)msg->msg_control,
	       (unsigned long)msg->msg_controllen);
#endif
}
#endif

Michael Graff's avatar
Michael Graff committed
727
728
729
730
731
#define DOIO_SUCCESS		0	/* i/o ok, event sent */
#define DOIO_SOFT		1	/* i/o ok, soft error, no event sent */
#define DOIO_HARD		2	/* i/o error, event sent */
#define DOIO_EOF		3	/* EOF, no event sent */

732
static int
733
doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
734
	int cc;
735
	struct iovec iov[MAXSCATTERGATHER_RECV];
736
	size_t read_count;
737
	size_t actual_count;
738
	struct msghdr msghdr;
739
	isc_buffer_t *buffer;
740

741
	build_msghdr_recv(sock, dev, &msghdr, iov, &read_count);
742
743
744
745

#if defined(ISC_SOCKET_DEBUG)
	dump_msg(&msghdr);
#endif
746
747
748
749
750
751
752

	cc = recvmsg(sock->fd, &msghdr, 0);

	if (cc < 0) {
		if (SOFT_ERROR(errno))
			return (DOIO_SOFT);

753
754
755
756
		if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL))
			socket_log(sock, NULL, IOEVENT,
				   "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
				   sock->fd, cc, errno, strerror(errno));
757

758
759
760
761
762
763
#define SOFT_OR_HARD(_system, _isc) \
	if (errno == _system) { \
		if (sock->connected) { \
			if (sock->type == isc_sockettype_tcp) \
				sock->recv_result = _isc; \
			send_recvdone_event(sock, &dev, _isc); \
Michael Graff's avatar
Michael Graff committed
764
			return (DOIO_HARD); \
765
		} \
Michael Graff's avatar
Michael Graff committed
766
		return (DOIO_SOFT); \
767
	}
Michael Graff's avatar
Michael Graff committed
768
769
770
771
772
773
#define ALWAYS_HARD(_system, _isc) \
	if (errno == _system) { \
		sock->recv_result = _isc; \
		send_recvdone_event(sock, &dev, _isc); \
		return (DOIO_HARD); \
	}
774
775

		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
776
777
		ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
		ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
Michael Graff's avatar
Michael Graff committed
778
		ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
779

Michael Graff's avatar
Michael Graff committed
780
781
#undef SOFT_OR_HARD
#undef ALWAYS_HARD
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797

		sock->recv_result = ISC_R_UNEXPECTED;
		send_recvdone_event(sock, &dev, ISC_R_UNEXPECTED);
		return (DOIO_SUCCESS);
	}

	/*
	 * On TCP, zero length reads indicate EOF, while on
	 * UDP, zero length reads are perfectly valid, although
	 * strange.
	 */
	if ((sock->type == isc_sockettype_tcp) && (cc == 0)) {
		sock->recv_result = ISC_R_EOF;
		return (DOIO_EOF);
	}

Michael Graff's avatar
Michael Graff committed
798
799
800
	if (sock->type == isc_sockettype_udp)
		dev->address.length = msghdr.msg_namelen;

Michael Graff's avatar
Michael Graff committed
801
802
	socket_log(sock, &dev->address, IOEVENT, "packet received correctly");

803
804
805
806
807
808
809
810
811
812
813
814
	/*
	 * Overflow bit detection.  If we received MORE bytes than we should,
	 * this indicates an overflow situation.  Set the flag in the
	 * dev entry and adjust how much we read by one.
	 */
#ifdef ISC_NET_RECVOVERFLOW
	if ((sock->type == isc_sockettype_udp) && ((size_t)cc > read_count)) {
		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
		cc--;
	}
#endif

815
816
817
818
	/*
	 * If there are control messages attached, run through them and pull
	 * out the interesting bits.
	 */
Michael Graff's avatar
Michael Graff committed
819
820
	if (sock->type == isc_sockettype_udp)
		process_cmsg(sock, &msghdr, dev);
821

822
823
824
825
826
827
828
	/*
	 * update the buffers (if any) and the i/o count
	 */
	dev->n += cc;
	actual_count = cc;
	buffer = ISC_LIST_HEAD(dev->bufferlist);
	while (buffer != NULL && actual_count > 0) {
829
		REQUIRE(ISC_BUFFER_VALID(buffer));
830
831
		if (isc_buffer_availablelength(buffer) <= actual_count) {
			actual_count -= isc_buffer_availablelength(buffer);
832
			isc_buffer_add(buffer,
833
				       isc_buffer_availablelength(buffer));
834
835
836
837
838
839
840
841
842
843
844
		} else {
			isc_buffer_add(buffer, actual_count);
			actual_count = 0;
			break;
		}
		buffer = ISC_LIST_NEXT(buffer, link);
		if (buffer == NULL) {
			INSIST(actual_count == 0);
		}
	}

845
846
847
848
849
850
851
852
853
854
855
	/*
	 * If we read less than we expected, update counters,
	 * and let the upper layer poke the descriptor.
	 */
	if (((size_t)cc != read_count) && (dev->n < dev->minimum))
		return (DOIO_SOFT);

	/*
	 * full reads are posted, or partials if partials are ok.
	 */
	send_recvdone_event(sock, &dev, ISC_R_SUCCESS);
Michael Graff's avatar
Michael Graff committed
856
857
858
	return (DOIO_SUCCESS);
}

Andreas Gustafsson's avatar
Andreas Gustafsson committed
859
860
861
862
863
864
865
866
867
868
869
870
871
/*
 * Returns:
 *	DOIO_SUCCESS	The operation succeeded.  The senddone event
 *			was sent.
 *
 *	DOIO_HARD	A hard or unexpected I/O error was encountered.
 *			The senddone event was sent.
 *
 *	DOIO_SOFT	A soft I/O error was encountered.  No senddone
 *			event was sent.  The operation should be retried.
 *
 *	No other return values are possible.
 */
Michael Graff's avatar
Michael Graff committed
872
static int
873
doio_send(isc_socket_t *sock, isc_socketevent_t *dev) {
Michael Graff's avatar
Michael Graff committed
874
	int cc;
875
	struct iovec iov[MAXSCATTERGATHER_SEND];
Michael Graff's avatar
Michael Graff committed
876
877
878
	size_t write_count;
	struct msghdr msghdr;

879
	build_msghdr_send(sock, dev, &msghdr, iov, &write_count);
Michael Graff's avatar
Michael Graff committed
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899

	cc = sendmsg(sock->fd, &msghdr, 0);

	/*
	 * check for error or block condition
	 */
	if (cc < 0) {
		if (SOFT_ERROR(errno))
			return (DOIO_SOFT);

#define SOFT_OR_HARD(_system, _isc) \
	if (errno == _system) { \
		if (sock->connected) { \
			if (sock->type == isc_sockettype_tcp) \
				sock->send_result = _isc; \
			send_senddone_event(sock, &dev, _isc); \
			return (DOIO_HARD); \
		} \
		return (DOIO_SOFT); \
	}
Michael Graff's avatar
Michael Graff committed
900
901
#define ALWAYS_HARD(_system, _isc) \
	if (errno == _system) { \
902
903
		if (sock->connected && sock->type == isc_sockettype_tcp) \
			sock->send_result = _isc; \
Michael Graff's avatar
Michael Graff committed
904
905
906
		send_senddone_event(sock, &dev, _isc); \
		return (DOIO_HARD); \
	}
Michael Graff's avatar
Michael Graff committed
907
908

		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
909
910
		ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
		ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
Michael Graff's avatar
Michael Graff committed
911
		ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
912
		ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
Michael Graff's avatar
Michael Graff committed
913

Michael Graff's avatar
Michael Graff committed
914
915
#undef SOFT_OR_HARD
#undef ALWAYS_HARD
Michael Graff's avatar
Michael Graff committed
916
917
918
919
920
921
922
923
924
925
926
927
928

		/*
		 * The other error types depend on whether or not the
		 * socket is UDP or TCP.  If it is UDP, some errors
		 * that we expect to be fatal under TCP are merely
		 * annoying, and are really soft errors.
		 *
		 * However, these soft errors are still returned as
		 * a status.
		 */
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "internal_send: %s",
				 strerror(errno));
929
930
		if (sock->connected && sock->type == isc_sockettype_tcp)
			sock->send_result = ISC_R_UNEXPECTED;
Michael Graff's avatar
Michael Graff committed
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
		send_senddone_event(sock, &dev, ISC_R_UNEXPECTED);
		return (DOIO_HARD);
	}

	if (cc == 0)
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "internal_send: send() returned 0");

	/*
	 * if we write less than we expected, update counters,
	 * poke.
	 */
	dev->n += cc;
	if ((size_t)cc != write_count)
		return (DOIO_SOFT);
946

Michael Graff's avatar
Michael Graff committed
947
948
949
950
951
	/*
	 * Exactly what we wanted to write.  We're done with this
	 * entry.  Post its completion event.
	 */
	send_senddone_event(sock, &dev, ISC_R_SUCCESS);
952
953
954
	return (DOIO_SUCCESS);
}

955
956
957
/*
 * Kill.
 *
958
959
 * Caller must ensure that the socket is not locked and no external
 * references exist.
960
961
 */
static void
962
destroy(isc_socket_t **sockp) {
Bob Halley's avatar
Bob Halley committed
963
964
	isc_socket_t *sock = *sockp;
	isc_socketmgr_t *manager = sock->manager;
965

Michael Graff's avatar
Michael Graff committed
966
	socket_log(sock, NULL, CREATION, "destroying");
Michael Graff's avatar
Michael Graff committed
967

968
969
970
971
972
	INSIST(ISC_LIST_EMPTY(sock->accept_list));
	INSIST(ISC_LIST_EMPTY(sock->recv_list));
	INSIST(ISC_LIST_EMPTY(sock->send_list));
	INSIST(sock->connect_ev == NULL);

973
974
975
	LOCK(&manager->lock);

	/*
Bob Halley's avatar
Bob Halley committed
976
	 * No one has this socket open, so the watcher doesn't have to be
Michael Graff's avatar
Michael Graff committed
977
	 * poked, and the socket doesn't have to be locked.
978
	 */
Michael Graff's avatar
Michael Graff committed
979
	manager->fds[sock->fd] = NULL;
Michael Graff's avatar
Michael Graff committed
980
	manager->fdstate[sock->fd] = CLOSE_PENDING;
981
982
	select_poke(manager, sock->fd);
	ISC_LIST_UNLINK(manager->socklist, sock, link);
Michael Graff's avatar
Michael Graff committed
983

984
	if (ISC_LIST_EMPTY(manager->socklist))
985
		SIGNAL(&manager->shutdown_ok);
986

987
988
989
990
	/*
	 * XXX should reset manager->maxfd here
	 */

991
992
	UNLOCK(&manager->lock);

Michael Graff's avatar
Michael Graff committed
993
	free_socket(sockp);
Michael Graff's avatar
Michael Graff committed
994
995
996
}

static isc_result_t
Bob Halley's avatar
Bob Halley committed
997
998
allocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type,
		isc_socket_t **socketp)
Michael Graff's avatar
Michael Graff committed
999
{
Bob Halley's avatar
Bob Halley committed
1000
	isc_socket_t *sock;