socket.c 74.6 KB
Newer Older
Bob Halley's avatar
Bob Halley committed
1
/*
David Lawrence's avatar
David Lawrence committed
2
 * Copyright (C) 1998-2000  Internet Software Consortium.
3
 *
Bob Halley's avatar
Bob Halley committed
4
5
6
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
7
 *
8
9
10
11
12
13
14
15
 * THE SOFTWARE IS PROVIDED "AS IS" AND INTERNET SOFTWARE CONSORTIUM
 * DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL
 * INTERNET SOFTWARE CONSORTIUM BE LIABLE FOR ANY SPECIAL, DIRECT,
 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING
 * FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
 * NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
 * WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
Bob Halley's avatar
Bob Halley committed
16
 */
Bob Halley's avatar
Bob Halley committed
17

18
/* $Id: socket.c,v 1.153 2000/08/01 01:31:27 tale Exp $ */
David Lawrence's avatar
David Lawrence committed
19

Bob Halley's avatar
Bob Halley committed
20
#include <config.h>
21

22
#include <sys/param.h>
Michael Graff's avatar
Michael Graff committed
23
#include <sys/types.h>
Michael Graff's avatar
Michael Graff committed
24
25
#include <sys/socket.h>
#include <sys/time.h>
Michael Graff's avatar
Michael Graff committed
26
27
#include <sys/uio.h>

28
29
30
31
32
#include <errno.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
Michael Graff's avatar
Michael Graff committed
33
#include <fcntl.h>
34

35
#include <isc/buffer.h>
36
#include <isc/bufferlist.h>
37
38
#include <isc/condition.h>
#include <isc/list.h>
Michael Graff's avatar
Michael Graff committed
39
#include <isc/log.h>
40
#include <isc/mem.h>
41
#include <isc/net.h>
Michael Graff's avatar
Michael Graff committed
42
#include <isc/print.h>
43
#include <isc/region.h>
44
#include <isc/socket.h>
45
#include <isc/task.h>
46
#include <isc/thread.h>
Michael Graff's avatar
Michael Graff committed
47
#include <isc/util.h>
Bob Halley's avatar
Bob Halley committed
48

49
50
/*
 * Some systems define the socket length argument as an int, some as size_t,
 * some as socklen_t.  This is here so it can be easily changed if needed.
 * The build may predefine ISC_SOCKADDR_LEN_T to override the default.
 */
#ifndef ISC_SOCKADDR_LEN_T
#define ISC_SOCKADDR_LEN_T unsigned int
#endif
56

57
58
59
/*
 * Define what the possible "soft" errors can be.  These are non-fatal returns
 * of various network related functions, like recv() and so on.
 *
 * For some reason, BSDI (and perhaps others) will sometimes return <0
 * from recv() but will have errno==0.  This is broken, but we have to
 * work around it here.
 *
 * The argument is evaluated more than once, so it must not have side
 * effects.
 */
#define SOFT_ERROR(e)	((e) == EAGAIN || \
			 (e) == EWOULDBLOCK || \
			 (e) == EINTR || \
			 (e) == 0)
69

Michael Graff's avatar
Michael Graff committed
70
/*
 * Expands to the (category, module, level) argument triple taken by the
 * logging helpers in this file, at debug level x.
 */
#define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)

/*
 * DLVL(90)  --  Function entry/exit and other tracing.
 * DLVL(70)  --  Socket "correctness" -- including returning of events, etc.
 * DLVL(60)  --  Socket data send/receive
 * DLVL(50)  --  Event tracing, including receiving/sending completion events.
 * DLVL(20)  --  Socket creation/destruction.
 */
#define TRACE_LEVEL		90
#define CORRECTNESS_LEVEL	70
#define IOEVENT_LEVEL		60
#define EVENT_LEVEL		50
#define CREATION_LEVEL		20

#define TRACE		DLVL(TRACE_LEVEL)
#define CORRECTNESS	DLVL(CORRECTNESS_LEVEL)
#define IOEVENT		DLVL(IOEVENT_LEVEL)
#define EVENT		DLVL(EVENT_LEVEL)
#define CREATION	DLVL(CREATION_LEVEL)
90

91
/*
 * Type of the socket's internal events (readable_ev / writable_ev in
 * struct isc_socket); simply an alias for isc_event_t.
 */
typedef isc_event_t intev_t;
Michael Graff's avatar
Michael Graff committed
92
93
94

/*
 * Magic number stored in isc_socket.magic, and the validity check that
 * compares against it.  VALID_SOCKET() evaluates its argument twice.
 */
#define SOCKET_MAGIC		0x494f696fU	/* IOio */
#define VALID_SOCKET(t)		((t) != NULL && (t)->magic == SOCKET_MAGIC)
Michael Graff's avatar
Michael Graff committed
95

Michael Graff's avatar
Michael Graff committed
96
97
98
99
100
101
102
103
104
105
106
107
/*
 * IPv6 control information.  If the socket is an IPv6 socket we want
 * to collect the destination address and interface so the client can
 * set them on outgoing packets.
 */
#ifdef ISC_PLATFORM_HAVEIPV6
#ifndef USE_CMSG
#define USE_CMSG	1
#endif
#endif

/*
 * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
 * a setsockopt() like interface to request timestamps, and if the OS
 * doesn't do it for us, call gettimeofday() on every UDP receive?
 */
#ifdef SO_TIMESTAMP
#ifndef USE_CMSG
#define USE_CMSG	1
#endif
#endif

/*
 * Check to see if we have even basic support for cracking messages from
 * the control data returned from/sent via recvmsg()/sendmsg().  Without
 * CMSG_LEN and CMSG_SPACE, control message support is turned back off.
 */
#if defined(USE_CMSG) && (!defined(CMSG_LEN) || !defined(CMSG_SPACE))
#undef USE_CMSG
#endif

126
127
struct isc_socket {
	/* Not locked. */
128
129
130
131
	unsigned int		magic;
	isc_socketmgr_t	       *manager;
	isc_mutex_t		lock;
	isc_sockettype_t	type;
Michael Graff's avatar
Michael Graff committed
132

133
	/* Locked by socket lock. */
134
	ISC_LINK(isc_socket_t)	link;
135
136
137
138
	unsigned int		references;
	int			fd;
	isc_result_t		recv_result;
	isc_result_t		send_result;
139

140
	ISC_LIST(isc_socketevent_t)		send_list;
141
	ISC_LIST(isc_socketevent_t)		recv_list;
142
	ISC_LIST(isc_socket_newconnev_t)	accept_list;
143
144
145
146
147
148
149
	isc_socket_connev_t		       *connect_ev;

	/*
	 * Internal events.  Posted when a descriptor is readable or
	 * writable.  These are statically allocated and never freed.
	 * They will be set to non-purgable before use.
	 */
150
151
	intev_t			readable_ev;
	intev_t			writable_ev;
152

153
	isc_sockaddr_t		address;  /* remote address */
154

155
156
157
158
159
160
161
	unsigned int		pending_recv : 1,
				pending_send : 1,
				pending_accept : 1,
				listener : 1, /* listener socket */
				connected : 1,
				connecting : 1, /* connect pending */
				bound : 1; /* bound to local addr */
162

163
#ifdef ISC_NET_RECVOVERFLOW
164
	unsigned char		overflow; /* used for MSG_TRUNC fake */
165
#endif
Michael Graff's avatar
Michael Graff committed
166
#ifdef USE_CMSG
167
168
	unsigned char	       *cmsg;
	unsigned int		cmsglen;
169
#endif
170
171
};

172
173
174
#define SOCKET_MANAGER_MAGIC	0x494f6d67U	/* IOmg */
#define VALID_MANAGER(m)	((m) != NULL && \
				 (m)->magic == SOCKET_MANAGER_MAGIC)
175
176
struct isc_socketmgr {
	/* Not locked. */
177
178
179
	unsigned int		magic;
	isc_mem_t	       *mctx;
	isc_mutex_t		lock;
180
	/* Locked by manager lock. */
181
	ISC_LIST(isc_socket_t)	socklist;
182
183
184
185
186
187
188
189
	isc_thread_t		watcher;
	isc_condition_t		shutdown_ok;
	fd_set			read_fds;
	fd_set			write_fds;
	isc_socket_t	       *fds[FD_SETSIZE];
	int			fdstate[FD_SETSIZE];
	int			maxfd;
	int			pipe_fds[2];
190
191
};

Michael Graff's avatar
Michael Graff committed
192
193
194
195
/*
 * Descriptor states.
 * NOTE(review): presumably the values stored in isc_socketmgr.fdstate[];
 * the code using them is not visible here -- confirm.
 */
#define CLOSED		0	/* this one must be zero */
#define MANAGED		1
#define CLOSE_PENDING	2

196
197
198
199
200
201
202
203
204
205
/*
 * send() and recv() iovec counts
 *
 * These size the iovec arrays declared in doio_send() and doio_recv().
 * When ISC_NET_RECVOVERFLOW is defined the receive side gets one extra
 * slot, which build_msghdr_recv() points at the socket's one-byte
 * overflow field for UDP sockets.
 */
#define MAXSCATTERGATHER_SEND	(ISC_SOCKET_MAXSCATTERGATHER)
#ifdef ISC_NET_RECVOVERFLOW
# define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER + 1)
#else
# define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER)
#endif

206
207
/*
 * Forward declarations of file-local functions.
 */
static void send_recvdone_event(isc_socket_t *, isc_socketevent_t **,
				isc_result_t);
static void send_senddone_event(isc_socket_t *, isc_socketevent_t **,
				isc_result_t);
static void free_socket(isc_socket_t **);
static isc_result_t allocate_socket(isc_socketmgr_t *, isc_sockettype_t,
				    isc_socket_t **);
static void destroy(isc_socket_t **);
static void internal_accept(isc_task_t *, isc_event_t *);
static void internal_connect(isc_task_t *, isc_event_t *);
static void internal_recv(isc_task_t *, isc_event_t *);
static void internal_send(isc_task_t *, isc_event_t *);
static void process_cmsg(isc_socket_t *, struct msghdr *, isc_socketevent_t *);
static void build_msghdr_send(isc_socket_t *, isc_socketevent_t *,
			      struct msghdr *, struct iovec *, size_t *);
static void build_msghdr_recv(isc_socket_t *, isc_socketevent_t *,
			      struct msghdr *, struct iovec *, size_t *);

/*
 * Special (negative) messages passed through the watcher pipe; see
 * select_poke() and select_readmsg().
 */
#define SELECT_POKE_SHUTDOWN		(-1)
#define SELECT_POKE_NOTHING		(-2)

/* True when the socket has no remaining references. */
#define SOCK_DEAD(s)			((s)->references == 0)

Michael Graff's avatar
Michael Graff committed
229
230
231
232
233
234
235
236
/*
 * Log a printf-style message about a socket manager.
 *
 * Nothing is formatted or written unless isc_log_wouldlog() says the
 * given level is enabled.  The message is formatted into a 2048-byte
 * stack buffer (vsnprintf() truncates anything longer) and written
 * prefixed with "sockmgr <pointer>: ".
 */
static void
manager_log(isc_socketmgr_t *sockmgr,
	    isc_logcategory_t *category, isc_logmodule_t *module, int level,
	    const char *fmt, ...)
{
	char msgbuf[2048];
	va_list ap;

	if (! isc_log_wouldlog(isc_lctx, level))
		return;

	va_start(ap, fmt);
	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
	va_end(ap);

	isc_log_write(isc_lctx, category, module, level,
		      "sockmgr %p: %s", sockmgr, msgbuf);
}

/*
 * Log a printf-style message about a socket.
 *
 * Nothing is formatted or written unless isc_log_wouldlog() says the
 * given level is enabled.  The message is formatted into a 2048-byte
 * stack buffer (vsnprintf() truncates anything longer).  If 'address'
 * is non-NULL it is rendered with isc_sockaddr_format() and included
 * in the log line after the socket pointer.
 */
static void
socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
	   const char *fmt, ...)
{
	char msgbuf[2048];
	char peerbuf[256];
	va_list ap;

	if (! isc_log_wouldlog(isc_lctx, level))
		return;

	va_start(ap, fmt);
	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
	va_end(ap);

	if (address == NULL) {
		isc_log_write(isc_lctx, category, module, level,
			      "socket %p: %s", sock, msgbuf);
	} else {
		isc_sockaddr_format(address, peerbuf, sizeof peerbuf);
		isc_log_write(isc_lctx, category, module, level,
			      "socket %p %s: %s", sock, peerbuf, msgbuf);
	}
}

274
/*
Michael Graff's avatar
Michael Graff committed
275
276
277
 * Poke the select loop when there is something for us to do.
 * We assume that if a write completes here, it will be inserted into the
 * queue fully.  That is, we will not get partial writes.
278
279
 */
static void
280
select_poke(isc_socketmgr_t *mgr, int msg) {
Michael Graff's avatar
Michael Graff committed
281
282
	int cc;

283
284
285
286
287
	do {
		cc = write(mgr->pipe_fds[1], &msg, sizeof(int));
	} while (cc < 0 && SOFT_ERROR(errno));

	if (cc < 0)
Michael Graff's avatar
Michael Graff committed
288
289
290
		FATAL_ERROR(__FILE__, __LINE__,
			    "write() failed during watcher poke: %s",
			    strerror(errno));
291
292

	INSIST(cc == sizeof(int));
293
294
295
296
297
}

/*
 * read a message on the internal fd.
 */
Michael Graff's avatar
Michael Graff committed
298
static int
299
select_readmsg(isc_socketmgr_t *mgr) {
Michael Graff's avatar
Michael Graff committed
300
	int msg;
Michael Graff's avatar
Michael Graff committed
301
302
	int cc;

Michael Graff's avatar
Michael Graff committed
303
	cc = read(mgr->pipe_fds[0], &msg, sizeof(int));
Michael Graff's avatar
Michael Graff committed
304
	if (cc < 0) {
Michael Graff's avatar
Michael Graff committed
305
		if (SOFT_ERROR(errno))
Michael Graff's avatar
Michael Graff committed
306
			return (SELECT_POKE_NOTHING);
Michael Graff's avatar
Michael Graff committed
307

Michael Graff's avatar
Michael Graff committed
308
309
310
311
		FATAL_ERROR(__FILE__, __LINE__,
			    "read() failed during watcher poke: %s",
			    strerror(errno));

Michael Graff's avatar
Michael Graff committed
312
		return (SELECT_POKE_NOTHING);
Michael Graff's avatar
Michael Graff committed
313
	}
314

Michael Graff's avatar
Michael Graff committed
315
	return (msg);
316
317
318
}

/*
Michael Graff's avatar
Michael Graff committed
319
 * Make a fd non-blocking
320
 */
Michael Graff's avatar
Michael Graff committed
321
static isc_result_t
322
make_nonblock(int fd) {
Michael Graff's avatar
Michael Graff committed
323
324
	int ret;
	int flags;
325

Michael Graff's avatar
Michael Graff committed
326
327
328
	flags = fcntl(fd, F_GETFL, 0);
	flags |= O_NONBLOCK;
	ret = fcntl(fd, F_SETFL, flags);
329

Michael Graff's avatar
Michael Graff committed
330
331
332
333
	if (ret == -1) {
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "fcntl(%d, F_SETFL, %d): %s",
				 fd, flags, strerror(errno));
Michael Graff's avatar
Michael Graff committed
334

Michael Graff's avatar
Michael Graff committed
335
		return (ISC_R_UNEXPECTED);
Michael Graff's avatar
Michael Graff committed
336
337
	}

Michael Graff's avatar
Michael Graff committed
338
	return (ISC_R_SUCCESS);
339
340
}

341
342
343
344
/*
 * Process control messages received on a socket.
 */
static void
345
process_cmsg(isc_socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
Michael Graff's avatar
Michael Graff committed
346
#ifdef USE_CMSG
347
	struct cmsghdr *cmsgp;
Michael Graff's avatar
Michael Graff committed
348
349
350
351
352
353
354
355
#ifdef ISC_PLATFORM_HAVEIPV6
	struct in6_pktinfo *pktinfop;
#endif
#ifdef SO_TIMESTAMP
	struct timeval *timevalp;
#endif
#endif

356
357
358
359
360
361
	/*
	 * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined.
	 * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined.
	 * They are all here, outside of the CPP tests, because it is
	 * more consistent with the usual ISC coding style.
	 */
362
	UNUSED(sock);
363
364
365
366
367
368
369
	UNUSED(msg);
	UNUSED(dev);

#ifndef ISC_NET_BSD44MSGHDR
	return;

#else  /* defined ISC_NET_BSD44MSGHDR */
370

Bob Halley's avatar
Bob Halley committed
371
#ifdef MSG_TRUNC
372
373
	if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC)
		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
Bob Halley's avatar
Bob Halley committed
374
#endif
375

Bob Halley's avatar
Bob Halley committed
376
#ifdef MSG_CTRUNC
377
378
	if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC)
		dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
Bob Halley's avatar
Bob Halley committed
379
#endif
380

381
382
383
384
385
386
	/*
	 * Check for multicast.
	 */
	if (isc_sockaddr_ismulticast(&dev->address))
		dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;

Michael Graff's avatar
Michael Graff committed
387
388
389
#ifndef USE_CMSG
	return;
#else
390
391
	if (msg->msg_controllen == 0 || msg->msg_control == NULL)
		return;
Michael Graff's avatar
Michael Graff committed
392
393
394
395
396
397
398
399
400
401

#ifdef SO_TIMESTAMP
	timevalp = NULL;
#endif
#ifdef ISC_PLATFORM_HAVEIPV6
	pktinfop = NULL;
#endif

	cmsgp = CMSG_FIRSTHDR(msg);
	while (cmsgp != NULL) {
402
		socket_log(sock, NULL, TRACE, "processing cmsg %p", cmsgp);
Michael Graff's avatar
Michael Graff committed
403
404

#ifdef ISC_PLATFORM_HAVEIPV6
405
406
		if (cmsgp->cmsg_level == IPPROTO_IPV6
		    && cmsgp->cmsg_type == IPV6_PKTINFO) {
407

Michael Graff's avatar
Michael Graff committed
408
			pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
409
410
			memcpy(&dev->pktinfo, pktinfop,
			       sizeof(struct in6_pktinfo));
Michael Graff's avatar
Michael Graff committed
411
			dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
412
			socket_log(sock, NULL, TRACE,
David Lawrence's avatar
David Lawrence committed
413
414
				   "interface received on ifindex %u",
				   dev->pktinfo.ipi6_ifindex);
Michael Graff's avatar
Michael Graff committed
415
416
417
418
419
			goto next;
		}
#endif

#ifdef SO_TIMESTAMP
420
421
		if (cmsgp->cmsg_level == SOL_SOCKET
		    && cmsgp->cmsg_type == SCM_TIMESTAMP) {
Michael Graff's avatar
Michael Graff committed
422
423
424
425
426
427
428
429
430
431
432
433
434
			timevalp = (struct timeval *)CMSG_DATA(cmsgp);
			dev->timestamp.seconds = timevalp->tv_sec;
			dev->timestamp.nanoseconds = timevalp->tv_usec * 1000;
			dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
			goto next;
		}
#endif

	next:
		cmsgp = CMSG_NXTHDR(msg, cmsgp);
	}
#endif /* USE_CMSG */

435
#endif /* ISC_NET_BSD44MSGHDR */
436
437
438

}

439
440
441
/*
 * Construct an iov array and attach it to the msghdr passed in.  Return
 * 0 on success, non-zero on failure.  This is the SEND constructor, which
442
443
 * will used the used region of the buffer (if using a buffer list) or
 * will use the internal region (if a single buffer I/O is requested).
444
445
446
 *
 * Nothing can be NULL, and the done event must list at least one buffer
 * on the buffer linked list for this function to be meaningful.
Michael Graff's avatar
fix    
Michael Graff committed
447
448
449
 *
 * If write_countp != NULL, *write_countp will hold the number of bytes
 * this transaction can send.
450
 */
Michael Graff's avatar
Michael Graff committed
451
static void
452
build_msghdr_send(isc_socket_t *sock, isc_socketevent_t *dev,
453
		  struct msghdr *msg, struct iovec *iov, size_t *write_countp)
454
455
456
457
{
	unsigned int iovcount;
	isc_buffer_t *buffer;
	isc_region_t used;
458
459
460
	size_t write_count;
	size_t skip_count;

Michael Graff's avatar
fix    
Michael Graff committed
461
	memset(msg, 0, sizeof (*msg));
462
463
464
465

	if (sock->type == isc_sockettype_udp) {
		msg->msg_name = (void *)&dev->address.type.sa;
		msg->msg_namelen = dev->address.length;
Michael Graff's avatar
fix    
Michael Graff committed
466
467
468
	} else {
		msg->msg_name = NULL;
		msg->msg_namelen = 0;
469
	}
470
471

	buffer = ISC_LIST_HEAD(dev->bufferlist);
472
	write_count = 0;
Michael Graff's avatar
fix    
Michael Graff committed
473
	iovcount = 0;
474

475
	/*
476
	 * Single buffer I/O?  Skip what we've done so far in this region.
477
478
	 */
	if (buffer == NULL) {
479
480
481
		write_count = dev->region.length - dev->n;
		iov[0].iov_base = (void *)(dev->region.base + dev->n);
		iov[0].iov_len = write_count;
Michael Graff's avatar
fix    
Michael Graff committed
482
		iovcount = 1;
483

484
485
486
487
488
489
490
		goto config;
	}

	/*
	 * Multibuffer I/O.
	 * Skip the data in the buffer list that we have already written.
	 */
Michael Graff's avatar
fix    
Michael Graff committed
491
	skip_count = dev->n;
492
	while (buffer != NULL) {
493
		REQUIRE(ISC_BUFFER_VALID(buffer));
494
		if (skip_count < isc_buffer_usedlength(buffer))
495
			break;
496
		skip_count -= isc_buffer_usedlength(buffer);
497
		buffer = ISC_LIST_NEXT(buffer, link);
498
499
500
	}

	while (buffer != NULL) {
501
		INSIST(iovcount < MAXSCATTERGATHER_SEND);
502

503
		isc_buffer_usedregion(buffer, &used);
504

505
		if (used.length > 0) {
506
507
508
509
510
			iov[iovcount].iov_base = (void *)(used.base
							  + skip_count);
			iov[iovcount].iov_len = used.length - skip_count;
			write_count += (used.length - skip_count);
			skip_count = 0;
511
512
513
514
515
			iovcount++;
		}
		buffer = ISC_LIST_NEXT(buffer, link);
	}

Michael Graff's avatar
fix    
Michael Graff committed
516
517
518
	INSIST(skip_count == 0);

 config:
519
520
521
	msg->msg_iov = iov;
	msg->msg_iovlen = iovcount;

522
523
524
525
#ifdef ISC_NET_BSD44MSGHDR
	msg->msg_control = NULL;
	msg->msg_controllen = 0;
	msg->msg_flags = 0;
526
527
528
529
#if defined(USE_CMSG)
	if ((sock->type == isc_sockettype_udp)
	    && ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0)) {
		struct cmsghdr *cmsgp;
530
531
		struct in6_pktinfo *pktinfop;

532
		socket_log(sock, NULL, TRACE,
David Lawrence's avatar
David Lawrence committed
533
534
			   "sendto pktinfo data, ifindex %u",
			   dev->pktinfo.ipi6_ifindex);
535

536
537
538
539
540
541
542
543
		msg->msg_controllen = CMSG_SPACE(sizeof(struct in6_pktinfo));
		msg->msg_control = (void *)sock->cmsg;

		cmsgp = (struct cmsghdr *)sock->cmsg;
		cmsgp->cmsg_level = IPPROTO_IPV6;
		cmsgp->cmsg_type = IPV6_PKTINFO;
		cmsgp->cmsg_len = CMSG_LEN(sizeof(struct in6_pktinfo));
		pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
544
		memcpy(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
545
546
547
	}
#endif /* USE_CMSG */
#else /* ISC_NET_BSD44MSGHDR */
548
549
	msg->msg_accrights = NULL;
	msg->msg_accrightslen = 0;
550
#endif /* ISC_NET_BSD44MSGHDR */
551
552
553

	if (write_countp != NULL)
		*write_countp = write_count;
554
555
}

Michael Graff's avatar
fix    
Michael Graff committed
556
557
558
559
560
561
562
563
564
565
566
567
/*
 * Construct an iov array and attach it to the msghdr passed in.  Return
 * 0 on success, non-zero on failure.  This is the RECV constructor, which
 * will use the avialable region of the buffer (if using a buffer list) or
 * will use the internal region (if a single buffer I/O is requested).
 *
 * Nothing can be NULL, and the done event must list at least one buffer
 * on the buffer linked list for this function to be meaningful.
 *
 * If read_countp != NULL, *read_countp will hold the number of bytes
 * this transaction can receive.
 */
Michael Graff's avatar
Michael Graff committed
568
static void
Michael Graff's avatar
fix    
Michael Graff committed
569
build_msghdr_recv(isc_socket_t *sock, isc_socketevent_t *dev,
570
		  struct msghdr *msg, struct iovec *iov, size_t *read_countp)
Michael Graff's avatar
fix    
Michael Graff committed
571
572
573
574
575
576
577
578
579
580
581
{
	unsigned int iovcount;
	isc_buffer_t *buffer;
	isc_region_t available;
	size_t read_count;

	memset(msg, 0, sizeof (struct msghdr));

	if (sock->type == isc_sockettype_udp) {
		memset(&dev->address, 0, sizeof(dev->address));
		msg->msg_name = (void *)&dev->address.type.sa;
Bob Halley's avatar
Bob Halley committed
582
		msg->msg_namelen = sizeof(dev->address.type);
583
584
585
586
587
#ifdef ISC_NET_RECVOVERFLOW
		/* If needed, steal one iovec for overflow detection. */
		maxiov--;
#endif
	} else { /* TCP */
Michael Graff's avatar
fix    
Michael Graff committed
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
		msg->msg_name = NULL;
		msg->msg_namelen = 0;
		dev->address = sock->address;
	}

	buffer = ISC_LIST_HEAD(dev->bufferlist);
	read_count = 0;

	/*
	 * Single buffer I/O?  Skip what we've done so far in this region.
	 */
	if (buffer == NULL) {
		read_count = dev->region.length - dev->n;
		iov[0].iov_base = (void *)(dev->region.base + dev->n);
		iov[0].iov_len = read_count;
603
		iovcount = 1;
Michael Graff's avatar
fix    
Michael Graff committed
604
605
606
607
608
609
610
611
612

		goto config;
	}

	/*
	 * Multibuffer I/O.
	 * Skip empty buffers.
	 */
	while (buffer != NULL) {
613
		REQUIRE(ISC_BUFFER_VALID(buffer));
614
		if (isc_buffer_availablelength(buffer) != 0)
Michael Graff's avatar
fix    
Michael Graff committed
615
616
617
618
619
620
			break;
		buffer = ISC_LIST_NEXT(buffer, link);
	}

	iovcount = 0;
	while (buffer != NULL) {
621
		INSIST(iovcount < MAXSCATTERGATHER_RECV);
Michael Graff's avatar
fix    
Michael Graff committed
622

623
		isc_buffer_availableregion(buffer, &available);
Michael Graff's avatar
fix    
Michael Graff committed
624
625
626
627
628
629
630
631
632
633

		if (available.length > 0) {
			iov[iovcount].iov_base = (void *)(available.base);
			iov[iovcount].iov_len = available.length;
			read_count += available.length;
			iovcount++;
		}
		buffer = ISC_LIST_NEXT(buffer, link);
	}

634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
 config:

	/*
	 * If needed, set up to receive that one extra byte.  Note that
	 * we know there is at least one iov left, since we stole it
	 * at the top of this function.
	 */
#ifdef ISC_NET_RECVOVERFLOW
	if (sock->type == isc_sockettype_udp) {
		iov[iovcount].iov_base = (void *)(&sock->overflow);
		iov[iovcount].iov_len = 1;
		iovcount++;
	}
#endif

Michael Graff's avatar
fix    
Michael Graff committed
649
650
651
652
653
654
	msg->msg_iov = iov;
	msg->msg_iovlen = iovcount;

#ifdef ISC_NET_BSD44MSGHDR
	msg->msg_control = NULL;
	msg->msg_controllen = 0;
655
656
	msg->msg_flags = 0;
#if defined(USE_CMSG)
Michael Graff's avatar
Michael Graff committed
657
	if (sock->type == isc_sockettype_udp) {
658
659
		msg->msg_control = (void *)sock->cmsg;
		msg->msg_controllen = sock->cmsglen;
Michael Graff's avatar
Michael Graff committed
660
	}
661
662
#endif /* USE_CMSG */
#else /* ISC_NET_BSD44MSGHDR */
Michael Graff's avatar
fix    
Michael Graff committed
663
664
	msg->msg_accrights = NULL;
	msg->msg_accrightslen = 0;
665
#endif /* ISC_NET_BSD44MSGHDR */
Michael Graff's avatar
fix    
Michael Graff committed
666
667
668
669
670

	if (read_countp != NULL)
		*read_countp = read_count;
}

671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
/*
 * Fill in dev->address for an I/O event on 'sock'.
 *
 * TCP:   'address' must be NULL; the socket's own address is used.
 * UDP:   the caller's 'address' is used if given, otherwise the
 *        socket's own address.
 * Other socket types leave dev->address untouched.
 */
static void
set_dev_address(isc_sockaddr_t *address, isc_socket_t *sock,
		isc_socketevent_t *dev)
{
	if (sock->type != isc_sockettype_udp) {
		if (sock->type == isc_sockettype_tcp) {
			INSIST(address == NULL);
			dev->address = sock->address;
		}
		return;
	}

	dev->address = (address != NULL) ? *address : sock->address;
}

686
687
static isc_socketevent_t *
allocate_socketevent(isc_socket_t *sock, isc_eventtype_t eventtype,
David Lawrence's avatar
David Lawrence committed
688
		     isc_taskaction_t action, const void *arg)
689
690
691
692
693
694
695
696
697
698
699
700
{
	isc_socketevent_t *ev;

	ev = (isc_socketevent_t *)isc_event_allocate(sock->manager->mctx,
						     sock, eventtype,
						     action, arg,
						     sizeof (*ev));

	if (ev == NULL)
		return (NULL);

	ev->result = ISC_R_UNEXPECTED;
701
	ISC_LINK_INIT(ev, ev_link);
702
703
	ISC_LIST_INIT(ev->bufferlist);
	ev->region.base = NULL;
Michael Graff's avatar
fix    
Michael Graff committed
704
705
	ev->n = 0;
	ev->offset = 0;
706
	ev->attributes = 0;
707
708
709
710

	return (ev);
}

711
712
#if defined(ISC_SOCKET_DEBUG)
/*
 * Debugging aid: print the contents of a msghdr (name, iovec array and,
 * with ISC_NET_BSD44MSGHDR, the control buffer) to stdout.
 *
 * The msghdr length fields have platform-dependent integer types, so
 * they are cast to a known type before printing; pointers are cast to
 * void * as %p requires.
 */
static void
dump_msg(struct msghdr *msg) {
	unsigned int i;

	printf("MSGHDR %p\n", (void *)msg);
	printf("\tname %p, namelen %lu\n", msg->msg_name,
	       (unsigned long)msg->msg_namelen);
	printf("\tiov %p, iovlen %lu\n", (void *)msg->msg_iov,
	       (unsigned long)msg->msg_iovlen);
	for (i = 0 ; i < (unsigned int)msg->msg_iovlen ; i++)
		printf("\t\t%u\tbase %p, len %lu\n", i,
		       (void *)msg->msg_iov[i].iov_base,
		       (unsigned long)msg->msg_iov[i].iov_len);
#ifdef ISC_NET_BSD44MSGHDR
	printf("\tcontrol %p, controllen %lu\n", (void *)msg->msg_control,
	       (unsigned long)msg->msg_controllen);
#endif
}
#endif

Michael Graff's avatar
Michael Graff committed
730
731
732
733
734
/*
 * Return codes of doio_recv() and doio_send().  Each says whether the
 * I/O succeeded and whether the done event has already been sent.
 */
#define DOIO_SUCCESS		0	/* i/o ok, event sent */
#define DOIO_SOFT		1	/* i/o ok, soft error, no event sent */
#define DOIO_HARD		2	/* i/o error, event sent */
#define DOIO_EOF		3	/* EOF, no event sent */

735
static int
736
doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
737
	int cc;
738
	struct iovec iov[MAXSCATTERGATHER_RECV];
739
	size_t read_count;
740
	size_t actual_count;
741
	struct msghdr msghdr;
742
	isc_buffer_t *buffer;
743

744
	build_msghdr_recv(sock, dev, &msghdr, iov, &read_count);
745
746
747
748

#if defined(ISC_SOCKET_DEBUG)
	dump_msg(&msghdr);
#endif
749
750
751
752
753
754
755

	cc = recvmsg(sock->fd, &msghdr, 0);

	if (cc < 0) {
		if (SOFT_ERROR(errno))
			return (DOIO_SOFT);

756
757
758
759
		if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL))
			socket_log(sock, NULL, IOEVENT,
				   "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
				   sock->fd, cc, errno, strerror(errno));
760

761
762
763
764
765
766
#define SOFT_OR_HARD(_system, _isc) \
	if (errno == _system) { \
		if (sock->connected) { \
			if (sock->type == isc_sockettype_tcp) \
				sock->recv_result = _isc; \
			send_recvdone_event(sock, &dev, _isc); \
Michael Graff's avatar
Michael Graff committed
767
			return (DOIO_HARD); \
768
		} \
Michael Graff's avatar
Michael Graff committed
769
		return (DOIO_SOFT); \
770
	}
Michael Graff's avatar
Michael Graff committed
771
772
773
774
775
776
#define ALWAYS_HARD(_system, _isc) \
	if (errno == _system) { \
		sock->recv_result = _isc; \
		send_recvdone_event(sock, &dev, _isc); \
		return (DOIO_HARD); \
	}
777
778

		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
779
780
		ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
		ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
Michael Graff's avatar
Michael Graff committed
781
		ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
782

Michael Graff's avatar
Michael Graff committed
783
784
#undef SOFT_OR_HARD
#undef ALWAYS_HARD
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800

		sock->recv_result = ISC_R_UNEXPECTED;
		send_recvdone_event(sock, &dev, ISC_R_UNEXPECTED);
		return (DOIO_SUCCESS);
	}

	/*
	 * On TCP, zero length reads indicate EOF, while on
	 * UDP, zero length reads are perfectly valid, although
	 * strange.
	 */
	if ((sock->type == isc_sockettype_tcp) && (cc == 0)) {
		sock->recv_result = ISC_R_EOF;
		return (DOIO_EOF);
	}

Michael Graff's avatar
Michael Graff committed
801
802
803
	if (sock->type == isc_sockettype_udp)
		dev->address.length = msghdr.msg_namelen;

Michael Graff's avatar
Michael Graff committed
804
805
	socket_log(sock, &dev->address, IOEVENT, "packet received correctly");

806
807
808
809
810
811
812
813
814
815
816
817
	/*
	 * Overflow bit detection.  If we received MORE bytes than we should,
	 * this indicates an overflow situation.  Set the flag in the
	 * dev entry and adjust how much we read by one.
	 */
#ifdef ISC_NET_RECVOVERFLOW
	if ((sock->type == isc_sockettype_udp) && ((size_t)cc > read_count)) {
		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
		cc--;
	}
#endif

818
819
820
821
	/*
	 * If there are control messages attached, run through them and pull
	 * out the interesting bits.
	 */
Michael Graff's avatar
Michael Graff committed
822
823
	if (sock->type == isc_sockettype_udp)
		process_cmsg(sock, &msghdr, dev);
824

825
826
827
828
829
830
831
	/*
	 * update the buffers (if any) and the i/o count
	 */
	dev->n += cc;
	actual_count = cc;
	buffer = ISC_LIST_HEAD(dev->bufferlist);
	while (buffer != NULL && actual_count > 0) {
832
		REQUIRE(ISC_BUFFER_VALID(buffer));
833
834
		if (isc_buffer_availablelength(buffer) <= actual_count) {
			actual_count -= isc_buffer_availablelength(buffer);
835
			isc_buffer_add(buffer,
836
				       isc_buffer_availablelength(buffer));
837
838
839
840
841
842
843
844
845
846
847
		} else {
			isc_buffer_add(buffer, actual_count);
			actual_count = 0;
			break;
		}
		buffer = ISC_LIST_NEXT(buffer, link);
		if (buffer == NULL) {
			INSIST(actual_count == 0);
		}
	}

848
849
850
851
852
853
854
855
856
857
858
	/*
	 * If we read less than we expected, update counters,
	 * and let the upper layer poke the descriptor.
	 */
	if (((size_t)cc != read_count) && (dev->n < dev->minimum))
		return (DOIO_SOFT);

	/*
	 * full reads are posted, or partials if partials are ok.
	 */
	send_recvdone_event(sock, &dev, ISC_R_SUCCESS);
Michael Graff's avatar
Michael Graff committed
859
860
861
	return (DOIO_SUCCESS);
}

Andreas Gustafsson's avatar
Andreas Gustafsson committed
862
863
864
865
866
867
868
869
870
871
872
873
874
/*
 * Returns:
 *	DOIO_SUCCESS	The operation succeeded.  The senddone event
 *			was sent.
 *
 *	DOIO_HARD	A hard or unexpected I/O error was encountered.
 *			The senddone event was sent.
 *
 *	DOIO_SOFT	A soft I/O error was encountered.  No senddone
 *			event was sent.  The operation should be retried.
 *
 *	No other return values are possible.
 */
Michael Graff's avatar
Michael Graff committed
875
static int
876
doio_send(isc_socket_t *sock, isc_socketevent_t *dev) {
Michael Graff's avatar
Michael Graff committed
877
	int cc;
878
	struct iovec iov[MAXSCATTERGATHER_SEND];
Michael Graff's avatar
Michael Graff committed
879
880
881
	size_t write_count;
	struct msghdr msghdr;

882
	build_msghdr_send(sock, dev, &msghdr, iov, &write_count);
Michael Graff's avatar
Michael Graff committed
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902

	cc = sendmsg(sock->fd, &msghdr, 0);

	/*
	 * check for error or block condition
	 */
	if (cc < 0) {
		if (SOFT_ERROR(errno))
			return (DOIO_SOFT);

#define SOFT_OR_HARD(_system, _isc) \
	if (errno == _system) { \
		if (sock->connected) { \
			if (sock->type == isc_sockettype_tcp) \
				sock->send_result = _isc; \
			send_senddone_event(sock, &dev, _isc); \
			return (DOIO_HARD); \
		} \
		return (DOIO_SOFT); \
	}
Michael Graff's avatar
Michael Graff committed
903
904
#define ALWAYS_HARD(_system, _isc) \
	if (errno == _system) { \
905
906
		if (sock->connected && sock->type == isc_sockettype_tcp) \
			sock->send_result = _isc; \
Michael Graff's avatar
Michael Graff committed
907
908
909
		send_senddone_event(sock, &dev, _isc); \
		return (DOIO_HARD); \
	}
Michael Graff's avatar
Michael Graff committed
910
911

		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
912
913
		ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
		ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
Michael Graff's avatar
Michael Graff committed
914
		ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
915
		ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
Michael Graff's avatar
Michael Graff committed
916

Michael Graff's avatar
Michael Graff committed
917
918
#undef SOFT_OR_HARD
#undef ALWAYS_HARD
Michael Graff's avatar
Michael Graff committed
919
920
921
922
923
924
925
926
927
928
929
930
931

		/*
		 * The other error types depend on whether or not the
		 * socket is UDP or TCP.  If it is UDP, some errors
		 * that we expect to be fatal under TCP are merely
		 * annoying, and are really soft errors.
		 *
		 * However, these soft errors are still returned as
		 * a status.
		 */
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "internal_send: %s",
				 strerror(errno));
932
933
		if (sock->connected && sock->type == isc_sockettype_tcp)
			sock->send_result = ISC_R_UNEXPECTED;
Michael Graff's avatar
Michael Graff committed
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
		send_senddone_event(sock, &dev, ISC_R_UNEXPECTED);
		return (DOIO_HARD);
	}

	if (cc == 0)
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "internal_send: send() returned 0");

	/*
	 * if we write less than we expected, update counters,
	 * poke.
	 */
	dev->n += cc;
	if ((size_t)cc != write_count)
		return (DOIO_SOFT);
949

Michael Graff's avatar
Michael Graff committed
950
951
952
953
954
	/*
	 * Exactly what we wanted to write.  We're done with this
	 * entry.  Post its completion event.
	 */
	send_senddone_event(sock, &dev, ISC_R_SUCCESS);
955
956
957
	return (DOIO_SUCCESS);
}

958
959
960
/*
 * Kill.
 *
961
962
 * Caller must ensure that the socket is not locked and no external
 * references exist.
963
964
 */
static void
965
destroy(isc_socket_t **sockp) {
Bob Halley's avatar
Bob Halley committed
966
967
	isc_socket_t *sock = *sockp;
	isc_socketmgr_t *manager = sock->manager;
968

Michael Graff's avatar
Michael Graff committed
969
	socket_log(sock, NULL, CREATION, "destroying");
Michael Graff's avatar
Michael Graff committed
970

971
972
973
974
975
	INSIST(ISC_LIST_EMPTY(sock->accept_list));
	INSIST(ISC_LIST_EMPTY(sock->recv_list));
	INSIST(ISC_LIST_EMPTY(sock->send_list));
	INSIST(sock->connect_ev == NULL);

976
977
978
	LOCK(&manager->lock);

	/*
Bob Halley's avatar
Bob Halley committed
979
	 * No one has this socket open, so the watcher doesn't have to be
Michael Graff's avatar
Michael Graff committed
980
	 * poked, and the socket doesn't have to be locked.
981
	 */
Michael Graff's avatar
Michael Graff committed
982
	manager->fds[sock->fd] = NULL;
Michael Graff's avatar
Michael Graff committed
983
	manager->fdstate[sock->fd] = CLOSE_PENDING;
984
985
	select_poke(manager, sock->fd);
	ISC_LIST_UNLINK(manager->socklist, sock, link);
Michael Graff's avatar
Michael Graff committed
986

987
	if (ISC_LIST_EMPTY(manager->socklist))
988
		SIGNAL(&manager->shutdown_ok);
989

990
991
992
993
	/*
	 * XXX should reset manager->maxfd here
	 */

994
995
	UNLOCK(&manager->lock);

Michael Graff's avatar
Michael Graff committed
996
	free_socket(sockp);
Michael Graff's avatar
Michael Graff committed
997
998
999
}

static isc_result_t
Bob Halley's avatar
Bob Halley committed
1000
allocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type,