/*
 * Copyright (C) 1998-2000  Internet Software Consortium.
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND INTERNET SOFTWARE CONSORTIUM
 * DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL
 * INTERNET SOFTWARE CONSORTIUM BE LIABLE FOR ANY SPECIAL, DIRECT,
 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING
 * FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
 * NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
 * WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
Bob Halley's avatar
Bob Halley committed
17

Andreas Gustafsson's avatar
Andreas Gustafsson committed
18
/* $Id: socket.c,v 1.163 2000/08/31 17:10:58 gson Exp $ */
David Lawrence's avatar
David Lawrence committed
19

Bob Halley's avatar
Bob Halley committed
20
#include <config.h>
21

22
#include <sys/param.h>
Michael Graff's avatar
Michael Graff committed
23
#include <sys/types.h>
Michael Graff's avatar
Michael Graff committed
24
25
#include <sys/socket.h>
#include <sys/time.h>
Michael Graff's avatar
Michael Graff committed
26
27
#include <sys/uio.h>

28
29
30
31
32
#include <errno.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
Michael Graff's avatar
Michael Graff committed
33
#include <fcntl.h>
34

35
#include <isc/buffer.h>
36
#include <isc/bufferlist.h>
37
38
#include <isc/condition.h>
#include <isc/list.h>
Michael Graff's avatar
Michael Graff committed
39
#include <isc/log.h>
40
#include <isc/mem.h>
41
#include <isc/mutex.h>
42
#include <isc/net.h>
43
#include <isc/platform.h>
Michael Graff's avatar
Michael Graff committed
44
#include <isc/print.h>
45
#include <isc/region.h>
46
#include <isc/socket.h>
47
#include <isc/task.h>
48
#include <isc/thread.h>
Michael Graff's avatar
Michael Graff committed
49
#include <isc/util.h>
Bob Halley's avatar
Bob Halley committed
50

51
52
53
54
#ifndef ISC_PLATFORM_USETHREADS
#include "socket_p.h"
#endif

55
56
/*
 * Some systems define the socket length argument as an int, some as size_t,
 * some as socklen_t.  This is here so it can be easily changed if needed.
 */
59
#ifndef ISC_SOCKADDR_LEN_T
60
#define ISC_SOCKADDR_LEN_T unsigned int
61
#endif
62

63
64
65
/*
 * Define what the possible "soft" errors can be.  These are non-fatal returns
 * of various network related functions, like recv() and so on.
 *
 * For some reason, BSDI (and perhaps others) will sometimes return <0
 * from recv() but will have errno==0.  This is broken, but we have to
 * work around it here.
 */
71
72
73
74
#define SOFT_ERROR(e)	((e) == EAGAIN || \
			 (e) == EWOULDBLOCK || \
			 (e) == EINTR || \
			 (e) == 0)
75

Michael Graff's avatar
Michael Graff committed
76
#define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)
77

Michael Graff's avatar
Michael Graff committed
78
79
80
81
82
83
84
/*
 * DLVL(90)  --  Function entry/exit and other tracing.
 * DLVL(70)  --  Socket "correctness" -- including returning of events, etc.
 * DLVL(60)  --  Socket data send/receive
 * DLVL(50)  --  Event tracing, including receiving/sending completion events.
 * DLVL(20)  --  Socket creation/destruction.
 */
85
86
87
88
89
90
91
92
93
94
95
#define TRACE_LEVEL		90
#define CORRECTNESS_LEVEL	70
#define IOEVENT_LEVEL		60
#define EVENT_LEVEL		50
#define CREATION_LEVEL		20

#define TRACE		DLVL(TRACE_LEVEL)
#define CORRECTNESS	DLVL(CORRECTNESS_LEVEL)
#define IOEVENT		DLVL(IOEVENT_LEVEL)
#define EVENT		DLVL(EVENT_LEVEL)
#define CREATION	DLVL(CREATION_LEVEL)
96

97
typedef isc_event_t intev_t;
Michael Graff's avatar
Michael Graff committed
98
99
100

#define SOCKET_MAGIC		0x494f696fU	/* IOio */
#define VALID_SOCKET(t)		((t) != NULL && (t)->magic == SOCKET_MAGIC)
Michael Graff's avatar
Michael Graff committed
101

Michael Graff's avatar
Michael Graff committed
102
103
104
105
106
107
108
109
110
111
112
113
/*
 * IPv6 control information.  If the socket is an IPv6 socket we want
 * to collect the destination address and interface so the client can
 * set them on outgoing packets.
 */
#ifdef ISC_PLATFORM_HAVEIPV6
#ifndef USE_CMSG
#define USE_CMSG	1
#endif
#endif

/*
 * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
 * a setsockopt() like interface to request timestamps, and if the OS
 * doesn't do it for us, call gettimeofday() on every UDP receive?
 */
#ifdef SO_TIMESTAMP
#ifndef USE_CMSG
#define USE_CMSG	1
#endif
#endif

124
125
126
127
128
129
130
131
/*
 * Check to see if we have even basic support for cracking messages from
 * the control data returned from/sent via recvmsg()/sendmsg().
 */
#if defined(USE_CMSG) && (!defined(CMSG_LEN) || !defined(CMSG_SPACE))
#undef USE_CMSG
#endif

132
133
struct isc_socket {
	/* Not locked. */
134
135
136
137
	unsigned int		magic;
	isc_socketmgr_t	       *manager;
	isc_mutex_t		lock;
	isc_sockettype_t	type;
Michael Graff's avatar
Michael Graff committed
138

139
	/* Locked by socket lock. */
140
	ISC_LINK(isc_socket_t)	link;
141
142
	unsigned int		references;
	int			fd;
143
	int			pf;
144
145
	isc_result_t		recv_result;
	isc_result_t		send_result;
146

147
	ISC_LIST(isc_socketevent_t)		send_list;
148
	ISC_LIST(isc_socketevent_t)		recv_list;
149
	ISC_LIST(isc_socket_newconnev_t)	accept_list;
150
151
152
153
154
155
156
	isc_socket_connev_t		       *connect_ev;

	/*
	 * Internal events.  Posted when a descriptor is readable or
	 * writable.  These are statically allocated and never freed.
	 * They will be set to non-purgable before use.
	 */
157
158
	intev_t			readable_ev;
	intev_t			writable_ev;
159

160
	isc_sockaddr_t		address;  /* remote address */
161

162
163
164
165
166
167
168
	unsigned int		pending_recv : 1,
				pending_send : 1,
				pending_accept : 1,
				listener : 1, /* listener socket */
				connected : 1,
				connecting : 1, /* connect pending */
				bound : 1; /* bound to local addr */
169

170
#ifdef ISC_NET_RECVOVERFLOW
171
	unsigned char		overflow; /* used for MSG_TRUNC fake */
172
#endif
Michael Graff's avatar
Michael Graff committed
173
#ifdef USE_CMSG
174
175
	unsigned char	       *cmsg;
	unsigned int		cmsglen;
176
#endif
177
178
};

179
180
181
#define SOCKET_MANAGER_MAGIC	0x494f6d67U	/* IOmg */
#define VALID_MANAGER(m)	((m) != NULL && \
				 (m)->magic == SOCKET_MANAGER_MAGIC)
182
183
struct isc_socketmgr {
	/* Not locked. */
184
185
186
	unsigned int		magic;
	isc_mem_t	       *mctx;
	isc_mutex_t		lock;
187
	/* Locked by manager lock. */
188
	ISC_LIST(isc_socket_t)	socklist;
189
190
191
192
193
	fd_set			read_fds;
	fd_set			write_fds;
	isc_socket_t	       *fds[FD_SETSIZE];
	int			fdstate[FD_SETSIZE];
	int			maxfd;
194
195
196
#ifdef ISC_PLATFORM_USETHREADS
	isc_thread_t		watcher;
	isc_condition_t		shutdown_ok;
197
	int			pipe_fds[2];
198
199
200
#else
	unsigned int		refs;
#endif
201
202
};

203
204
205
206
#ifndef ISC_PLATFORM_USETHREADS
static isc_socketmgr_t *socketmgr = NULL;
#endif

Michael Graff's avatar
Michael Graff committed
207
208
209
210
#define CLOSED		0	/* this one must be zero */
#define MANAGED		1
#define CLOSE_PENDING	2

211
212
213
214
215
216
217
218
219
220
/*
 * send() and recv() iovec counts
 */
#define MAXSCATTERGATHER_SEND	(ISC_SOCKET_MAXSCATTERGATHER)
#ifdef ISC_NET_RECVOVERFLOW
# define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER + 1)
#else
# define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER)
#endif

221
222
static void send_recvdone_event(isc_socket_t *, isc_socketevent_t **,
				isc_result_t);
Michael Graff's avatar
Michael Graff committed
223
224
static void send_senddone_event(isc_socket_t *, isc_socketevent_t **,
				isc_result_t);
Bob Halley's avatar
Bob Halley committed
225
226
227
228
static void free_socket(isc_socket_t **);
static isc_result_t allocate_socket(isc_socketmgr_t *, isc_sockettype_t,
				    isc_socket_t **);
static void destroy(isc_socket_t **);
229
230
231
232
static void internal_accept(isc_task_t *, isc_event_t *);
static void internal_connect(isc_task_t *, isc_event_t *);
static void internal_recv(isc_task_t *, isc_event_t *);
static void internal_send(isc_task_t *, isc_event_t *);
233
static void process_cmsg(isc_socket_t *, struct msghdr *, isc_socketevent_t *);
Michael Graff's avatar
Michael Graff committed
234
static void build_msghdr_send(isc_socket_t *, isc_socketevent_t *,
235
			      struct msghdr *, struct iovec *, size_t *);
Michael Graff's avatar
Michael Graff committed
236
static void build_msghdr_recv(isc_socket_t *, isc_socketevent_t *,
237
			      struct msghdr *, struct iovec *, size_t *);
Michael Graff's avatar
Michael Graff committed
238
239
240

#define SELECT_POKE_SHUTDOWN		(-1)
#define SELECT_POKE_NOTHING		(-2)
241

242
243
#define SOCK_DEAD(s)			((s)->references == 0)

Michael Graff's avatar
Michael Graff committed
244
245
246
247
248
249
250
251
/*
 * Format a printf-style message and write it to the log, prefixed with
 * the address of the socket manager it concerns.  Nothing is formatted
 * if the log context would not log at 'level'.
 */
static void
manager_log(isc_socketmgr_t *sockmgr,
	    isc_logcategory_t *category, isc_logmodule_t *module, int level,
	    const char *fmt, ...)
{
	char text[2048];
	va_list args;

	if (isc_log_wouldlog(isc_lctx, level)) {
		va_start(args, fmt);
		vsnprintf(text, sizeof(text), fmt, args);
		va_end(args);

		isc_log_write(isc_lctx, category, module, level,
			      "sockmgr %p: %s", sockmgr, text);
	}
}

/*
 * Format a printf-style message and write it to the log, prefixed with
 * the address of the socket it concerns and, when 'address' is not NULL,
 * the formatted peer address.  Nothing is formatted if the log context
 * would not log at 'level'.
 */
static void
socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
	   const char *fmt, ...)
{
	char text[2048];
	va_list args;

	if (isc_log_wouldlog(isc_lctx, level)) {
		va_start(args, fmt);
		vsnprintf(text, sizeof(text), fmt, args);
		va_end(args);

		if (address != NULL) {
			char peer[256];

			isc_sockaddr_format(address, peer, sizeof peer);
			isc_log_write(isc_lctx, category, module, level,
				      "socket %p %s: %s", sock, peer, text);
		} else {
			isc_log_write(isc_lctx, category, module, level,
				      "socket %p: %s", sock, text);
		}
	}
}

289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
/*
 * Re-evaluate whether descriptor 'fd' should be present in the manager's
 * read and write fd sets.
 *
 * A descriptor in the CLOSE_PENDING state is removed from both sets,
 * closed, and marked CLOSED.  A descriptor that is not MANAGED is left
 * alone.  Otherwise the socket's request queues and pending flags decide
 * which sets the descriptor belongs in.
 */
static void
wakeup_socket(isc_socketmgr_t *manager, int fd) {
	isc_event_t *ev2;
	isc_socketevent_t *rev;
	isc_socket_t *sock;

	/*
	 * This is a wakeup on a socket.  Look at the event queue for both
	 * read and write, and decide if we need to watch on it now or not.
	 *
	 * 'fd' indexes fdstate[] and fds[], so both bounds must hold.
	 */
	INSIST(fd >= 0 && fd < FD_SETSIZE);

	if (manager->fdstate[fd] == CLOSE_PENDING) {
		manager->fdstate[fd] = CLOSED;
		FD_CLR(fd, &manager->read_fds);
		FD_CLR(fd, &manager->write_fds);
		close(fd);
		return;
	}
	if (manager->fdstate[fd] != MANAGED)
		return;

	sock = manager->fds[fd];

	/*
	 * If there are no events, or there is an event but we
	 * have already queued up the internal event on a task's
	 * queue, clear the bit.  Otherwise, set it.
	 */
	rev = ISC_LIST_HEAD(sock->recv_list);
	ev2 = (isc_event_t *) ISC_LIST_HEAD(sock->accept_list);
	if ((rev == NULL && ev2 == NULL)
	    || sock->pending_recv || sock->pending_accept)
		FD_CLR(sock->fd, &manager->read_fds);
	else
		FD_SET(sock->fd, &manager->read_fds);

	/*
	 * A pending connect always needs write notification, even with
	 * an empty send queue.
	 */
	rev = ISC_LIST_HEAD(sock->send_list);
	if ((rev == NULL || sock->pending_send) && !sock->connecting)
		FD_CLR(sock->fd, &manager->write_fds);
	else
		FD_SET(sock->fd, &manager->write_fds);
}

#ifdef ISC_PLATFORM_USETHREADS
334
/*
Michael Graff's avatar
Michael Graff committed
335
336
337
 * Poke the select loop when there is something for us to do.
 * We assume that if a write completes here, it will be inserted into the
 * queue fully.  That is, we will not get partial writes.
338
339
 */
static void
340
select_poke(isc_socketmgr_t *mgr, int msg) {
Michael Graff's avatar
Michael Graff committed
341
342
	int cc;

343
344
345
	do {
		cc = write(mgr->pipe_fds[1], &msg, sizeof(int));
	} while (cc < 0 && SOFT_ERROR(errno));
346
			        
347
	if (cc < 0)
Michael Graff's avatar
Michael Graff committed
348
349
350
		FATAL_ERROR(__FILE__, __LINE__,
			    "write() failed during watcher poke: %s",
			    strerror(errno));
351
352

	INSIST(cc == sizeof(int));
353
354
355
}

/*
Andreas Gustafsson's avatar
Andreas Gustafsson committed
356
 * Read a message on the internal fd.
357
 */
Michael Graff's avatar
Michael Graff committed
358
static int
359
select_readmsg(isc_socketmgr_t *mgr) {
Michael Graff's avatar
Michael Graff committed
360
	int msg;
Michael Graff's avatar
Michael Graff committed
361
362
	int cc;

Michael Graff's avatar
Michael Graff committed
363
	cc = read(mgr->pipe_fds[0], &msg, sizeof(int));
Michael Graff's avatar
Michael Graff committed
364
	if (cc < 0) {
Michael Graff's avatar
Michael Graff committed
365
		if (SOFT_ERROR(errno))
Michael Graff's avatar
Michael Graff committed
366
			return (SELECT_POKE_NOTHING);
Michael Graff's avatar
Michael Graff committed
367

Michael Graff's avatar
Michael Graff committed
368
369
370
		FATAL_ERROR(__FILE__, __LINE__,
			    "read() failed during watcher poke: %s",
			    strerror(errno));
371
		
Michael Graff's avatar
Michael Graff committed
372
		return (SELECT_POKE_NOTHING);
Michael Graff's avatar
Michael Graff committed
373
	}
374

Michael Graff's avatar
Michael Graff committed
375
	return (msg);
376
}
377
378
379
380
381
382
383
384
385
386
387
388
389
#else
/*
 * Update the state of the socketmgr when something changes.
 *
 * A non-negative 'msg' is a file descriptor whose watch state is
 * re-evaluated immediately; SELECT_POKE_SHUTDOWN and any other negative
 * message are ignored.
 */
static void
select_poke(isc_socketmgr_t *manager, int msg) {
	if (msg != SELECT_POKE_SHUTDOWN && msg >= 0)
		wakeup_socket(manager, msg);
}
#endif
390
391

/*
Andreas Gustafsson's avatar
Andreas Gustafsson committed
392
 * Make a fd non-blocking.
393
 */
Michael Graff's avatar
Michael Graff committed
394
static isc_result_t
395
make_nonblock(int fd) {
Michael Graff's avatar
Michael Graff committed
396
397
	int ret;
	int flags;
398

Michael Graff's avatar
Michael Graff committed
399
400
401
	flags = fcntl(fd, F_GETFL, 0);
	flags |= O_NONBLOCK;
	ret = fcntl(fd, F_SETFL, flags);
402

Michael Graff's avatar
Michael Graff committed
403
404
405
406
	if (ret == -1) {
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "fcntl(%d, F_SETFL, %d): %s",
				 fd, flags, strerror(errno));
Michael Graff's avatar
Michael Graff committed
407

Michael Graff's avatar
Michael Graff committed
408
		return (ISC_R_UNEXPECTED);
Michael Graff's avatar
Michael Graff committed
409
410
	}

Michael Graff's avatar
Michael Graff committed
411
	return (ISC_R_SUCCESS);
412
413
}

414
415
416
417
/*
 * Process control messages received on a socket.
 */
static void
418
process_cmsg(isc_socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
Michael Graff's avatar
Michael Graff committed
419
#ifdef USE_CMSG
420
	struct cmsghdr *cmsgp;
Michael Graff's avatar
Michael Graff committed
421
422
423
424
425
426
427
428
#ifdef ISC_PLATFORM_HAVEIPV6
	struct in6_pktinfo *pktinfop;
#endif
#ifdef SO_TIMESTAMP
	struct timeval *timevalp;
#endif
#endif

429
430
431
432
433
434
	/*
	 * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined.
	 * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined.
	 * They are all here, outside of the CPP tests, because it is
	 * more consistent with the usual ISC coding style.
	 */
435
	UNUSED(sock);
436
437
438
439
440
441
442
	UNUSED(msg);
	UNUSED(dev);

#ifndef ISC_NET_BSD44MSGHDR
	return;

#else  /* defined ISC_NET_BSD44MSGHDR */
443

Bob Halley's avatar
Bob Halley committed
444
#ifdef MSG_TRUNC
445
446
	if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC)
		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
Bob Halley's avatar
Bob Halley committed
447
#endif
448

Bob Halley's avatar
Bob Halley committed
449
#ifdef MSG_CTRUNC
450
451
	if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC)
		dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
Bob Halley's avatar
Bob Halley committed
452
#endif
453

454
455
456
457
458
459
	/*
	 * Check for multicast.
	 */
	if (isc_sockaddr_ismulticast(&dev->address))
		dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;

Michael Graff's avatar
Michael Graff committed
460
461
462
#ifndef USE_CMSG
	return;
#else
463
464
	if (msg->msg_controllen == 0 || msg->msg_control == NULL)
		return;
Michael Graff's avatar
Michael Graff committed
465
466
467
468
469
470
471
472
473
474

#ifdef SO_TIMESTAMP
	timevalp = NULL;
#endif
#ifdef ISC_PLATFORM_HAVEIPV6
	pktinfop = NULL;
#endif

	cmsgp = CMSG_FIRSTHDR(msg);
	while (cmsgp != NULL) {
475
		socket_log(sock, NULL, TRACE, "processing cmsg %p", cmsgp);
Michael Graff's avatar
Michael Graff committed
476
477

#ifdef ISC_PLATFORM_HAVEIPV6
478
479
		if (cmsgp->cmsg_level == IPPROTO_IPV6
		    && cmsgp->cmsg_type == IPV6_PKTINFO) {
480

Michael Graff's avatar
Michael Graff committed
481
			pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
482
483
			memcpy(&dev->pktinfo, pktinfop,
			       sizeof(struct in6_pktinfo));
Michael Graff's avatar
Michael Graff committed
484
			dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
485
			socket_log(sock, NULL, TRACE,
David Lawrence's avatar
David Lawrence committed
486
487
				   "interface received on ifindex %u",
				   dev->pktinfo.ipi6_ifindex);
Michael Graff's avatar
Michael Graff committed
488
489
490
491
492
			goto next;
		}
#endif

#ifdef SO_TIMESTAMP
493
494
		if (cmsgp->cmsg_level == SOL_SOCKET
		    && cmsgp->cmsg_type == SCM_TIMESTAMP) {
Michael Graff's avatar
Michael Graff committed
495
496
497
498
499
500
501
502
503
504
505
506
507
			timevalp = (struct timeval *)CMSG_DATA(cmsgp);
			dev->timestamp.seconds = timevalp->tv_sec;
			dev->timestamp.nanoseconds = timevalp->tv_usec * 1000;
			dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
			goto next;
		}
#endif

	next:
		cmsgp = CMSG_NXTHDR(msg, cmsgp);
	}
#endif /* USE_CMSG */

508
#endif /* ISC_NET_BSD44MSGHDR */
509
510
511

}

512
513
514
/*
 * Construct an iov array and attach it to the msghdr passed in.  Return
 * 0 on success, non-zero on failure.  This is the SEND constructor, which
515
516
 * will used the used region of the buffer (if using a buffer list) or
 * will use the internal region (if a single buffer I/O is requested).
517
518
519
 *
 * Nothing can be NULL, and the done event must list at least one buffer
 * on the buffer linked list for this function to be meaningful.
Michael Graff's avatar
fix    
Michael Graff committed
520
521
522
 *
 * If write_countp != NULL, *write_countp will hold the number of bytes
 * this transaction can send.
523
 */
Michael Graff's avatar
Michael Graff committed
524
static void
525
build_msghdr_send(isc_socket_t *sock, isc_socketevent_t *dev,
526
		  struct msghdr *msg, struct iovec *iov, size_t *write_countp)
527
528
529
530
{
	unsigned int iovcount;
	isc_buffer_t *buffer;
	isc_region_t used;
531
532
533
	size_t write_count;
	size_t skip_count;

Michael Graff's avatar
fix    
Michael Graff committed
534
	memset(msg, 0, sizeof (*msg));
535
536
537
538

	if (sock->type == isc_sockettype_udp) {
		msg->msg_name = (void *)&dev->address.type.sa;
		msg->msg_namelen = dev->address.length;
Michael Graff's avatar
fix    
Michael Graff committed
539
540
541
	} else {
		msg->msg_name = NULL;
		msg->msg_namelen = 0;
542
	}
543
544

	buffer = ISC_LIST_HEAD(dev->bufferlist);
545
	write_count = 0;
Michael Graff's avatar
fix    
Michael Graff committed
546
	iovcount = 0;
547

548
	/*
549
	 * Single buffer I/O?  Skip what we've done so far in this region.
550
551
	 */
	if (buffer == NULL) {
552
553
554
		write_count = dev->region.length - dev->n;
		iov[0].iov_base = (void *)(dev->region.base + dev->n);
		iov[0].iov_len = write_count;
Michael Graff's avatar
fix    
Michael Graff committed
555
		iovcount = 1;
556

557
558
559
560
561
562
563
		goto config;
	}

	/*
	 * Multibuffer I/O.
	 * Skip the data in the buffer list that we have already written.
	 */
Michael Graff's avatar
fix    
Michael Graff committed
564
	skip_count = dev->n;
565
	while (buffer != NULL) {
566
		REQUIRE(ISC_BUFFER_VALID(buffer));
567
		if (skip_count < isc_buffer_usedlength(buffer))
568
			break;
569
		skip_count -= isc_buffer_usedlength(buffer);
570
		buffer = ISC_LIST_NEXT(buffer, link);
571
572
573
	}

	while (buffer != NULL) {
574
		INSIST(iovcount < MAXSCATTERGATHER_SEND);
575

576
		isc_buffer_usedregion(buffer, &used);
577

578
		if (used.length > 0) {
579
580
581
582
583
			iov[iovcount].iov_base = (void *)(used.base
							  + skip_count);
			iov[iovcount].iov_len = used.length - skip_count;
			write_count += (used.length - skip_count);
			skip_count = 0;
584
585
586
587
588
			iovcount++;
		}
		buffer = ISC_LIST_NEXT(buffer, link);
	}

Michael Graff's avatar
fix    
Michael Graff committed
589
590
591
	INSIST(skip_count == 0);

 config:
592
593
594
	msg->msg_iov = iov;
	msg->msg_iovlen = iovcount;

595
596
597
598
#ifdef ISC_NET_BSD44MSGHDR
	msg->msg_control = NULL;
	msg->msg_controllen = 0;
	msg->msg_flags = 0;
599
600
601
602
#if defined(USE_CMSG)
	if ((sock->type == isc_sockettype_udp)
	    && ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0)) {
		struct cmsghdr *cmsgp;
603
604
		struct in6_pktinfo *pktinfop;

605
		socket_log(sock, NULL, TRACE,
David Lawrence's avatar
David Lawrence committed
606
607
			   "sendto pktinfo data, ifindex %u",
			   dev->pktinfo.ipi6_ifindex);
608

609
610
611
612
613
614
615
616
		msg->msg_controllen = CMSG_SPACE(sizeof(struct in6_pktinfo));
		msg->msg_control = (void *)sock->cmsg;

		cmsgp = (struct cmsghdr *)sock->cmsg;
		cmsgp->cmsg_level = IPPROTO_IPV6;
		cmsgp->cmsg_type = IPV6_PKTINFO;
		cmsgp->cmsg_len = CMSG_LEN(sizeof(struct in6_pktinfo));
		pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
617
		memcpy(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
618
619
620
	}
#endif /* USE_CMSG */
#else /* ISC_NET_BSD44MSGHDR */
621
622
	msg->msg_accrights = NULL;
	msg->msg_accrightslen = 0;
623
#endif /* ISC_NET_BSD44MSGHDR */
624
625
626

	if (write_countp != NULL)
		*write_countp = write_count;
627
628
}

Michael Graff's avatar
fix    
Michael Graff committed
629
630
631
632
633
634
635
636
637
638
639
640
/*
 * Construct an iov array and attach it to the msghdr passed in.  Return
 * 0 on success, non-zero on failure.  This is the RECV constructor, which
 * will use the avialable region of the buffer (if using a buffer list) or
 * will use the internal region (if a single buffer I/O is requested).
 *
 * Nothing can be NULL, and the done event must list at least one buffer
 * on the buffer linked list for this function to be meaningful.
 *
 * If read_countp != NULL, *read_countp will hold the number of bytes
 * this transaction can receive.
 */
Michael Graff's avatar
Michael Graff committed
641
static void
Michael Graff's avatar
fix    
Michael Graff committed
642
build_msghdr_recv(isc_socket_t *sock, isc_socketevent_t *dev,
643
		  struct msghdr *msg, struct iovec *iov, size_t *read_countp)
Michael Graff's avatar
fix    
Michael Graff committed
644
645
646
647
648
649
650
651
652
653
654
{
	unsigned int iovcount;
	isc_buffer_t *buffer;
	isc_region_t available;
	size_t read_count;

	memset(msg, 0, sizeof (struct msghdr));

	if (sock->type == isc_sockettype_udp) {
		memset(&dev->address, 0, sizeof(dev->address));
		msg->msg_name = (void *)&dev->address.type.sa;
Bob Halley's avatar
Bob Halley committed
655
		msg->msg_namelen = sizeof(dev->address.type);
656
657
658
659
660
#ifdef ISC_NET_RECVOVERFLOW
		/* If needed, steal one iovec for overflow detection. */
		maxiov--;
#endif
	} else { /* TCP */
Michael Graff's avatar
fix    
Michael Graff committed
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
		msg->msg_name = NULL;
		msg->msg_namelen = 0;
		dev->address = sock->address;
	}

	buffer = ISC_LIST_HEAD(dev->bufferlist);
	read_count = 0;

	/*
	 * Single buffer I/O?  Skip what we've done so far in this region.
	 */
	if (buffer == NULL) {
		read_count = dev->region.length - dev->n;
		iov[0].iov_base = (void *)(dev->region.base + dev->n);
		iov[0].iov_len = read_count;
676
		iovcount = 1;
Michael Graff's avatar
fix    
Michael Graff committed
677
678
679
680
681
682
683
684
685

		goto config;
	}

	/*
	 * Multibuffer I/O.
	 * Skip empty buffers.
	 */
	while (buffer != NULL) {
686
		REQUIRE(ISC_BUFFER_VALID(buffer));
687
		if (isc_buffer_availablelength(buffer) != 0)
Michael Graff's avatar
fix    
Michael Graff committed
688
689
690
691
692
693
			break;
		buffer = ISC_LIST_NEXT(buffer, link);
	}

	iovcount = 0;
	while (buffer != NULL) {
694
		INSIST(iovcount < MAXSCATTERGATHER_RECV);
Michael Graff's avatar
fix    
Michael Graff committed
695

696
		isc_buffer_availableregion(buffer, &available);
Michael Graff's avatar
fix    
Michael Graff committed
697
698
699
700
701
702
703
704
705
706

		if (available.length > 0) {
			iov[iovcount].iov_base = (void *)(available.base);
			iov[iovcount].iov_len = available.length;
			read_count += available.length;
			iovcount++;
		}
		buffer = ISC_LIST_NEXT(buffer, link);
	}

707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
 config:

	/*
	 * If needed, set up to receive that one extra byte.  Note that
	 * we know there is at least one iov left, since we stole it
	 * at the top of this function.
	 */
#ifdef ISC_NET_RECVOVERFLOW
	if (sock->type == isc_sockettype_udp) {
		iov[iovcount].iov_base = (void *)(&sock->overflow);
		iov[iovcount].iov_len = 1;
		iovcount++;
	}
#endif

Michael Graff's avatar
fix    
Michael Graff committed
722
723
724
725
726
727
	msg->msg_iov = iov;
	msg->msg_iovlen = iovcount;

#ifdef ISC_NET_BSD44MSGHDR
	msg->msg_control = NULL;
	msg->msg_controllen = 0;
728
729
	msg->msg_flags = 0;
#if defined(USE_CMSG)
Michael Graff's avatar
Michael Graff committed
730
	if (sock->type == isc_sockettype_udp) {
731
732
		msg->msg_control = (void *)sock->cmsg;
		msg->msg_controllen = sock->cmsglen;
Michael Graff's avatar
Michael Graff committed
733
	}
734
735
#endif /* USE_CMSG */
#else /* ISC_NET_BSD44MSGHDR */
Michael Graff's avatar
fix    
Michael Graff committed
736
737
	msg->msg_accrights = NULL;
	msg->msg_accrightslen = 0;
738
#endif /* ISC_NET_BSD44MSGHDR */
Michael Graff's avatar
fix    
Michael Graff committed
739
740
741
742
743

	if (read_countp != NULL)
		*read_countp = read_count;
}

744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
/*
 * Fill in dev->address for a request on 'sock'.
 *
 * UDP: use 'address' when one is supplied, otherwise the socket's own
 * address.  TCP: 'address' must be NULL and the socket's address is
 * used.  Any other socket type leaves dev->address untouched.
 */
static void
set_dev_address(isc_sockaddr_t *address, isc_socket_t *sock,
		isc_socketevent_t *dev)
{
	isc_sockaddr_t *source;

	if (sock->type == isc_sockettype_udp) {
		source = (address != NULL) ? address : &sock->address;
	} else if (sock->type == isc_sockettype_tcp) {
		INSIST(address == NULL);
		source = &sock->address;
	} else {
		return;
	}

	dev->address = *source;
}

759
760
static isc_socketevent_t *
allocate_socketevent(isc_socket_t *sock, isc_eventtype_t eventtype,
David Lawrence's avatar
David Lawrence committed
761
		     isc_taskaction_t action, const void *arg)
762
763
764
765
766
767
768
769
770
771
772
773
{
	isc_socketevent_t *ev;

	ev = (isc_socketevent_t *)isc_event_allocate(sock->manager->mctx,
						     sock, eventtype,
						     action, arg,
						     sizeof (*ev));

	if (ev == NULL)
		return (NULL);

	ev->result = ISC_R_UNEXPECTED;
774
	ISC_LINK_INIT(ev, ev_link);
775
776
	ISC_LIST_INIT(ev->bufferlist);
	ev->region.base = NULL;
Michael Graff's avatar
fix    
Michael Graff committed
777
778
	ev->n = 0;
	ev->offset = 0;
779
	ev->attributes = 0;
780
781
782
783

	return (ev);
}

784
785
#if defined(ISC_SOCKET_DEBUG)
/*
 * Debugging aid: print the contents of a msghdr, including every iovec
 * it references, to standard output.
 *
 * Length fields have platform-dependent integer types, so each is cast
 * to unsigned long to match its conversion specifier, and pointers are
 * cast to void * as %p requires.
 */
static void
dump_msg(struct msghdr *msg) {
	unsigned int i;

	printf("MSGHDR %p\n", (void *)msg);
	printf("\tname %p, namelen %lu\n", (void *)msg->msg_name,
	       (unsigned long)msg->msg_namelen);
	printf("\tiov %p, iovlen %lu\n", (void *)msg->msg_iov,
	       (unsigned long)msg->msg_iovlen);
	for (i = 0 ; i < (unsigned int)msg->msg_iovlen ; i++)
		printf("\t\t%u\tbase %p, len %lu\n", i,
		       (void *)msg->msg_iov[i].iov_base,
		       (unsigned long)msg->msg_iov[i].iov_len);
#ifdef ISC_NET_BSD44MSGHDR
	printf("\tcontrol %p, controllen %lu\n", (void *)msg->msg_control,
	       (unsigned long)msg->msg_controllen);
#endif
}
#endif

Michael Graff's avatar
Michael Graff committed
803
804
805
806
807
#define DOIO_SUCCESS		0	/* i/o ok, event sent */
#define DOIO_SOFT		1	/* i/o ok, soft error, no event sent */
#define DOIO_HARD		2	/* i/o error, event sent */
#define DOIO_EOF		3	/* EOF, no event sent */

808
static int
809
doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) {
810
	int cc;
811
	struct iovec iov[MAXSCATTERGATHER_RECV];
812
	size_t read_count;
813
	size_t actual_count;
814
	struct msghdr msghdr;
815
	isc_buffer_t *buffer;
816

817
	build_msghdr_recv(sock, dev, &msghdr, iov, &read_count);
818
819
820
821

#if defined(ISC_SOCKET_DEBUG)
	dump_msg(&msghdr);
#endif
822
823
824
825
826
827
828

	cc = recvmsg(sock->fd, &msghdr, 0);

	if (cc < 0) {
		if (SOFT_ERROR(errno))
			return (DOIO_SOFT);

829
830
831
832
		if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL))
			socket_log(sock, NULL, IOEVENT,
				   "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
				   sock->fd, cc, errno, strerror(errno));
833

834
835
836
837
838
839
#define SOFT_OR_HARD(_system, _isc) \
	if (errno == _system) { \
		if (sock->connected) { \
			if (sock->type == isc_sockettype_tcp) \
				sock->recv_result = _isc; \
			send_recvdone_event(sock, &dev, _isc); \
Michael Graff's avatar
Michael Graff committed
840
			return (DOIO_HARD); \
841
		} \
Michael Graff's avatar
Michael Graff committed
842
		return (DOIO_SOFT); \
843
	}
Michael Graff's avatar
Michael Graff committed
844
845
846
847
848
849
#define ALWAYS_HARD(_system, _isc) \
	if (errno == _system) { \
		sock->recv_result = _isc; \
		send_recvdone_event(sock, &dev, _isc); \
		return (DOIO_HARD); \
	}
850
851

		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
852
853
		ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
		ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
Michael Graff's avatar
Michael Graff committed
854
		ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
855

Michael Graff's avatar
Michael Graff committed
856
857
#undef SOFT_OR_HARD
#undef ALWAYS_HARD
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873

		sock->recv_result = ISC_R_UNEXPECTED;
		send_recvdone_event(sock, &dev, ISC_R_UNEXPECTED);
		return (DOIO_SUCCESS);
	}

	/*
	 * On TCP, zero length reads indicate EOF, while on
	 * UDP, zero length reads are perfectly valid, although
	 * strange.
	 */
	if ((sock->type == isc_sockettype_tcp) && (cc == 0)) {
		sock->recv_result = ISC_R_EOF;
		return (DOIO_EOF);
	}

Michael Graff's avatar
Michael Graff committed
874
875
876
	if (sock->type == isc_sockettype_udp)
		dev->address.length = msghdr.msg_namelen;

Michael Graff's avatar
Michael Graff committed
877
878
	socket_log(sock, &dev->address, IOEVENT, "packet received correctly");

879
880
881
882
883
884
885
886
887
888
889
890
	/*
	 * Overflow bit detection.  If we received MORE bytes than we should,
	 * this indicates an overflow situation.  Set the flag in the
	 * dev entry and adjust how much we read by one.
	 */
#ifdef ISC_NET_RECVOVERFLOW
	if ((sock->type == isc_sockettype_udp) && ((size_t)cc > read_count)) {
		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
		cc--;
	}
#endif

891
892
893
894
	/*
	 * If there are control messages attached, run through them and pull
	 * out the interesting bits.
	 */
Michael Graff's avatar
Michael Graff committed
895
896
	if (sock->type == isc_sockettype_udp)
		process_cmsg(sock, &msghdr, dev);
897

898
899
900
901
902
903
904
	/*
	 * Update the buffers (if any) and the I/O count.
	 */
	dev->n += cc;
	actual_count = cc;
	buffer = ISC_LIST_HEAD(dev->bufferlist);
	while (buffer != NULL && actual_count > 0) {
905
		REQUIRE(ISC_BUFFER_VALID(buffer));
906
907
		if (isc_buffer_availablelength(buffer) <= actual_count) {
			actual_count -= isc_buffer_availablelength(buffer);
908
			isc_buffer_add(buffer,
909
				       isc_buffer_availablelength(buffer));
910
911
912
913
914
915
916
917
918
919
920
		} else {
			isc_buffer_add(buffer, actual_count);
			actual_count = 0;
			break;
		}
		buffer = ISC_LIST_NEXT(buffer, link);
		if (buffer == NULL) {
			INSIST(actual_count == 0);
		}
	}

921
922
923
924
925
926
927
928
	/*
	 * If we read less than we expected and have not yet reached the
	 * requested minimum, return a soft result and let the upper layer
	 * poke the descriptor.  The counters were already updated above.
	 */
	if (((size_t)cc != read_count) && (dev->n < dev->minimum))
		return (DOIO_SOFT);

	/*
Andreas Gustafsson's avatar
Andreas Gustafsson committed
929
	 * Full reads are posted, or partials if partials are ok.
930
931
	 */
	send_recvdone_event(sock, &dev, ISC_R_SUCCESS);
Michael Graff's avatar
Michael Graff committed
932
933
934
	return (DOIO_SUCCESS);
}

Andreas Gustafsson's avatar
Andreas Gustafsson committed
935
936
937
938
939
940
941
942
943
944
945
946
947
/*
 * Returns:
 *	DOIO_SUCCESS	The operation succeeded.  The senddone event
 *			was sent.
 *
 *	DOIO_HARD	A hard or unexpected I/O error was encountered.
 *			The senddone event was sent.
 *
 *	DOIO_SOFT	A soft I/O error was encountered.  No senddone
 *			event was sent.  The operation should be retried.
 *
 *	No other return values are possible.
 */
Michael Graff's avatar
Michael Graff committed
948
static int
949
doio_send(isc_socket_t *sock, isc_socketevent_t *dev) {
Michael Graff's avatar
Michael Graff committed
950
	int cc;
951
	struct iovec iov[MAXSCATTERGATHER_SEND];
Michael Graff's avatar
Michael Graff committed
952
953
	size_t write_count;
	struct msghdr msghdr;
954
	char addrbuf[ISC_SOCKADDR_FORMATSIZE];
Michael Graff's avatar
Michael Graff committed
955

956
	build_msghdr_send(sock, dev, &msghdr, iov, &write_count);
Michael Graff's avatar
Michael Graff committed
957
958
959
960

	cc = sendmsg(sock->fd, &msghdr, 0);

	/*
Andreas Gustafsson's avatar
Andreas Gustafsson committed
961
	 * Check for error or block condition.
Michael Graff's avatar
Michael Graff committed
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
	 */
	if (cc < 0) {
		if (SOFT_ERROR(errno))
			return (DOIO_SOFT);

/*
 * SOFT_OR_HARD(_system, _isc): if errno equals _system, the outcome
 * depends on sock->connected.  On a connected socket, call
 * send_senddone_event() with _isc and return DOIO_HARD from the
 * enclosing function; for isc_sockettype_tcp sockets _isc is also
 * saved in sock->send_result first.  On an unconnected socket, return
 * DOIO_SOFT without sending an event.  Expands to a bare if statement
 * that relies on the sock and dev variables of the caller.
 */
#define SOFT_OR_HARD(_system, _isc) \
	if (errno == _system) { \
		if (sock->connected) { \
			if (sock->type == isc_sockettype_tcp) \
				sock->send_result = _isc; \
			send_senddone_event(sock, &dev, _isc); \
			return (DOIO_HARD); \
		} \
		return (DOIO_SOFT); \
	}
Michael Graff's avatar
Michael Graff committed
977
978
/*
 * ALWAYS_HARD(_system, _isc): if errno equals _system, call
 * send_senddone_event() with _isc and return DOIO_HARD from the
 * enclosing function.  _isc is saved in sock->send_result first only
 * when the socket is connected and of type isc_sockettype_tcp.
 * Expands to a bare if statement that relies on the sock and dev
 * variables of the caller.
 */
#define ALWAYS_HARD(_system, _isc) \
	if (errno == _system) { \
979
980
		if (sock->connected && sock->type == isc_sockettype_tcp) \
			sock->send_result = _isc; \
Michael Graff's avatar
Michael Graff committed
981
982
983
		send_senddone_event(sock, &dev, _isc); \
		return (DOIO_HARD); \
	}
Michael Graff's avatar
Michael Graff committed
984
985

		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
986
987
		ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
		ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
Michael Graff's avatar
Michael Graff committed
988
		ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
989
		ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
990
		ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
Michael Graff's avatar
Michael Graff committed
991

Michael Graff's avatar
Michael Graff committed
992
993
#undef SOFT_OR_HARD
#undef ALWAYS_HARD
Michael Graff's avatar
Michael Graff committed
994
995
996
997
998
999
1000

		/*
		 * The other error types depend on whether or not the
		 * socket is UDP or TCP.  If it is UDP, some errors
		 * that we expect to be fatal under TCP are merely
		 * annoying, and are really soft errors.
		 *