socket.c 72.4 KB
Newer Older
Bob Halley's avatar
Bob Halley committed
1
/*
Bob Halley's avatar
Bob Halley committed
2
 * Copyright (C) 1998, 1999, 2000  Internet Software Consortium.
Bob Halley's avatar
Bob Halley committed
3
4
5
6
7
8
9
10
11
12
13
14
15
16
 * 
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS" AND INTERNET SOFTWARE CONSORTIUM DISCLAIMS
 * ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL INTERNET SOFTWARE
 * CONSORTIUM BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
 * DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
 * PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 */
Bob Halley's avatar
Bob Halley committed
17
18

#include <config.h>
19

20
#include <sys/param.h>
Michael Graff's avatar
Michael Graff committed
21
#include <sys/types.h>
Michael Graff's avatar
Michael Graff committed
22
23
#include <sys/socket.h>
#include <sys/time.h>
Michael Graff's avatar
Michael Graff committed
24
25
#include <sys/uio.h>

26
27
28
29
30
#include <errno.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
Michael Graff's avatar
Michael Graff committed
31
#include <fcntl.h>
32
33

#include <isc/assertions.h>
34
35
#include <isc/buffer.h>
#include <isc/condition.h>
36
#include <isc/error.h>
37
#include <isc/list.h>
38
#include <isc/mutex.h>
39
#include <isc/net.h>
40
#include <isc/region.h>
41
#include <isc/socket.h>
42
#include <isc/thread.h>
Michael Graff's avatar
Michael Graff committed
43
#include <isc/util.h>
Bob Halley's avatar
Bob Halley committed
44

45
46
/*
 * Some systems define the socket length argument as an int, some as size_t,
 * some as socklen_t.  The type is isolated here so that it can be overridden
 * (by defining ISC_SOCKADDR_LEN_T before this point) if needed.
 */
#if !defined(ISC_SOCKADDR_LEN_T)
#define ISC_SOCKADDR_LEN_T unsigned int
#endif

/*
 * The set of "soft" errors: non-fatal returns from network related
 * functions such as recv().
 *
 * For some reason, BSDI (and perhaps others) will sometimes return <0
 * from recv() but will have errno==0.  This is broken, but it has to be
 * worked around here, so an errno of zero also counts as soft.
 *
 * Note that the argument is evaluated more than once.
 */
#define SOFT_ERROR(e)	((e) == 0 || \
			 (e) == EINTR || \
			 (e) == EAGAIN || \
			 (e) == EWOULDBLOCK)
65

Michael Graff's avatar
Michael Graff committed
66
/*
 * Debug tracing.  Change the "#if 0" to "#if 1" to compile the tracing
 * macros in; otherwise XTRACE(), XENTER() and XEXIT() expand to nothing.
 */
#if 0
#define ISC_SOCKET_DEBUG
#endif

#ifdef ISC_SOCKET_DEBUG

/* Trace categories.  A message is printed when its bit is in trace_level. */
#define TRACE_WATCHER	0x0001
#define TRACE_LISTEN	0x0002
#define TRACE_CONNECT	0x0004
#define TRACE_RECV	0x0008
#define TRACE_SEND    	0x0010
#define TRACE_MANAGER	0x0020

int trace_level = TRACE_RECV;

/*
 * XTRACE: 'l' is a trace category and 'a' is a complete, parenthesized
 * printf() argument list.  Output goes to stdout, prefixed with the
 * source file and line.
 */
#define XTRACE(l, a)	do {						\
				if (((l) & trace_level) != 0) {		\
					printf("[%s:%d] ", __FILE__, __LINE__); \
					printf a;			\
					fflush(stdout);			\
				}					\
			} while (0)

/* XENTER/XEXIT: note entry to / exit from the named routine on stderr. */
#define XENTER(l, a)	do {						\
				if (((l) & trace_level) != 0) {		\
					fprintf(stderr, "ENTER %s\n", (a)); \
				}					\
			} while (0)
#define XEXIT(l, a)	do {						\
				if (((l) & trace_level) != 0) {		\
					fprintf(stderr, "EXIT %s\n", (a)); \
				}					\
			} while (0)

#else /* ISC_SOCKET_DEBUG */

#define XTRACE(l, a)
#define XENTER(l, a)
#define XEXIT(l, a)

#endif /* ISC_SOCKET_DEBUG */

100
/* Type of the internal readable/writable events embedded in each socket. */
typedef isc_event_t intev_t;

/* Magic number marking a valid isc_socket_t, and the matching check. */
#define SOCKET_MAGIC		0x494f696fU	/* IOio */
#define VALID_SOCKET(t)		((t) != NULL && \
				 (t)->magic == SOCKET_MAGIC)
Michael Graff's avatar
Michael Graff committed
104

Michael Graff's avatar
Michael Graff committed
105
106
107
108
109
110
111
112
113
114
115
116
/*
 * IPv6 control information.  If the socket is an IPv6 socket we want
 * to collect the destination address and interface so the client can
 * set them on outgoing packets.
 */
#if defined(ISC_PLATFORM_HAVEIPV6) && !defined(USE_CMSG)
#define USE_CMSG	1
#endif

/*
 * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
 * a setsockopt() like interface to request timestamps, and if the OS
 * doesn't do it for us, call gettimeofday() on every UDP receive?
 */
#if defined(SO_TIMESTAMP) && !defined(USE_CMSG)
#define USE_CMSG	1
#endif

/*
 * Check to see if we have even basic support for cracking messages from
 * the control data returned from/sent via recvmsg()/sendmsg().  Without
 * both CMSG_LEN and CMSG_SPACE, control message support is turned off.
 */
#if defined(USE_CMSG)
#if !defined(CMSG_LEN) || !defined(CMSG_SPACE)
#undef USE_CMSG
#endif
#endif

135
136
struct isc_socket {
	/* Not locked. */
137
138
139
140
	unsigned int			magic;
	isc_socketmgr_t		       *manager;
	isc_mutex_t			lock;
	isc_sockettype_t		type;
Michael Graff's avatar
Michael Graff committed
141

142
	/* Locked by socket lock. */
143
144
145
146
	unsigned int			references;
	int				fd;
	isc_result_t			recv_result;
	isc_result_t			send_result;
147

148
	ISC_LIST(isc_socketevent_t)		send_list;
149
	ISC_LIST(isc_socketevent_t)		recv_list;
150
	ISC_LIST(isc_socket_newconnev_t)	accept_list;
151
152
153
154
155
156
157
	isc_socket_connev_t		       *connect_ev;

	/*
	 * Internal events.  Posted when a descriptor is readable or
	 * writable.  These are statically allocated and never freed.
	 * They will be set to non-purgable before use.
	 */
158
	intev_t				readable_ev;
159
	intev_t				writable_ev;
160

161
162
	isc_sockaddr_t			address;  /* remote address */

Michael Graff's avatar
Michael Graff committed
163
164
165
166
167
168
	unsigned int			pending_recv : 1,
					pending_send : 1,
					pending_accept : 1,
					listener : 1, /* listener socket */
					connected : 1,
					connecting : 1; /* connect pending */
169

170
171
172
#ifdef ISC_NET_RECVOVERFLOW
	unsigned char			overflow; /* used for MSG_TRUNC fake */
#endif
Michael Graff's avatar
Michael Graff committed
173
#ifdef USE_CMSG
174
175
	unsigned char		       *cmsg;
	unsigned int			cmsglen;
176
#endif
177
178
179
180
181
182
183
184
};

#define SOCKET_MANAGER_MAGIC		0x494f6d67U	/* IOmg */
#define VALID_MANAGER(m)		((m) != NULL && \
					 (m)->magic == SOCKET_MANAGER_MAGIC)
struct isc_socketmgr {
	/* Not locked. */
	unsigned int			magic;
185
	isc_mem_t		       *mctx;
186
187
	isc_mutex_t			lock;
	/* Locked by manager lock. */
Michael Graff's avatar
Michael Graff committed
188
	unsigned int			nsockets;  /* sockets managed */
Michael Graff's avatar
Michael Graff committed
189
	isc_thread_t			watcher;
190
	isc_condition_t			shutdown_ok;
Michael Graff's avatar
Michael Graff committed
191
	fd_set				read_fds;
192
	fd_set				write_fds;
193
	isc_socket_t		       *fds[FD_SETSIZE];
Michael Graff's avatar
Michael Graff committed
194
	int				fdstate[FD_SETSIZE];
195
	int				maxfd;
Michael Graff's avatar
Michael Graff committed
196
	int				pipe_fds[2];
197
198
};

Michael Graff's avatar
Michael Graff committed
199
200
201
202
/*
 * Values stored in the manager's fdstate[] array.
 */
#define CLOSED		0	/* this one must be zero */
#define MANAGED		1
#define CLOSE_PENDING	2

/*
 * send() and recv() iovec counts.  When ISC_NET_RECVOVERFLOW is defined
 * the receive side gets one additional iovec.
 */
#define MAXSCATTERGATHER_SEND	(ISC_SOCKET_MAXSCATTERGATHER)
#if defined(ISC_NET_RECVOVERFLOW)
# define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER + 1)
#else
# define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER)
#endif

213
214
/*
 * Forward declarations of file-local routines.
 */
static void send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev,
				isc_result_t resultcode);
static void send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev,
				isc_result_t resultcode);
static void free_socket(isc_socket_t **socketp);
static isc_result_t allocate_socket(isc_socketmgr_t *manager,
				    isc_sockettype_t type,
				    isc_socket_t **socketp);
static void destroy(isc_socket_t **sockp);
static void internal_accept(isc_task_t *task, isc_event_t *ev);
static void internal_connect(isc_task_t *task, isc_event_t *ev);
static void internal_recv(isc_task_t *task, isc_event_t *ev);
static void internal_send(isc_task_t *task, isc_event_t *ev);
static void process_cmsg(isc_socket_t *sock, struct msghdr *msg,
			 isc_socketevent_t *dev);
static void build_msghdr_send(isc_socket_t *sock, isc_socketevent_t *dev,
			      struct msghdr *msg, struct iovec *iov,
			      unsigned int maxiov, size_t *write_countp);
static void build_msghdr_recv(isc_socket_t *sock, isc_socketevent_t *dev,
			      struct msghdr *msg, struct iovec *iov,
			      unsigned int maxiov, size_t *read_countp);

/*
 * Special messages for the watcher's internal pipe.  Any other value
 * written by select_poke() is a file descriptor number.
 */
#define SELECT_POKE_SHUTDOWN		(-1)
#define SELECT_POKE_NOTHING		(-2)

/* A socket with no remaining references is dead. */
#define SOCK_DEAD(s)			((s)->references == 0)

238
/*
Michael Graff's avatar
Michael Graff committed
239
240
241
 * Poke the select loop when there is something for us to do.
 * We assume that if a write completes here, it will be inserted into the
 * queue fully.  That is, we will not get partial writes.
242
243
 */
static void
Bob Halley's avatar
Bob Halley committed
244
select_poke(isc_socketmgr_t *mgr, int msg)
245
{
Michael Graff's avatar
Michael Graff committed
246
247
	int cc;

248
249
250
251
252
	do {
		cc = write(mgr->pipe_fds[1], &msg, sizeof(int));
	} while (cc < 0 && SOFT_ERROR(errno));

	if (cc < 0)
Michael Graff's avatar
Michael Graff committed
253
254
255
		FATAL_ERROR(__FILE__, __LINE__,
			    "write() failed during watcher poke: %s",
			    strerror(errno));
256
257

	INSIST(cc == sizeof(int));
258
259
260
261
262
}

/*
 * read a message on the internal fd.
 */
Michael Graff's avatar
Michael Graff committed
263
static int
Bob Halley's avatar
Bob Halley committed
264
select_readmsg(isc_socketmgr_t *mgr)
265
{
Michael Graff's avatar
Michael Graff committed
266
	int msg;
Michael Graff's avatar
Michael Graff committed
267
268
	int cc;

Michael Graff's avatar
Michael Graff committed
269
	cc = read(mgr->pipe_fds[0], &msg, sizeof(int));
Michael Graff's avatar
Michael Graff committed
270
	if (cc < 0) {
Michael Graff's avatar
Michael Graff committed
271
		if (SOFT_ERROR(errno))
Michael Graff's avatar
Michael Graff committed
272
			return (SELECT_POKE_NOTHING);
Michael Graff's avatar
Michael Graff committed
273

Michael Graff's avatar
Michael Graff committed
274
275
276
277
		FATAL_ERROR(__FILE__, __LINE__,
			    "read() failed during watcher poke: %s",
			    strerror(errno));

Michael Graff's avatar
Michael Graff committed
278
		return (SELECT_POKE_NOTHING);
Michael Graff's avatar
Michael Graff committed
279
	}
280

Michael Graff's avatar
Michael Graff committed
281
	return (msg);
282
283
284
}

/*
Michael Graff's avatar
Michael Graff committed
285
 * Make a fd non-blocking
286
 */
Michael Graff's avatar
Michael Graff committed
287
288
static isc_result_t
make_nonblock(int fd)
289
{
Michael Graff's avatar
Michael Graff committed
290
291
	int ret;
	int flags;
292

Michael Graff's avatar
Michael Graff committed
293
294
295
	flags = fcntl(fd, F_GETFL, 0);
	flags |= O_NONBLOCK;
	ret = fcntl(fd, F_SETFL, flags);
296

Michael Graff's avatar
Michael Graff committed
297
298
299
300
	if (ret == -1) {
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "fcntl(%d, F_SETFL, %d): %s",
				 fd, flags, strerror(errno));
Michael Graff's avatar
Michael Graff committed
301

Michael Graff's avatar
Michael Graff committed
302
		return (ISC_R_UNEXPECTED);
Michael Graff's avatar
Michael Graff committed
303
304
	}

Michael Graff's avatar
Michael Graff committed
305
	return (ISC_R_SUCCESS);
306
307
}

308
309
310
311
312
313
/*
 * Process control messages received on a socket.
 *
 * On systems with the 4.4BSD msghdr layout, the MSG_TRUNC and MSG_CTRUNC
 * flags returned by recvmsg() are copied into dev->attributes.  When
 * control message support is compiled in, the control data is then
 * scanned for an IPv6 packet info record and/or a receive timestamp,
 * which are stored in 'dev' along with the matching attribute bits.
 *
 * 'sock' is not used.
 */
static void
process_cmsg(isc_socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev)
{
#ifdef USE_CMSG
	struct cmsghdr *cmsgp;
#endif

	UNUSED(sock);

#ifdef ISC_NET_BSD44MSGHDR

#ifdef MSG_TRUNC
	if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC)
		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
#endif

#ifdef MSG_CTRUNC
	if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC)
		dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
#endif

#ifdef USE_CMSG
	/* Nothing more to do if no control data came back. */
	if (msg->msg_controllen == 0 || msg->msg_control == NULL)
		return;

	for (cmsgp = CMSG_FIRSTHDR(msg);
	     cmsgp != NULL;
	     cmsgp = CMSG_NXTHDR(msg, cmsgp)) {
		XTRACE(TRACE_RECV, ("Processing cmsg %p\n", cmsgp));

#ifdef ISC_PLATFORM_HAVEIPV6
		if (cmsgp->cmsg_level == IPPROTO_IPV6
		    && cmsgp->cmsg_type == IPV6_PKTINFO) {
			memcpy(&dev->pktinfo, CMSG_DATA(cmsgp),
			       sizeof(struct in6_pktinfo));
			dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
			continue;
		}
#endif

#ifdef SO_TIMESTAMP
		if (cmsgp->cmsg_level == SOL_SOCKET
		    && cmsgp->cmsg_type == SCM_TIMESTAMP) {
			struct timeval *stamp;

			stamp = (struct timeval *)CMSG_DATA(cmsgp);
			dev->timestamp.seconds = stamp->tv_sec;
			/* Microseconds to nanoseconds. */
			dev->timestamp.nanoseconds = stamp->tv_usec * 1000;
			dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
			continue;
		}
#endif
	}
#endif /* USE_CMSG */

#endif /* ISC_NET_BSD44MSGHDR */

}

385
386
387
/*
 * Construct an iov array and attach it to the msghdr passed in.  Return
 * 0 on success, non-zero on failure.  This is the SEND constructor, which
388
389
 * will used the used region of the buffer (if using a buffer list) or
 * will use the internal region (if a single buffer I/O is requested).
390
391
392
 *
 * Nothing can be NULL, and the done event must list at least one buffer
 * on the buffer linked list for this function to be meaningful.
Michael Graff's avatar
fix    
Michael Graff committed
393
394
395
 *
 * If write_countp != NULL, *write_countp will hold the number of bytes
 * this transaction can send.
396
 */
Michael Graff's avatar
Michael Graff committed
397
static void
398
399
400
build_msghdr_send(isc_socket_t *sock, isc_socketevent_t *dev,
		  struct msghdr *msg, struct iovec *iov, unsigned int maxiov,
		  size_t *write_countp)
401
402
403
404
{
	unsigned int iovcount;
	isc_buffer_t *buffer;
	isc_region_t used;
405
406
407
	size_t write_count;
	size_t skip_count;

Michael Graff's avatar
fix    
Michael Graff committed
408
	memset(msg, 0, sizeof (*msg));
409
410
411
412

	if (sock->type == isc_sockettype_udp) {
		msg->msg_name = (void *)&dev->address.type.sa;
		msg->msg_namelen = dev->address.length;
Michael Graff's avatar
fix    
Michael Graff committed
413
414
415
	} else {
		msg->msg_name = NULL;
		msg->msg_namelen = 0;
416
	}
417
418

	buffer = ISC_LIST_HEAD(dev->bufferlist);
419
	write_count = 0;
Michael Graff's avatar
fix    
Michael Graff committed
420
	iovcount = 0;
421

422
	/*
423
	 * Single buffer I/O?  Skip what we've done so far in this region.
424
425
	 */
	if (buffer == NULL) {
426
427
428
		write_count = dev->region.length - dev->n;
		iov[0].iov_base = (void *)(dev->region.base + dev->n);
		iov[0].iov_len = write_count;
Michael Graff's avatar
fix    
Michael Graff committed
429
		iovcount = 1;
430

431
432
433
434
435
436
437
		goto config;
	}

	/*
	 * Multibuffer I/O.
	 * Skip the data in the buffer list that we have already written.
	 */
Michael Graff's avatar
fix    
Michael Graff committed
438
	skip_count = dev->n;
439
	while (buffer != NULL) {
440
441
		REQUIRE(ISC_BUFFER_VALID(buffer));
		if (skip_count < ISC_BUFFER_USEDCOUNT(buffer))
442
			break;
443
		skip_count -= ISC_BUFFER_USEDCOUNT(buffer);
444
		buffer = ISC_LIST_NEXT(buffer, link);
445
446
447
	}

	while (buffer != NULL) {
Michael Graff's avatar
Michael Graff committed
448
		INSIST(iovcount < maxiov);
449
450

		isc_buffer_used(buffer, &used);
451

452
		if (used.length > 0) {
453
454
455
456
457
			iov[iovcount].iov_base = (void *)(used.base
							  + skip_count);
			iov[iovcount].iov_len = used.length - skip_count;
			write_count += (used.length - skip_count);
			skip_count = 0;
458
459
460
461
462
			iovcount++;
		}
		buffer = ISC_LIST_NEXT(buffer, link);
	}

Michael Graff's avatar
fix    
Michael Graff committed
463
464
465
	INSIST(skip_count == 0);

 config:
466
467
468
	msg->msg_iov = iov;
	msg->msg_iovlen = iovcount;

469
470
471
472
#ifdef ISC_NET_BSD44MSGHDR
	msg->msg_control = NULL;
	msg->msg_controllen = 0;
	msg->msg_flags = 0;
473
474
475
476
477
478
479
480
481
482
483
484
485
486
#if defined(USE_CMSG)
	if ((sock->type == isc_sockettype_udp)
	    && ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0)) {
		struct cmsghdr *cmsgp;
		struct in6_pktinfo *pktinfop;

		msg->msg_controllen = CMSG_SPACE(sizeof(struct in6_pktinfo));
		msg->msg_control = (void *)sock->cmsg;

		cmsgp = (struct cmsghdr *)sock->cmsg;
		cmsgp->cmsg_level = IPPROTO_IPV6;
		cmsgp->cmsg_type = IPV6_PKTINFO;
		cmsgp->cmsg_len = CMSG_LEN(sizeof(struct in6_pktinfo));
		pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
487
		memcpy(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
488
489
490
	}
#endif /* USE_CMSG */
#else /* ISC_NET_BSD44MSGHDR */
491
492
	msg->msg_accrights = NULL;
	msg->msg_accrightslen = 0;
493
#endif /* ISC_NET_BSD44MSGHDR */
494
495
496

	if (write_countp != NULL)
		*write_countp = write_count;
497
498
}

Michael Graff's avatar
fix    
Michael Graff committed
499
500
501
502
503
504
505
506
507
508
509
510
/*
 * Construct an iov array and attach it to the msghdr passed in.  Return
 * 0 on success, non-zero on failure.  This is the RECV constructor, which
 * will use the avialable region of the buffer (if using a buffer list) or
 * will use the internal region (if a single buffer I/O is requested).
 *
 * Nothing can be NULL, and the done event must list at least one buffer
 * on the buffer linked list for this function to be meaningful.
 *
 * If read_countp != NULL, *read_countp will hold the number of bytes
 * this transaction can receive.
 */
Michael Graff's avatar
Michael Graff committed
511
static void
Michael Graff's avatar
fix    
Michael Graff committed
512
513
514
515
516
517
518
519
520
521
522
523
524
525
build_msghdr_recv(isc_socket_t *sock, isc_socketevent_t *dev,
		  struct msghdr *msg, struct iovec *iov, unsigned int maxiov,
		  size_t *read_countp)
{
	unsigned int iovcount;
	isc_buffer_t *buffer;
	isc_region_t available;
	size_t read_count;

	memset(msg, 0, sizeof (struct msghdr));

	if (sock->type == isc_sockettype_udp) {
		memset(&dev->address, 0, sizeof(dev->address));
		msg->msg_name = (void *)&dev->address.type.sa;
Bob Halley's avatar
Bob Halley committed
526
		msg->msg_namelen = sizeof(dev->address.type);
527
528
529
530
531
#ifdef ISC_NET_RECVOVERFLOW
		/* If needed, steal one iovec for overflow detection. */
		maxiov--;
#endif
	} else { /* TCP */
Michael Graff's avatar
fix    
Michael Graff committed
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
		msg->msg_name = NULL;
		msg->msg_namelen = 0;
		dev->address = sock->address;
	}

	buffer = ISC_LIST_HEAD(dev->bufferlist);
	read_count = 0;

	/*
	 * Single buffer I/O?  Skip what we've done so far in this region.
	 */
	if (buffer == NULL) {
		read_count = dev->region.length - dev->n;
		iov[0].iov_base = (void *)(dev->region.base + dev->n);
		iov[0].iov_len = read_count;
547
		iovcount = 1;
Michael Graff's avatar
fix    
Michael Graff committed
548
549
550
551
552
553
554
555
556

		goto config;
	}

	/*
	 * Multibuffer I/O.
	 * Skip empty buffers.
	 */
	while (buffer != NULL) {
557
558
		REQUIRE(ISC_BUFFER_VALID(buffer));
		if (ISC_BUFFER_AVAILABLECOUNT(buffer) != 0)
Michael Graff's avatar
fix    
Michael Graff committed
559
560
561
562
563
564
			break;
		buffer = ISC_LIST_NEXT(buffer, link);
	}

	iovcount = 0;
	while (buffer != NULL) {
Michael Graff's avatar
Michael Graff committed
565
		INSIST(iovcount < maxiov);
Michael Graff's avatar
fix    
Michael Graff committed
566
567
568
569
570
571
572
573
574
575
576
577

		isc_buffer_available(buffer, &available);

		if (available.length > 0) {
			iov[iovcount].iov_base = (void *)(available.base);
			iov[iovcount].iov_len = available.length;
			read_count += available.length;
			iovcount++;
		}
		buffer = ISC_LIST_NEXT(buffer, link);
	}

578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
 config:

	/*
	 * If needed, set up to receive that one extra byte.  Note that
	 * we know there is at least one iov left, since we stole it
	 * at the top of this function.
	 */
#ifdef ISC_NET_RECVOVERFLOW
	if (sock->type == isc_sockettype_udp) {
		iov[iovcount].iov_base = (void *)(&sock->overflow);
		iov[iovcount].iov_len = 1;
		iovcount++;
	}
#endif

Michael Graff's avatar
fix    
Michael Graff committed
593
594
595
596
597
598
	msg->msg_iov = iov;
	msg->msg_iovlen = iovcount;

#ifdef ISC_NET_BSD44MSGHDR
	msg->msg_control = NULL;
	msg->msg_controllen = 0;
599
600
	msg->msg_flags = 0;
#if defined(USE_CMSG)
Michael Graff's avatar
Michael Graff committed
601
	if (sock->type == isc_sockettype_udp) {
602
603
		msg->msg_control = (void *)sock->cmsg;
		msg->msg_controllen = sock->cmsglen;
Michael Graff's avatar
Michael Graff committed
604
	}
605
606
#endif /* USE_CMSG */
#else /* ISC_NET_BSD44MSGHDR */
Michael Graff's avatar
fix    
Michael Graff committed
607
608
	msg->msg_accrights = NULL;
	msg->msg_accrightslen = 0;
609
#endif /* ISC_NET_BSD44MSGHDR */
Michael Graff's avatar
fix    
Michael Graff committed
610
611
612
613
614

	if (read_countp != NULL)
		*read_countp = read_count;
}

615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
/*
 * Fill in dev->address for an event on 'sock'.
 *
 * UDP: use 'address' if the caller supplied one, otherwise the address
 * stored in the socket.  TCP: 'address' must be NULL and the socket's
 * stored address is always used.  Other socket types leave dev->address
 * untouched.
 */
static void
set_dev_address(isc_sockaddr_t *address, isc_socket_t *sock,
		isc_socketevent_t *dev)
{
	if (sock->type == isc_sockettype_tcp) {
		INSIST(address == NULL);
		dev->address = sock->address;
		return;
	}

	if (sock->type != isc_sockettype_udp)
		return;

	dev->address = (address != NULL) ? *address : sock->address;
}

630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
/*
 * Allocate a socket event of type 'eventtype' from the memory context of
 * the manager that owns 'sock', with 'sock' as the event sender and
 * 'action'/'arg' as its handler.
 *
 * Returns the new event with its socket-specific fields reset (result
 * ISC_R_UNEXPECTED, empty buffer list, NULL region base, zero counters
 * and attributes), or NULL if isc_event_allocate() fails.
 */
static isc_socketevent_t *
allocate_socketevent(isc_socket_t *sock, isc_eventtype_t eventtype,
		     isc_taskaction_t action, void *arg)
{
	isc_mem_t *mctx = sock->manager->mctx;
	isc_socketevent_t *sev;

	sev = (isc_socketevent_t *)isc_event_allocate(mctx, sock, eventtype,
						      action, arg,
						      sizeof (*sev));
	if (sev != NULL) {
		sev->result = ISC_R_UNEXPECTED;
		ISC_LINK_INIT(sev, ev_link);
		ISC_LIST_INIT(sev->bufferlist);
		sev->region.base = NULL;
		sev->n = 0;
		sev->offset = 0;
		sev->attributes = 0;
	}

	return (sev);
}

655
656
657
658
659
660
661
662
663
#if defined(ISC_SOCKET_DEBUG)
/*
 * Debugging aid: print the contents of a msghdr (name, iovec array and,
 * on 4.4BSD style systems, the control buffer) on stdout.
 *
 * Lengths are cast to a known type before printing, because their
 * underlying types (socklen_t, size_t, int) differ between systems and a
 * printf() conversion that does not match its argument is undefined.
 * Pointers are cast to void * as "%p" requires.
 */
static void
dump_msg(struct msghdr *msg)
{
	unsigned int i;

	printf("MSGHDR %p\n", (void *)msg);
	printf("\tname %p, namelen %u\n", (void *)msg->msg_name,
	       (unsigned int)msg->msg_namelen);
	printf("\tiov %p, iovlen %lu\n", (void *)msg->msg_iov,
	       (unsigned long)msg->msg_iovlen);
	for (i = 0 ; i < (unsigned int)msg->msg_iovlen ; i++)
		printf("\t\t%u\tbase %p, len %lu\n", i,
		       (void *)msg->msg_iov[i].iov_base,
		       (unsigned long)msg->msg_iov[i].iov_len);
#ifdef ISC_NET_BSD44MSGHDR
	printf("\tcontrol %p, controllen %lu\n", (void *)msg->msg_control,
	       (unsigned long)msg->msg_controllen);
#endif
}
#endif

Michael Graff's avatar
Michael Graff committed
675
676
677
678
679
680
/*
 * Return values of doio_recv() and doio_send().
 */
#define DOIO_SUCCESS		0	/* i/o ok, event sent */
#define DOIO_SOFT		1	/* i/o ok, soft error, no event sent */
#define DOIO_HARD		2	/* i/o error, event sent */
#define DOIO_EOF		3	/* EOF, no event sent */
#define DOIO_UNEXPECTED		(-1)	/* bad stuff, no event sent */

681
682
683
684
/*
 * Attempt one recvmsg() on 'sock' for the receive request 'dev'.
 *
 * Returns:
 *	DOIO_SUCCESS	data was received and the done event was sent.
 *	DOIO_SOFT	nothing final happened (soft error, or fewer than
 *			dev->minimum bytes so far); no event was sent.
 *	DOIO_HARD	a hard error occurred; the done event was sent
 *			with the error result.
 *	DOIO_EOF	zero-length read on a TCP socket; no event sent.
 *
 * The errno from a failed recvmsg() is saved immediately, so that the
 * tracing code cannot disturb it before it has been classified.
 */
static int
doio_recv(isc_socket_t *sock, isc_socketevent_t *dev)
{
	int cc;
	int recv_errno;
	struct iovec iov[MAXSCATTERGATHER_RECV];
	size_t read_count;
	size_t actual_count;
	struct msghdr msghdr;
	isc_buffer_t *buffer;

	build_msghdr_recv(sock, dev, &msghdr, iov,
			  MAXSCATTERGATHER_RECV, &read_count);

#if defined(ISC_SOCKET_DEBUG)
	dump_msg(&msghdr);
#endif

	cc = recvmsg(sock->fd, &msghdr, 0);

	if (cc < 0) {
		recv_errno = errno;

		if (SOFT_ERROR(recv_errno))
			return (DOIO_SOFT);

		XTRACE(TRACE_RECV,
		       ("doio_recv: recvmsg(%d) %d bytes, err %d/%s\n",
			sock->fd, cc, recv_errno, strerror(recv_errno)));

/*
 * SOFT_OR_HARD: the error is hard only on a connected socket (and is
 * only remembered in the socket for TCP); otherwise it is soft.
 * ALWAYS_HARD: the error is recorded and reported unconditionally.
 */
#define SOFT_OR_HARD(_system, _isc) \
	if (recv_errno == _system) { \
		if (sock->connected) { \
			if (sock->type == isc_sockettype_tcp) \
				sock->recv_result = _isc; \
			send_recvdone_event(sock, &dev, _isc); \
			return (DOIO_HARD); \
		} \
		return (DOIO_SOFT); \
	}
#define ALWAYS_HARD(_system, _isc) \
	if (recv_errno == _system) { \
		sock->recv_result = _isc; \
		send_recvdone_event(sock, &dev, _isc); \
		return (DOIO_HARD); \
	}

		SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
		ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
		ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
		ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);

#undef SOFT_OR_HARD
#undef ALWAYS_HARD

		/*
		 * Any other error is an unexpected hard error.  This is
		 * an i/o error with the event sent, i.e. DOIO_HARD, as
		 * doio_send() returns for the same case.
		 */
		sock->recv_result = ISC_R_UNEXPECTED;
		send_recvdone_event(sock, &dev, ISC_R_UNEXPECTED);
		return (DOIO_HARD);
	}

	/*
	 * On TCP, zero length reads indicate EOF, while on
	 * UDP, zero length reads are perfectly valid, although
	 * strange.
	 */
	if ((sock->type == isc_sockettype_tcp) && (cc == 0)) {
		sock->recv_result = ISC_R_EOF;
		return (DOIO_EOF);
	}

	/* Record how much of the sender's address recvmsg() filled in. */
	if (sock->type == isc_sockettype_udp)
		dev->address.length = msghdr.msg_namelen;

	/*
	 * Overflow bit detection.  If we received MORE bytes than we should,
	 * this indicates an overflow situation.  Set the flag in the
	 * dev entry and adjust how much we read by one.
	 */
#ifdef ISC_NET_RECVOVERFLOW
	if ((sock->type == isc_sockettype_udp) && ((size_t)cc > read_count)) {
		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
		cc--;
	}
#endif

	/*
	 * If there are control messages attached, run through them and pull
	 * out the interesting bits.
	 */
	if (sock->type == isc_sockettype_udp)
		process_cmsg(sock, &msghdr, dev);

	/*
	 * Update the buffers (if any) and the i/o count.  The received
	 * bytes are accounted to the listed buffers in order, filling
	 * each one's available space before moving to the next.
	 */
	dev->n += cc;
	actual_count = cc;
	buffer = ISC_LIST_HEAD(dev->bufferlist);
	while (buffer != NULL && actual_count > 0) {
		REQUIRE(ISC_BUFFER_VALID(buffer));
		if (ISC_BUFFER_AVAILABLECOUNT(buffer) <= actual_count) {
			actual_count -= ISC_BUFFER_AVAILABLECOUNT(buffer);
			isc_buffer_add(buffer,
				       ISC_BUFFER_AVAILABLECOUNT(buffer));
		} else {
			isc_buffer_add(buffer, actual_count);
			actual_count = 0;
			break;
		}
		buffer = ISC_LIST_NEXT(buffer, link);
		if (buffer == NULL) {
			/* The data must have fit in the listed buffers. */
			INSIST(actual_count == 0);
		}
	}

	/*
	 * If we read less than we expected, update counters,
	 * and let the upper layer poke the descriptor.
	 */
	if (((size_t)cc != read_count) && (dev->n < dev->minimum))
		return (DOIO_SOFT);

	/*
	 * Full reads are posted, or partials if partials are ok.
	 */
	send_recvdone_event(sock, &dev, ISC_R_SUCCESS);
	return (DOIO_SUCCESS);
}

/*
 * Attempt one sendmsg() on 'sock' for the send request 'dev'.
 *
 * Returns:
 *	DOIO_SUCCESS	everything requested was written and the done
 *			event was sent.
 *	DOIO_SOFT	soft error or partial write; no event was sent.
 *	DOIO_HARD	a hard error occurred; the done event was sent
 *			with the error result.
 */
static int
doio_send(isc_socket_t *sock, isc_socketevent_t *dev)
{
	struct iovec iov[MAXSCATTERGATHER_SEND];
	struct msghdr msghdr;
	size_t write_count;
	int cc;

	/* XXXMLG Should verify that we didn't overflow MAXSCATTERGATHER? */
	build_msghdr_send(sock, dev, &msghdr, iov,
			  MAXSCATTERGATHER_SEND, &write_count);

	cc = sendmsg(sock->fd, &msghdr, 0);

	/*
	 * Check for error or block condition.
	 */
	if (cc < 0) {
		isc_result_t result;

		if (SOFT_ERROR(errno))
			return (DOIO_SOFT);

		switch (errno) {
		case ECONNREFUSED:
			/*
			 * Hard only on a connected socket, and only
			 * remembered in the socket for TCP.
			 */
			if (!sock->connected)
				return (DOIO_SOFT);
			if (sock->type == isc_sockettype_tcp)
				sock->send_result = ISC_R_CONNREFUSED;
			send_senddone_event(sock, &dev, ISC_R_CONNREFUSED);
			return (DOIO_HARD);

		case ENETUNREACH:
			result = ISC_R_NETUNREACH;
			break;

		case EHOSTUNREACH:
			result = ISC_R_HOSTUNREACH;
			break;

		case ENOBUFS:
			result = ISC_R_NORESOURCES;
			break;

		default:
			/*
			 * The other error types depend on whether or not the
			 * socket is UDP or TCP.  If it is UDP, some errors
			 * that we expect to be fatal under TCP are merely
			 * annoying, and are really soft errors.
			 *
			 * However, these soft errors are still returned as
			 * a status.
			 */
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "internal_send: %s",
					 strerror(errno));
			result = ISC_R_UNEXPECTED;
			break;
		}

		sock->send_result = result;
		send_senddone_event(sock, &dev, result);
		return (DOIO_HARD);
	}

	if (cc == 0)
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "internal_send: send() returned 0");

	/*
	 * Update the count of bytes written so far.  If we wrote less
	 * than we expected, the caller has to poke and try again.
	 */
	dev->n += cc;
	if ((size_t)cc != write_count)
		return (DOIO_SOFT);

	/*
	 * Exactly what we wanted to write.  We're done with this
	 * entry.  Post its completion event.
	 */
	send_senddone_event(sock, &dev, ISC_R_SUCCESS);

	return (DOIO_SUCCESS);
}

890
891
892
/*
 * Kill.
 *
893
894
 * Caller must ensure that the socket is not locked and no external
 * references exist.
895
896
 */
static void
Bob Halley's avatar
Bob Halley committed
897
destroy(isc_socket_t **sockp)
898
{
Bob Halley's avatar
Bob Halley committed
899
900
	isc_socket_t *sock = *sockp;
	isc_socketmgr_t *manager = sock->manager;
901

Michael Graff's avatar
Michael Graff committed
902
903
	XTRACE(TRACE_MANAGER,
	       ("destroy sockp = %p, sock = %p\n", sockp, sock));
Michael Graff's avatar
Michael Graff committed
904

905
906
907
908
909
	INSIST(ISC_LIST_EMPTY(sock->accept_list));
	INSIST(ISC_LIST_EMPTY(sock->recv_list));
	INSIST(ISC_LIST_EMPTY(sock->send_list));
	INSIST(sock->connect_ev == NULL);

910
911
912
	LOCK(&manager->lock);

	/*
Bob Halley's avatar
Bob Halley committed
913
	 * No one has this socket open, so the watcher doesn't have to be
Michael Graff's avatar
Michael Graff committed
914
	 * poked, and the socket doesn't have to be locked.
915
	 */
Michael Graff's avatar
Michael Graff committed
916
	manager->fds[sock->fd] = NULL;
Michael Graff's avatar
Michael Graff committed
917
918
	manager->fdstate[sock->fd] = CLOSE_PENDING;
	select_poke(sock->manager, sock->fd);
Michael Graff's avatar
Michael Graff committed
919
	manager->nsockets--;
Michael Graff's avatar
Michael Graff committed
920
	XTRACE(TRACE_MANAGER, ("nsockets == %d\n", manager->nsockets));
921
922
	if (manager->nsockets == 0)
		SIGNAL(&manager->shutdown_ok);
923

924
925
926
927
	/*
	 * XXX should reset manager->maxfd here
	 */

928
929
	UNLOCK(&manager->lock);

Michael Graff's avatar
Michael Graff committed
930
	free_socket(sockp);
Michael Graff's avatar
Michael Graff committed
931
932
933
}

static isc_result_t
Bob Halley's avatar
Bob Halley committed
934
935
allocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type,
		isc_socket_t **socketp)
Michael Graff's avatar
Michael Graff committed
936
{
Bob Halley's avatar
Bob Halley committed
937
	isc_socket_t *sock;
938
	isc_result_t ret;
Michael Graff's avatar
Michael Graff committed
939
940
941
942

	sock = isc_mem_get(manager->mctx, sizeof *sock);

	if (sock == NULL)
943
		return (ISC_R_NOMEMORY);
Michael Graff's avatar
Michael Graff committed
944

945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
#if USE_CMSG  /* Let's hope the OSs are sane, and pad correctly XXXMLG */
	sock->cmsglen = 0;
#ifdef ISC_PLATFORM_HAVEIPV6
	sock->cmsglen += CMSG_SPACE(sizeof(struct in6_pktinfo));
#endif
#ifdef SO_TIMESTAMP
	sock->cmsglen += CMSG_SPACE(sizeof(struct timeval));
#endif
	sock->cmsg = isc_mem_get(manager->mctx, sock->cmsglen);
	if (sock->cmsg == NULL) {
		ret = ISC_R_NOMEMORY;
		goto err1;
	}
#endif

960
961
962
	ret = ISC_R_UNEXPECTED;

	sock->magic = 0;
Michael Graff's avatar
Michael Graff committed
963
	sock->references = 0;
Michael Graff's avatar
Michael Graff committed
964
965
966

	sock->manager = manager;
	sock->type = type;
967
	sock->fd = -1;
Michael Graff's avatar
Michael Graff committed
968
969
970
971

	/*
	 * set up list of readers and writers to be initially empty
	 */
972
973
	ISC_LIST_INIT(sock->recv_list);
	ISC_LIST_INIT(sock->send_list);
974
	ISC_LIST_INIT(sock->accept_list);
Michael Graff's avatar
Michael Graff committed
975
	sock->connect_ev = NULL;
Michael Graff's avatar
Michael Graff committed
976
977
978
979
980
981
	sock->pending_recv = 0;
	sock->pending_send = 0;
	sock->pending_accept = 0;
	sock->listener = 0;
	sock->connected = 0;
	sock->connecting = 0;
982

983
984
985
	sock->recv_result = ISC_R_SUCCESS;
	sock->send_result = ISC_R_SUCCESS;

Michael Graff's avatar
Michael Graff committed
986
987
988
989
990
991
992
	/*
	 * initialize the lock
	 */
	if (isc_mutex_init(&sock->lock) != ISC_R_SUCCESS) {
		sock->magic = 0;
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "isc_mutex_init() failed");
993
		ret = ISC_R_UNEXPECTED;
994
		goto err2;
Michael Graff's avatar
Michael Graff committed
995
996
	}

997
	/*
998
	 * Initialize readable and writable events
999
1000
	 */
	ISC_EVENT_INIT(&sock->readable_ev, sizeof(intev_t),
For faster browsing, not all history is shown. View entire blame