socket.c 58 KB
Newer Older
Bob Halley's avatar
Bob Halley committed
1
/*
2
 * Copyright (C) 1998, 1999  Internet Software Consortium.
Bob Halley's avatar
Bob Halley committed
3
4
5
6
7
8
9
10
11
12
13
14
15
16
 * 
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS" AND INTERNET SOFTWARE CONSORTIUM DISCLAIMS
 * ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL INTERNET SOFTWARE
 * CONSORTIUM BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
 * DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
 * PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 */
Bob Halley's avatar
Bob Halley committed
17
18

#include <config.h>
19

Michael Graff's avatar
Michael Graff committed
20
21
22
#include <sys/types.h>
#include <sys/uio.h>

23
24
25
26
27
#include <errno.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
Michael Graff's avatar
Michael Graff committed
28
#include <fcntl.h>
29
30

#include <isc/assertions.h>
31
#include <isc/error.h>
32
33
34
35
#include <isc/thread.h>
#include <isc/mutex.h>
#include <isc/condition.h>
#include <isc/socket.h>
36
#include <isc/list.h>
37

38
#include "util.h"
Bob Halley's avatar
Bob Halley committed
39

40
41
42
43
/*
 * Some systems define the socket length argument as an int, some as size_t,
 * some as socklen_t.  This is here, so it can be easily changed if needed.
 */
44
#ifndef ISC_SOCKADDR_LEN_T
45
#define ISC_SOCKADDR_LEN_T int
46
#endif
47

48
49
50
51
52
/*
 * As above, one system (solaris) wants the pointers passed into recv() and
 * the other network functions to be char *.  All the others seem to use
 * void *.  Cast everything to char * for now.
 */
53
#ifndef ISC_SOCKDATA_CAST
54
#define ISC_SOCKDATA_CAST(x) ((char *)(x))
55
#endif
56

57
58
59
/*
 * If we cannot send to this task, the application is broken.
 */
60
#define ISC_TASK_SEND(a, b) do { \
61
	RUNTIME_CHECK(isc_task_send(a, b) == ISC_R_SUCCESS); \
62
} while (0)
63

64
65
66
67
#define ISC_TASK_SENDANDDETACH(a, b) do { \
	RUNTIME_CHECK(isc_task_sendanddetach(a, b) == ISC_R_SUCCESS); \
} while (0)

68
69
70
/*
 * Define what the possible "soft" errors can be.  These are non-fatal returns
 * of various network related functions, like recv() and so on.
71
72
73
74
 *
 * For some reason, BSDI (and perhaps others) will sometimes return <0
 * from recv() but will have errno==0.  This is broken, but we have to
 * work around it here.
75
 */
76
77
78
79
#define SOFT_ERROR(e)	((e) == EAGAIN || \
			 (e) == EWOULDBLOCK || \
			 (e) == EINTR || \
			 (e) == 0)
80

Michael Graff's avatar
Michael Graff committed
81
#if 0
82
83
84
85
#define ISC_SOCKET_DEBUG
#endif

#if defined(ISC_SOCKET_DEBUG)
Michael Graff's avatar
Michael Graff committed
86
87
88
89
90
91
92
#define TRACE_WATCHER	0x0001
#define TRACE_LISTEN	0x0002
#define TRACE_CONNECT	0x0004
#define TRACE_RECV	0x0008
#define TRACE_SEND    	0x0010
#define TRACE_MANAGER	0x0020

Michael Graff's avatar
Michael Graff committed
93
int trace_level = TRACE_RECV | TRACE_WATCHER;
94
95
#define XTRACE(l, a)	do {						\
				if ((l) & trace_level) {		\
Michael Graff's avatar
Michael Graff committed
96
					printf("[%s:%d] ", __FILE__, __LINE__); \
97
98
99
100
					printf a;			\
					fflush(stdout);			\
				}					\
			} while (0)
101
102
#define XENTER(l, a)	do {						\
				if ((l) & trace_level)			\
103
					fprintf(stderr, "ENTER %s\n", (a)); \
104
105
106
			} while (0)
#define XEXIT(l, a)	do {						\
				if ((l) & trace_level)			\
107
					fprintf(stderr, "EXIT %s\n", (a)); \
108
			} while (0)
109
#else
Michael Graff's avatar
Michael Graff committed
110
111
112
#define XTRACE(l, a)
#define XENTER(l, a)
#define XEXIT(l, a)
113
114
#endif

115
typedef isc_event_t intev_t;
Michael Graff's avatar
Michael Graff committed
116
117
118

#define SOCKET_MAGIC		0x494f696fU	/* IOio */
#define VALID_SOCKET(t)		((t) != NULL && (t)->magic == SOCKET_MAGIC)
Michael Graff's avatar
Michael Graff committed
119

120
121
struct isc_socket {
	/* Not locked. */
122
123
124
125
	unsigned int			magic;
	isc_socketmgr_t		       *manager;
	isc_mutex_t			lock;
	isc_sockettype_t		type;
Michael Graff's avatar
Michael Graff committed
126

127
	/* Locked by socket lock. */
128
129
130
131
	unsigned int			references;
	int				fd;
	isc_result_t			recv_result;
	isc_result_t			send_result;
132

133
	ISC_LIST(isc_socketevent_t)		send_list;
134
	ISC_LIST(isc_socketevent_t)		recv_list;
135
	ISC_LIST(isc_socket_newconnev_t)	accept_list;
136
137
138
139
140
141
142
	isc_socket_connev_t		       *connect_ev;

	/*
	 * Internal events.  Posted when a descriptor is readable or
	 * writable.  These are statically allocated and never freed.
	 * They will be set to non-purgable before use.
	 */
143
	intev_t				readable_ev;
144
	intev_t				writable_ev;
145

146
147
	isc_sockaddr_t			address;  /* remote address */

Michael Graff's avatar
Michael Graff committed
148
149
150
151
152
153
	unsigned int			pending_recv : 1,
					pending_send : 1,
					pending_accept : 1,
					listener : 1, /* listener socket */
					connected : 1,
					connecting : 1; /* connect pending */
154
155
156
157
158
159
160
161
};

#define SOCKET_MANAGER_MAGIC		0x494f6d67U	/* IOmg */
#define VALID_MANAGER(m)		((m) != NULL && \
					 (m)->magic == SOCKET_MANAGER_MAGIC)
struct isc_socketmgr {
	/* Not locked. */
	unsigned int			magic;
162
	isc_mem_t		       *mctx;
163
164
	isc_mutex_t			lock;
	/* Locked by manager lock. */
Michael Graff's avatar
Michael Graff committed
165
	unsigned int			nsockets;  /* sockets managed */
Michael Graff's avatar
Michael Graff committed
166
	isc_thread_t			watcher;
167
	isc_condition_t			shutdown_ok;
Michael Graff's avatar
Michael Graff committed
168
	fd_set				read_fds;
169
	fd_set				write_fds;
170
	isc_socket_t		       *fds[FD_SETSIZE];
Michael Graff's avatar
Michael Graff committed
171
	int				fdstate[FD_SETSIZE];
172
	int				maxfd;
Michael Graff's avatar
Michael Graff committed
173
	int				pipe_fds[2];
174
175
};

Michael Graff's avatar
Michael Graff committed
176
177
178
179
#define CLOSED		0	/* this one must be zero */
#define MANAGED		1
#define CLOSE_PENDING	2

180
181
static void send_recvdone_event(isc_socket_t *, isc_task_t **,
				isc_socketevent_t **, isc_result_t, int);
182
183
static void send_senddone_event(isc_socket_t *, isc_task_t **,
				isc_socketevent_t **, isc_result_t, int);
Bob Halley's avatar
Bob Halley committed
184
185
186
187
static void free_socket(isc_socket_t **);
static isc_result_t allocate_socket(isc_socketmgr_t *, isc_sockettype_t,
				    isc_socket_t **);
static void destroy(isc_socket_t **);
188
189
190
191
static void internal_accept(isc_task_t *, isc_event_t *);
static void internal_connect(isc_task_t *, isc_event_t *);
static void internal_recv(isc_task_t *, isc_event_t *);
static void internal_send(isc_task_t *, isc_event_t *);
Michael Graff's avatar
Michael Graff committed
192
193
194
195

#define SELECT_POKE_SHUTDOWN		(-1)
#define SELECT_POKE_NOTHING		(-2)
#define SELECT_POKE_RESCAN		(-3) /* XXX implement */
196
197

/*
Michael Graff's avatar
Michael Graff committed
198
199
200
 * Poke the select loop when there is something for us to do.
 * We assume that if a write completes here, it will be inserted into the
 * queue fully.  That is, we will not get partial writes.
201
202
 */
static void
Bob Halley's avatar
Bob Halley committed
203
select_poke(isc_socketmgr_t *mgr, int msg)
204
{
Michael Graff's avatar
Michael Graff committed
205
206
	int cc;

207
208
209
210
211
	do {
		cc = write(mgr->pipe_fds[1], &msg, sizeof(int));
	} while (cc < 0 && SOFT_ERROR(errno));

	if (cc < 0)
Michael Graff's avatar
Michael Graff committed
212
213
214
		FATAL_ERROR(__FILE__, __LINE__,
			    "write() failed during watcher poke: %s",
			    strerror(errno));
215
216

	INSIST(cc == sizeof(int));
217
218
219
220
221
}

/*
 * read a message on the internal fd.
 */
Michael Graff's avatar
Michael Graff committed
222
static int
Bob Halley's avatar
Bob Halley committed
223
select_readmsg(isc_socketmgr_t *mgr)
224
{
Michael Graff's avatar
Michael Graff committed
225
	int msg;
Michael Graff's avatar
Michael Graff committed
226
227
	int cc;

Michael Graff's avatar
Michael Graff committed
228
	cc = read(mgr->pipe_fds[0], &msg, sizeof(int));
Michael Graff's avatar
Michael Graff committed
229
	if (cc < 0) {
Michael Graff's avatar
Michael Graff committed
230
		if (SOFT_ERROR(errno))
Michael Graff's avatar
Michael Graff committed
231
			return (SELECT_POKE_NOTHING);
Michael Graff's avatar
Michael Graff committed
232

Michael Graff's avatar
Michael Graff committed
233
234
235
236
		FATAL_ERROR(__FILE__, __LINE__,
			    "read() failed during watcher poke: %s",
			    strerror(errno));

Michael Graff's avatar
Michael Graff committed
237
		return (SELECT_POKE_NOTHING);
Michael Graff's avatar
Michael Graff committed
238
	}
239

Michael Graff's avatar
Michael Graff committed
240
	return (msg);
241
242
243
}

/*
Michael Graff's avatar
Michael Graff committed
244
 * Make a fd non-blocking
245
 */
Michael Graff's avatar
Michael Graff committed
246
247
static isc_result_t
make_nonblock(int fd)
248
{
Michael Graff's avatar
Michael Graff committed
249
250
	int ret;
	int flags;
251

Michael Graff's avatar
Michael Graff committed
252
253
254
	flags = fcntl(fd, F_GETFL, 0);
	flags |= O_NONBLOCK;
	ret = fcntl(fd, F_SETFL, flags);
255

Michael Graff's avatar
Michael Graff committed
256
257
258
259
	if (ret == -1) {
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "fcntl(%d, F_SETFL, %d): %s",
				 fd, flags, strerror(errno));
Michael Graff's avatar
Michael Graff committed
260

Michael Graff's avatar
Michael Graff committed
261
		return (ISC_R_UNEXPECTED);
Michael Graff's avatar
Michael Graff committed
262
263
	}

Michael Graff's avatar
Michael Graff committed
264
	return (ISC_R_SUCCESS);
265
266
267
268
269
}

/*
 * Kill.
 *
270
271
 * Caller must ensure that the socket is not locked and no external
 * references exist.
272
273
 */
static void
Bob Halley's avatar
Bob Halley committed
274
destroy(isc_socket_t **sockp)
275
{
Bob Halley's avatar
Bob Halley committed
276
277
	isc_socket_t *sock = *sockp;
	isc_socketmgr_t *manager = sock->manager;
278

Michael Graff's avatar
Michael Graff committed
279
280
	XTRACE(TRACE_MANAGER,
	       ("destroy sockp = %p, sock = %p\n", sockp, sock));
Michael Graff's avatar
Michael Graff committed
281

282
283
284
285
286
	INSIST(ISC_LIST_EMPTY(sock->accept_list));
	INSIST(ISC_LIST_EMPTY(sock->recv_list));
	INSIST(ISC_LIST_EMPTY(sock->send_list));
	INSIST(sock->connect_ev == NULL);

287
288
289
	LOCK(&manager->lock);

	/*
Bob Halley's avatar
Bob Halley committed
290
	 * No one has this socket open, so the watcher doesn't have to be
Michael Graff's avatar
Michael Graff committed
291
	 * poked, and the socket doesn't have to be locked.
292
	 */
Michael Graff's avatar
Michael Graff committed
293
	manager->fds[sock->fd] = NULL;
Michael Graff's avatar
Michael Graff committed
294
295
	manager->fdstate[sock->fd] = CLOSE_PENDING;
	select_poke(sock->manager, sock->fd);
Michael Graff's avatar
Michael Graff committed
296
	manager->nsockets--;
Michael Graff's avatar
Michael Graff committed
297
	XTRACE(TRACE_MANAGER, ("nsockets == %d\n", manager->nsockets));
298
299
	if (manager->nsockets == 0)
		SIGNAL(&manager->shutdown_ok);
300

301
302
303
304
	/*
	 * XXX should reset manager->maxfd here
	 */

305
306
	UNLOCK(&manager->lock);

Michael Graff's avatar
Michael Graff committed
307
	free_socket(sockp);
Michael Graff's avatar
Michael Graff committed
308
309
310
}

static isc_result_t
Bob Halley's avatar
Bob Halley committed
311
312
allocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type,
		isc_socket_t **socketp)
Michael Graff's avatar
Michael Graff committed
313
{
Bob Halley's avatar
Bob Halley committed
314
	isc_socket_t *sock;
315
	isc_result_t ret;
Michael Graff's avatar
Michael Graff committed
316
317
318
319

	sock = isc_mem_get(manager->mctx, sizeof *sock);

	if (sock == NULL)
320
		return (ISC_R_NOMEMORY);
Michael Graff's avatar
Michael Graff committed
321

322
323
324
	ret = ISC_R_UNEXPECTED;

	sock->magic = 0;
Michael Graff's avatar
Michael Graff committed
325
	sock->references = 0;
Michael Graff's avatar
Michael Graff committed
326
327
328

	sock->manager = manager;
	sock->type = type;
329
	sock->fd = -1;
Michael Graff's avatar
Michael Graff committed
330
331
332
333

	/*
	 * set up list of readers and writers to be initially empty
	 */
334
335
	ISC_LIST_INIT(sock->recv_list);
	ISC_LIST_INIT(sock->send_list);
336
	ISC_LIST_INIT(sock->accept_list);
Michael Graff's avatar
Michael Graff committed
337
	sock->connect_ev = NULL;
Michael Graff's avatar
Michael Graff committed
338
339
340
341
342
343
	sock->pending_recv = 0;
	sock->pending_send = 0;
	sock->pending_accept = 0;
	sock->listener = 0;
	sock->connected = 0;
	sock->connecting = 0;
344

345
346
347
	sock->recv_result = ISC_R_SUCCESS;
	sock->send_result = ISC_R_SUCCESS;

Michael Graff's avatar
Michael Graff committed
348
349
350
351
352
353
354
	/*
	 * initialize the lock
	 */
	if (isc_mutex_init(&sock->lock) != ISC_R_SUCCESS) {
		sock->magic = 0;
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "isc_mutex_init() failed");
355
356
		ret = ISC_R_UNEXPECTED;
		goto err1;
Michael Graff's avatar
Michael Graff committed
357
358
	}

359
	/*
360
	 * Initialize readable and writable events
361
362
	 */
	ISC_EVENT_INIT(&sock->readable_ev, sizeof(intev_t),
Bob Halley's avatar
Bob Halley committed
363
		       ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTR,
364
		       NULL, sock, sock, NULL, NULL);
365
	ISC_EVENT_INIT(&sock->writable_ev, sizeof(intev_t),
366
		       ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTW,
367
		       NULL, sock, sock, NULL, NULL);
368
369

	sock->magic = SOCKET_MAGIC;
Michael Graff's avatar
Michael Graff committed
370
371
372
	*socketp = sock;

	return (ISC_R_SUCCESS);
373
374
375
376
377

 err1: /* socket allocated */
	isc_mem_put(manager->mctx, sock, sizeof *sock);

	return (ret);
Michael Graff's avatar
Michael Graff committed
378
379
380
381
382
383
384
385
386
387
}

/*
 * This event requires that the various lists be empty, that the reference
 * count be 1, and that the magic number is valid.  The other socket bits,
 * like the lock, must be initialized as well.  The fd associated must be
 * marked as closed, by setting it to -1 on close, or this routine will
 * also close the socket.
 */
static void
Bob Halley's avatar
Bob Halley committed
388
free_socket(isc_socket_t **socketp)
Michael Graff's avatar
Michael Graff committed
389
{
Bob Halley's avatar
Bob Halley committed
390
	isc_socket_t *sock = *socketp;
Michael Graff's avatar
Michael Graff committed
391

392
393
394
395
396
397
398
399
400
	INSIST(sock->references == 0);
	INSIST(VALID_SOCKET(sock));
	INSIST(!sock->connecting);
	INSIST(!sock->pending_recv);
	INSIST(!sock->pending_send);
	INSIST(!sock->pending_accept);
	INSIST(EMPTY(sock->recv_list));
	INSIST(EMPTY(sock->send_list));
	INSIST(EMPTY(sock->accept_list));
Michael Graff's avatar
Michael Graff committed
401

402
	sock->magic = 0;
Michael Graff's avatar
Michael Graff committed
403
404
405
406

	(void)isc_mutex_destroy(&sock->lock);

	isc_mem_put(sock->manager->mctx, sock, sizeof *sock);
Michael Graff's avatar
Michael Graff committed
407
408

	*socketp = NULL;
409
410
411
412
413
414
415
416
417
418
}

/*
 * Create a new 'type' socket managed by 'manager'.  The sockets
 * parameters are specified by 'expires' and 'interval'.  Events
 * will be posted to 'task' and when dispatched 'action' will be
 * called with 'arg' as the arg value.  The new socket is returned
 * in 'socketp'.
 */
isc_result_t
Bob Halley's avatar
Bob Halley committed
419
isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
Bob Halley's avatar
Bob Halley committed
420
		  isc_socket_t **socketp)
421
{
Bob Halley's avatar
Bob Halley committed
422
	isc_socket_t *sock = NULL;
Michael Graff's avatar
Michael Graff committed
423
	isc_result_t ret;
424
425
426
427

	REQUIRE(VALID_MANAGER(manager));
	REQUIRE(socketp != NULL && *socketp == NULL);

Michael Graff's avatar
Michael Graff committed
428
	XENTER(TRACE_MANAGER, "isc_socket_create");
Michael Graff's avatar
Michael Graff committed
429
430
431
432
	
	ret = allocate_socket(manager, type, &sock);
	if (ret != ISC_R_SUCCESS)
		return (ret);
433
434

	switch (type) {
435
	case isc_sockettype_udp:
Bob Halley's avatar
Bob Halley committed
436
		sock->fd = socket(pf, SOCK_DGRAM, IPPROTO_UDP);
437
		break;
438
	case isc_sockettype_tcp:
Bob Halley's avatar
Bob Halley committed
439
		sock->fd = socket(pf, SOCK_STREAM, IPPROTO_TCP);
440
441
442
		break;
	}
	if (sock->fd < 0) {
Michael Graff's avatar
Michael Graff committed
443
		free_socket(&sock);
444
445
446
447
448
449

		switch (errno) {
		case EMFILE:
		case ENFILE:
		case ENOBUFS:
			return (ISC_R_NORESOURCES);
450
			/* NOTREACHED */
451
452
453
454
455
456
			break;
		default:
			UNEXPECTED_ERROR(__FILE__, __LINE__,
					 "socket() failed: %s",
					 strerror(errno));
			return (ISC_R_UNEXPECTED);
457
			/* NOTREACHED */
458
459
460
461
			break;
		}
	}

Michael Graff's avatar
Michael Graff committed
462
	if (make_nonblock(sock->fd) != ISC_R_SUCCESS) {
Michael Graff's avatar
Michael Graff committed
463
		free_socket(&sock);
Michael Graff's avatar
Michael Graff committed
464
465
466
		return (ISC_R_UNEXPECTED);
	}

467
468
469
	sock->references = 1;
	*socketp = sock;

470
471
472
473
474
475
476
	LOCK(&manager->lock);

	/*
	 * Note we don't have to lock the socket like we normally would because
	 * there are no external references to it yet.
	 */

Michael Graff's avatar
Michael Graff committed
477
	manager->fds[sock->fd] = sock;
Michael Graff's avatar
Michael Graff committed
478
	manager->fdstate[sock->fd] = MANAGED;
Michael Graff's avatar
Michael Graff committed
479
	manager->nsockets++;
Michael Graff's avatar
Michael Graff committed
480
	XTRACE(TRACE_MANAGER, ("nsockets == %d\n", manager->nsockets));
481
482
	if (manager->maxfd < sock->fd)
		manager->maxfd = sock->fd;
483
484
485

	UNLOCK(&manager->lock);

Michael Graff's avatar
Michael Graff committed
486
	XEXIT(TRACE_MANAGER, "isc_socket_create");
487

Michael Graff's avatar
Michael Graff committed
488
	return (ISC_R_SUCCESS);
489
490
491
492
493
494
}

/*
 * Attach to a socket.  Caller must explicitly detach when it is done.
 */
void
Bob Halley's avatar
Bob Halley committed
495
isc_socket_attach(isc_socket_t *sock, isc_socket_t **socketp)
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
{
	REQUIRE(VALID_SOCKET(sock));
	REQUIRE(socketp != NULL && *socketp == NULL);

	LOCK(&sock->lock);
	sock->references++;
	UNLOCK(&sock->lock);
	
	*socketp = sock;
}

/*
 * Dereference a socket.  If this is the last reference to it, clean things
 * up by destroying the socket.
 */
void 
Bob Halley's avatar
Bob Halley committed
512
isc_socket_detach(isc_socket_t **socketp)
513
{
Bob Halley's avatar
Bob Halley committed
514
	isc_socket_t *sock;
Michael Graff's avatar
Michael Graff committed
515
	isc_boolean_t kill_socket = ISC_FALSE;
516
517
518
519
520

	REQUIRE(socketp != NULL);
	sock = *socketp;
	REQUIRE(VALID_SOCKET(sock));

Michael Graff's avatar
Michael Graff committed
521
	XENTER(TRACE_MANAGER, "isc_socket_detach");
522
523
524
525
526

	LOCK(&sock->lock);
	REQUIRE(sock->references > 0);
	sock->references--;
	if (sock->references == 0)
Michael Graff's avatar
Michael Graff committed
527
		kill_socket = ISC_TRUE;
528
529
	UNLOCK(&sock->lock);
	
Michael Graff's avatar
Michael Graff committed
530
531
	if (kill_socket)
		destroy(&sock);
532

Michael Graff's avatar
Michael Graff committed
533
	XEXIT(TRACE_MANAGER, "isc_socket_detach");
534
535
536
537

	*socketp = NULL;
}

Michael Graff's avatar
Michael Graff committed
538
539
540
541
542
543
544
545
546
/*
 * I/O is possible on a given socket.  Schedule an event to this task that
 * will call an internal function to do the I/O.  This will charge the
 * task with the I/O operation and let our select loop handler get back
 * to doing something real as fast as possible.
 *
 * The socket and manager must be locked before calling this function.
 */
static void
Bob Halley's avatar
Bob Halley committed
547
dispatch_read(isc_socket_t *sock)
Michael Graff's avatar
Michael Graff committed
548
{
549
550
	intev_t *iev;
	isc_socketevent_t *ev;
Michael Graff's avatar
Michael Graff committed
551

552
553
	iev = &sock->readable_ev;
	ev = ISC_LIST_HEAD(sock->recv_list);
Michael Graff's avatar
Michael Graff committed
554

555
556
	INSIST(ev != NULL);
	INSIST(!sock->pending_recv);
Michael Graff's avatar
Michael Graff committed
557
	sock->pending_recv = 1;
Michael Graff's avatar
Michael Graff committed
558

Michael Graff's avatar
Michael Graff committed
559
	XTRACE(TRACE_WATCHER, ("dispatch_read:  posted event %p to task %p\n",
560
			       ev, ev->sender));
561

562
563
564
565
	sock->references++;
	iev->sender = sock;
	iev->action = internal_recv;
	iev->arg = sock;
Michael Graff's avatar
Michael Graff committed
566

567
	ISC_TASK_SEND(ev->sender, (isc_event_t **)&iev);
Michael Graff's avatar
Michael Graff committed
568
569
}

570
static void
Bob Halley's avatar
Bob Halley committed
571
dispatch_write(isc_socket_t *sock)
Michael Graff's avatar
Michael Graff committed
572
{
573
574
	intev_t *iev;
	isc_socketevent_t *ev;
Michael Graff's avatar
Michael Graff committed
575

576
577
	iev = &sock->writable_ev;
	ev = ISC_LIST_HEAD(sock->send_list);
Michael Graff's avatar
Michael Graff committed
578

579
580
	INSIST(ev != NULL);
	INSIST(!sock->pending_send);
Michael Graff's avatar
Michael Graff committed
581
	sock->pending_send = 1;
Michael Graff's avatar
Michael Graff committed
582

583
584
585
586
587
588
589
	XTRACE(TRACE_WATCHER, ("dispatch_send:  posted event %p to task %p\n",
			       ev, ev->sender));

	sock->references++;
	iev->sender = sock;
	iev->action = internal_send;
	iev->arg = sock;
Michael Graff's avatar
Michael Graff committed
590

591
	ISC_TASK_SEND(ev->sender, (isc_event_t **)&iev);
Michael Graff's avatar
Michael Graff committed
592
593
}

594
595
596
/*
 * Dispatch an internal accept event.
 */
597
static void
598
dispatch_accept(isc_socket_t *sock)
599
{
600
601
	intev_t *iev;
	isc_socket_newconnev_t *ev;
602

603
604
	iev = &sock->readable_ev;
	ev = ISC_LIST_HEAD(sock->accept_list);
605

606
607
	INSIST(ev != NULL);
	INSIST(!sock->pending_accept);
608
	sock->pending_accept = 1;
609

610
611
612
613
	sock->references++;  /* keep socket around for this internal event */
	iev->sender = sock;
	iev->action = internal_accept;
	iev->arg = sock;
614

615
	ISC_TASK_SEND(ev->sender, (isc_event_t **)&iev);
616
617
}

Michael Graff's avatar
Michael Graff committed
618
static void
Bob Halley's avatar
Bob Halley committed
619
dispatch_connect(isc_socket_t *sock)
Michael Graff's avatar
Michael Graff committed
620
{
621
622
	intev_t *iev;
	isc_socket_connev_t *ev;
Michael Graff's avatar
Michael Graff committed
623

624
	iev = &sock->writable_ev;
Michael Graff's avatar
Michael Graff committed
625

626
627
	ev = sock->connect_ev;
	INSIST(ev != NULL);
Michael Graff's avatar
Michael Graff committed
628

629
	INSIST(sock->connecting);
Michael Graff's avatar
Michael Graff committed
630

631
632
633
634
635
636
	sock->references++;  /* keep socket around for this internal event */
	iev->sender = sock;
	iev->action = internal_connect;
	iev->arg = sock;

	ISC_TASK_SEND(ev->sender, (isc_event_t **)&iev);
Michael Graff's avatar
Michael Graff committed
637
638
}

Michael Graff's avatar
Michael Graff committed
639
640
641
642
643
/*
 * Dequeue an item off the given socket's read queue, set the result code
 * in the done event to the one provided, and send it to the task it was
 * destined for.
 *
644
645
 * If the event to be sent is on a list, remove it before sending.  If
 * asked to, send and detach from the socket as well.
646
 *
Michael Graff's avatar
Michael Graff committed
647
648
 * Caller must have the socket locked.
 */
Michael Graff's avatar
Michael Graff committed
649
static void
650
651
652
send_recvdone_event(isc_socket_t *sock, isc_task_t **taskp,
		    isc_socketevent_t **dev, isc_result_t resultcode,
		    int detach)
Michael Graff's avatar
Michael Graff committed
653
654
{
	(*dev)->result = resultcode;
655
656
657
658
659
660
661
	(*dev)->sender = sock;
	if (ISC_LINK_LINKED(*dev, link))
		ISC_LIST_DEQUEUE(sock->recv_list, *dev, link);
	if (detach)
		ISC_TASK_SENDANDDETACH(taskp, (isc_event_t **)dev);
	else
		ISC_TASK_SEND(*taskp, (isc_event_t **)dev);
Michael Graff's avatar
Michael Graff committed
662
}
663
664

/*
Michael Graff's avatar
Michael Graff committed
665
 * See comments for send_recvdone_event() above.
666
667
668
 *
 * Caller must have the socket locked.
 */
669
static void
670
671
672
send_senddone_event(isc_socket_t *sock, isc_task_t **taskp,
		    isc_socketevent_t **dev, isc_result_t resultcode,
		    int detach)
673
674
{
	(*dev)->result = resultcode;
675
676
677
678
679
680
681
	(*dev)->sender = sock;
	if (ISC_LINK_LINKED(*dev, link))
		ISC_LIST_DEQUEUE(sock->send_list, *dev, link);
	if (detach)
		ISC_TASK_SENDANDDETACH(taskp, (isc_event_t **)dev);
	else
		ISC_TASK_SEND(*taskp, (isc_event_t **)dev);
682
}
Michael Graff's avatar
Michael Graff committed
683

Michael Graff's avatar
Michael Graff committed
684
685
686
/*
 * Call accept() on a socket, to get the new file descriptor.  The listen
 * socket is used as a prototype to create a new isc_socket_t.  The new
687
688
689
690
691
692
693
 * socket has one outstanding reference.  The task receiving the event
 * will be detached from just after the event is delivered.
 *
 * On entry to this function, the event delivered is the internal
 * readable event, and the first item on the accept_list should be
 * the done event we want to send.  If the list is empty, this is a no-op,
 * so just unlock and return.
Michael Graff's avatar
Michael Graff committed
694
 */
695
static void
696
internal_accept(isc_task_t *me, isc_event_t *ev)
Michael Graff's avatar
Michael Graff committed
697
{
Bob Halley's avatar
Bob Halley committed
698
699
700
	isc_socket_t *sock;
	isc_socketmgr_t *manager;
	isc_socket_newconnev_t *dev;
701
	isc_task_t *task;
702
	ISC_SOCKADDR_LEN_T addrlen;
Michael Graff's avatar
Michael Graff committed
703
	int fd;
704
	isc_result_t result = ISC_R_SUCCESS;
Michael Graff's avatar
Michael Graff committed
705

706
707
	(void)me;

Michael Graff's avatar
Michael Graff committed
708
	sock = ev->sender;
709
	INSIST(VALID_SOCKET(sock));
710

Michael Graff's avatar
Michael Graff committed
711
	LOCK(&sock->lock);
Michael Graff's avatar
Michael Graff committed
712
713
	XTRACE(TRACE_LISTEN,
	       ("internal_accept called, locked parent sock %p\n", sock));
Michael Graff's avatar
Michael Graff committed
714

715
	manager = sock->manager;
716
	INSIST(VALID_MANAGER(manager));
Michael Graff's avatar
Michael Graff committed
717

718
	INSIST(sock->listener);
719
720
	INSIST(sock->pending_accept == 1);
	sock->pending_accept = 0;
Michael Graff's avatar
Michael Graff committed
721

722
723
724
	INSIST(sock->references > 0);
	sock->references--;  /* the internal event is done with this socket */
	if (sock->references == 0) {
725
		UNLOCK(&sock->lock);
726
727
728
729
730
731
732
733
734
735
736
		destroy(&sock);
		return;
	}

	/*
	 * Get the first item off the accept list.
	 * If it is empty, unlock the socket and return.
	 */
	dev = ISC_LIST_HEAD(sock->accept_list);
	if (dev == NULL) {
		UNLOCK(&sock->lock);
737
		return;
738
739
	}

Michael Graff's avatar
Michael Graff committed
740
741
	/*
	 * Try to accept the new connection.  If the accept fails with
Michael Graff's avatar
Michael Graff committed
742
	 * EAGAIN or EINTR, simply poke the watcher to watch this socket
Michael Graff's avatar
Michael Graff committed
743
744
	 * again.
	 */
745
746
747
	addrlen = sizeof dev->newsocket->address.type;
	fd = accept(sock->fd, &dev->newsocket->address.type.sa, &addrlen);
	dev->newsocket->address.length = addrlen;
Michael Graff's avatar
Michael Graff committed
748
	if (fd < 0) {
Michael Graff's avatar
Michael Graff committed
749
		if (SOFT_ERROR(errno)) {
Michael Graff's avatar
Michael Graff committed
750
751
			select_poke(sock->manager, sock->fd);
			UNLOCK(&sock->lock);
752
			return;
Michael Graff's avatar
Michael Graff committed
753
754
755
756
		}

		/*
		 * If some other error, ignore it as well and hope
Michael Graff's avatar
Michael Graff committed
757
		 * for the best, but log it.
Michael Graff's avatar
Michael Graff committed
758
		 */
Michael Graff's avatar
Michael Graff committed
759
760
		XTRACE(TRACE_LISTEN, ("internal_accept: accept returned %s\n",
				      strerror(errno)));
761
762

		fd = -1;
763
764
765
766
767

		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "internal_accept: accept() failed: %s",
				 strerror(errno));

768
		result = ISC_R_UNEXPECTED;
Michael Graff's avatar
Michael Graff committed
769
	}
770

771
772
773
774
775
776
777
778
779
780
781
782
783
	/*
	 * Pull off the done event.
	 */
	ISC_LIST_UNLINK(sock->accept_list, dev, link);

	/*
	 * Poke watcher if there are more pending accepts.
	 */
	if (!EMPTY(sock->accept_list))
		select_poke(sock->manager, sock->fd);

	UNLOCK(&sock->lock);

784
	if (fd != -1 && (make_nonblock(fd) != ISC_R_SUCCESS)) {
785
		close(fd);
786
787
		fd = -1;

788
789
790
791
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "internal_accept: make_nonblock() failed: %s",
				 strerror(errno));

792
		result = ISC_R_UNEXPECTED;
793
	}
Michael Graff's avatar
Michael Graff committed
794

795
	/*
796
	 * -1 means the new socket didn't happen.
797
	 */
798
799
	if (fd != -1) {
		dev->newsocket->fd = fd;
800

801
802
803
		/*
		 * Save away the remote address
		 */
804
		dev->address = dev->newsocket->address;
805
806
807
808
809
810
811

		LOCK(&manager->lock);
		manager->fds[fd] = dev->newsocket;
		manager->fdstate[fd] = MANAGED;
		if (manager->maxfd < fd)
			manager->maxfd = fd;
		manager->nsockets++;
812
		XTRACE(TRACE_MANAGER, ("nsockets == %d\n", manager->nsockets));
813
814
815
816
817
		UNLOCK(&manager->lock);

		XTRACE(TRACE_LISTEN, ("internal_accept: newsock %p, fd %d\n",
				      dev->newsocket, fd));
	}
Michael Graff's avatar
Michael Graff committed
818

819
820
821
822
	/*
	 * Fill in the done event details and send it off.
	 */
	dev->result = result;
823
	task = dev->sender;
824
825
	dev->sender = sock;

826
	ISC_TASK_SENDANDDETACH(&task, (isc_event_t **)&dev);
Michael Graff's avatar
Michael Graff committed
827
828
}

829
static void
830
internal_recv(isc_task_t *me, isc_event_t *ev)
831
{
Bob Halley's avatar
Bob Halley committed
832
833
	isc_socketevent_t *dev;
	isc_socket_t *sock;
834
	isc_task_t *task;
Michael Graff's avatar
Michael Graff committed
835
836
	int cc;
	size_t read_count;
Michael Graff's avatar
Michael Graff committed
837
838
	struct msghdr msghdr;
	struct iovec iov;
Michael Graff's avatar
Michael Graff committed
839

840
	(void)me;
Michael Graff's avatar
Michael Graff committed
841

842
843
	INSIST(ev->type == ISC_SOCKEVENT_INTR);

844
	sock = ev->sender;
845
	INSIST(VALID_SOCKET(sock));
Michael Graff's avatar
Michael Graff committed
846

847
	LOCK(&sock->lock);
848
849
850
	XTRACE(TRACE_SEND,
	       ("internal_recv: task %p got event %p, sock %p, fd %d\n",
		me, ev, sock, sock->fd));
851

852
853
854
855
856
857
858
859
860
861
	INSIST(sock->pending_recv == 1);
	sock->pending_recv = 0;

	INSIST(sock->references > 0);
	sock->references--;  /* the internal event is done with this socket */
	if (sock->references == 0) {
		UNLOCK(&sock->lock);
		destroy(&sock);
		return;
	}
Michael Graff's avatar
Michael Graff committed
862

Michael Graff's avatar
Michael Graff committed
863
864
865
866
867
868
	/*
	 * Try to do as much I/O as possible on this socket.  There are no
	 * limits here, currently.  If some sort of quantum read count is
	 * desired before giving up control, make certain to process markers
	 * regardless of quantum.
	 */
869
870
	dev = ISC_LIST_HEAD(sock->recv_list);
	while (dev != NULL) {
871
		task = dev->sender;
Michael Graff's avatar
Michael Graff committed
872

Michael Graff's avatar
Michael Graff committed
873
874
875
876
		/*
		 * If this is a marker event, post its completion and
		 * continue the loop.
		 */
877
878
879
		if (dev->type == ISC_SOCKEVENT_RECVMARK) {
			send_recvdone_event(sock, &task, &dev,
					    sock->recv_result, 1);
Michael Graff's avatar
Michael Graff committed
880
			goto next;
Michael Graff's avatar
Michael Graff committed
881
882
		}

Michael Graff's avatar
Michael Graff committed
883
884
885
886
		/*
		 * It must be a read request.  Try to satisfy it as best
		 * we can.
		 */
Michael Graff's avatar
Michael Graff committed
887
		read_count = dev->region.length - dev->n;
Michael Graff's avatar
Michael Graff committed
888
889
890
891
		iov.iov_base = dev->region.base + dev->n;
		iov.iov_len = read_count;

		memset(&msghdr, 0, sizeof (msghdr));
892
		if (sock->type == isc_sockettype_udp) {
Michael Graff's avatar
Michael Graff committed
893
894
			memset(&dev->address, 0, sizeof(dev->address));
			msghdr.msg_name = (void *)&dev->address.type.sa;
Michael Graff's avatar
Michael Graff committed
895
			msghdr.msg_namelen = sizeof (dev->address.type);
Michael Graff's avatar
Michael Graff committed
896
		} else {
Michael Graff's avatar
Michael Graff committed
897
898
			msghdr.msg_name = NULL;
			msghdr.msg_namelen = 0;
899
			dev->address = sock->address;
900
		}
Michael Graff's avatar
Michael Graff committed
901
902
903
904
905
906
907
908
909
		msghdr.msg_iov = &iov;
		msghdr.msg_iovlen = 1;
		msghdr.msg_control = NULL;
		msghdr.msg_controllen = 0;
		msghdr.msg_flags = 0;

		cc = recvmsg(sock->fd, &msghdr, 0);
		if (sock->type == isc_sockettype_udp)
			dev->address.length = msghdr.msg_namelen;
910

Michael Graff's avatar
Michael Graff committed
911
		XTRACE(TRACE_RECV,
Michael Graff's avatar
Michael Graff committed
912
913
914
		       ("internal_recv: recvmsg(%d) %d bytes, err %d/%s, from %s\n",
			sock->fd, cc, errno, strerror(errno),
			inet_ntoa(dev->address.type.sin.sin_addr)));
Michael Graff's avatar
Michael Graff committed
915
916
917
918
919

		/*
		 * Check for an error, or for a soft error meaning the read
		 * would block.
		 */
		if (cc < 0) {
Michael Graff's avatar
Michael Graff committed
920
			if (SOFT_ERROR(errno))
Michael Graff's avatar
Michael Graff committed
921
				goto poke;
Michael Graff's avatar
Michael Graff committed
922
923
924
925

#define SOFT_OR_HARD(_system, _isc) \
	if (errno == _system) { \
		if (sock->connected) { \
926
			if (sock->type == isc_sockettype_tcp) \
Michael Graff's avatar
Michael Graff committed
927
				sock->recv_result = _isc; \
928
			send_recvdone_event(sock, &task, &dev, _isc, 1); \
Michael Graff's avatar
Michael Graff committed
929
930
931
932
933
934
935
936
937
938
939
940
941
		} \
		goto next; \
	}

			SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
			SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
			SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
#undef SOFT_OR_HARD

			/*
			 * This might not be a permanent error.
			 */
			if (errno == ENOBUFS) {
942
943
				send_recvdone_event(sock, &task, &dev,
						    ISC_R_NORESOURCES, 1);
Michael Graff's avatar
Michael Graff committed
944
945
946
				goto next;
			}

Michael Graff's avatar
Michael Graff committed
947
			UNEXPECTED_ERROR(__FILE__, __LINE__,
948
949
					 "internal read: %s", strerror(errno));

Michael Graff's avatar
Michael Graff committed
950
			sock->recv_result = ISC_R_UNEXPECTED;
951
952
			send_recvdone_event(sock, &task, &dev,
					    ISC_R_UNEXPECTED, 1);
Michael Graff's avatar
Michael Graff committed
953
			goto next;
Michael Graff's avatar
Michael Graff committed
954
		}
Michael Graff's avatar
Michael Graff committed
955

Michael Graff's avatar
Michael Graff committed
956
957
958
		/*
		 * read of 0 means the remote end was closed.  Run through
		 * the event queue and dispatch all the events with an EOF
Michael Graff's avatar
Michael Graff committed
959
960
		 * result code.  This will set the EOF flag in markers as
		 * well, but that's really ok.
Michael Graff's avatar
Michael Graff committed
961
		 */
962
		if ((sock->type == isc_sockettype_tcp) && (cc == 0)) {
963
			sock->recv_result = ISC_R_EOF;
Michael Graff's avatar
Michael Graff committed
964
			do {
965
966
967
968
				send_recvdone_event(sock, &task, &dev,
						    ISC_R_EOF, 1);
				dev = ISC_LIST_HEAD(sock->recv_list);
			} while (dev != NULL);
Michael Graff's avatar
Michael Graff committed
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
			goto poke;
		}

		/*
		 * We read less than we asked for: add the bytes received
		 * to the running count, then either complete the request
		 * or poke the watcher to wait for more data.
		 */
		if ((size_t)cc < read_count) {
			dev->n += cc;

			/*
			 * If partial reads are allowed, we return whatever
			 * was read with a success result, and continue
			 * the loop.
			 */
984
			if (dev->minimum <= dev->n) {
985
986
				send_recvdone_event(sock, &task, &dev,
						    ISC_R_SUCCESS, 1);
Michael Graff's avatar
Michael Graff committed
987
				goto next;
Michael Graff's avatar
Michael Graff committed
988
989
990
991
992
993
994
995
996
			}

			/*
			 * Partials not ok.  Exit the loop and notify the
			 * watcher to wait for more reads
			 */
			goto poke;
		}

997
		/*
Michael Graff's avatar
Michael Graff committed
998
999
		 * Exactly what we wanted to read.  We're done with this
		 * entry.  Post its completion event.
1000
		 */
1001
1002
		if ((size_t)cc == read_count) {
			dev->n += read_count;
1003
1004
			send_recvdone_event(sock, &task, &dev,
					    ISC_R_SUCCESS, 1);
1005
		}
Michael Graff's avatar
Michael Graff committed
1006

Michael Graff's avatar
Michael Graff committed
1007
	next: