socket.c 95.4 KB
Newer Older
Danny Mayer's avatar
Danny Mayer committed
1
/*
Mark Andrews's avatar
Mark Andrews committed
2
 * Copyright (C) 2004-2007  Internet Systems Consortium, Inc. ("ISC")
Mark Andrews's avatar
Mark Andrews committed
3
 * Copyright (C) 2000-2003  Internet Software Consortium.
Danny Mayer's avatar
Danny Mayer committed
4
5
6
7
8
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
Mark Andrews's avatar
Mark Andrews committed
9
10
11
12
13
14
15
 * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
 * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
 * PERFORMANCE OF THIS SOFTWARE.
Danny Mayer's avatar
Danny Mayer committed
16
17
 */

18
/* $Id: socket.c,v 1.49 2007/03/06 01:50:48 marka Exp $ */
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51

/* This code has been rewritten to take advantage of Windows Sockets
 * I/O Completion Ports and Events. I/O Completion Ports is ONLY
 * available on Windows NT, Windows 2000 and Windows XP series of
 * the Windows Operating Systems. It CANNOT run on Windows 95, Windows 98
 * or the follow-ons to those Systems.
 *
 * This code is by nature multithreaded and takes advantage of various
 * features to pass on information through the completion port for
 * when I/O is completed.  All sends and receives are completed through
 * the completion port. Due to an implementation bug in Windows 2000,
 * Service Pack 2 must be installed on the system for this code to run correctly.
 * For details on this problem see Knowledge base article Q263823.
 * The code checks for this. The number of Completion Port Worker threads
 * used is the total number of CPU's + 1. This increases the likelihood that
 * a Worker Thread is available for processing a completed request.
 *
 * All accepts and connects are accomplished through the WSAEventSelect()
 * function and the event_wait loop. Events are added to and deleted from
 * each event_wait thread via a common event_update stack owned by the socket
 * manager. If the event_wait thread runs out of array space in the events
 * array it will look for another event_wait thread to add the event. If it
 * fails to find another one it will create a new thread to handle the
 * outstanding event.
 *
 * A future enhancement is to use AcceptEx to take advantage of Overlapped
 * I/O which allows for enhanced performance of TCP connections.
 * This will also reduce the number of events that are waited on by the
 * event_wait threads to just the connect sockets and reduce the number
 * of additional threads required.
 *
 * XXXPDM 5 August, 2002
 */
Danny Mayer's avatar
Danny Mayer committed
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68

#define MAKE_EXTERNAL 1
#include <config.h>

#include <sys/types.h>

#ifndef _WINSOCKAPI_
#define _WINSOCKAPI_   /* Prevent inclusion of winsock.h in windows.h */
#endif

#include <errno.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <io.h>
#include <fcntl.h>
69
#include <process.h>
Danny Mayer's avatar
Danny Mayer committed
70
71
72
73
74
75
76
77
78
79

#include <isc/buffer.h>
#include <isc/bufferlist.h>
#include <isc/condition.h>
#include <isc/list.h>
#include <isc/log.h>
#include <isc/mem.h>
#include <isc/msgs.h>
#include <isc/mutex.h>
#include <isc/net.h>
80
#include <isc/os.h>
Danny Mayer's avatar
Danny Mayer committed
81
82
83
84
#include <isc/platform.h>
#include <isc/print.h>
#include <isc/region.h>
#include <isc/socket.h>
85
86
#include <isc/strerror.h>
#include <isc/syslog.h>
Danny Mayer's avatar
Danny Mayer committed
87
88
89
#include <isc/task.h>
#include <isc/thread.h>
#include <isc/util.h>
90
#include <isc/win32os.h>
Danny Mayer's avatar
Danny Mayer committed
91

92
#include "errno2result.h"
93

94
95
#define ISC_SOCKET_NAMES 1

96
97
98
99
100
101
102
/*
 * Define this macro to control the behavior of connection
 * resets on UDP sockets.  See Microsoft KnowledgeBase Article Q263823
 * for details.
 * NOTE: This requires that Windows 2000 systems install Service Pack 2
 * or later.
 */
103
104
#ifndef SIO_UDP_CONNRESET
#define SIO_UDP_CONNRESET _WSAIOW(IOC_VENDOR,12)
105
106
#endif

Danny Mayer's avatar
Danny Mayer committed
107
108
109
110
111
112
113
114
115
116
117
118
/*
 * Some systems define the socket length argument as an int, some as size_t,
 * some as socklen_t.  This is here so it can be easily changed if needed.
 */
#ifndef ISC_SOCKADDR_LEN_T
#define ISC_SOCKADDR_LEN_T unsigned int
#endif

/*
 * Define what the possible "soft" errors can be.  These are non-fatal returns
 * of various network related functions, like recv() and so on.
 */
119
/* True when (e) equals one of the codes listed below, or is 0. */
#define SOFT_ERROR(e) \
	((e) == WSAEINTR || \
	 (e) == WSAEWOULDBLOCK || \
	 (e) == EWOULDBLOCK || \
	 (e) == EINTR || \
	 (e) == EAGAIN || \
	 (e) == 0)

126
127
128
129
130
131
132
133
134
135
136
137
/*
 * Pending errors are not really errors and should be
 * kept separate
 */
#define PENDING_ERROR(e) ((e) == WSA_IO_PENDING || (e) == 0)

#define DOIO_SUCCESS	  0       /* i/o ok, event sent */
#define DOIO_SOFT	     1       /* i/o ok, soft error, no event sent */
#define DOIO_HARD	     2       /* i/o error, event sent */
#define DOIO_EOF	      3       /* EOF, no event sent */
#define DOIO_PENDING	  4       /* status when i/o is in process */

Danny Mayer's avatar
Danny Mayer committed
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)

/*
 * DLVL(90)  --  Function entry/exit and other tracing.
 * DLVL(70)  --  Socket "correctness" -- including returning of events, etc.
 * DLVL(60)  --  Socket data send/receive
 * DLVL(50)  --  Event tracing, including receiving/sending completion events.
 * DLVL(20)  --  Socket creation/destruction.
 */
#define TRACE_LEVEL		90
#define CORRECTNESS_LEVEL	70
#define IOEVENT_LEVEL		60
#define EVENT_LEVEL		50
#define CREATION_LEVEL		20

#define TRACE		DLVL(TRACE_LEVEL)
#define CORRECTNESS	DLVL(CORRECTNESS_LEVEL)
#define IOEVENT		DLVL(IOEVENT_LEVEL)
#define EVENT		DLVL(EVENT_LEVEL)
#define CREATION	DLVL(CREATION_LEVEL)

typedef isc_event_t intev_t;

161
162
#define SOCKET_MAGIC		ISC_MAGIC('I', 'O', 'i', 'o')
#define VALID_SOCKET(t)		ISC_MAGIC_VALID(t, SOCKET_MAGIC)
Danny Mayer's avatar
Danny Mayer committed
163
164
165
166
167
168
169
170
171
172
173
174
175
176

/*
 * IPv6 control information.  If the socket is an IPv6 socket we want
 * to collect the destination address and interface so the client can
 * set them on outgoing packets.
 */
#ifdef ISC_PLATFORM_HAVEIPV6
#ifndef USE_CMSG
#define USE_CMSG	1
#endif
#endif

/*
 * We really  don't want to try and use these control messages. Win32
177
 * doesn't have this mechanism before XP.
Danny Mayer's avatar
Danny Mayer committed
178
179
180
181
182
183
184
 */
#undef USE_CMSG

/*
 * Message header for recvmsg and sendmsg calls.
 * Used value-result for recvmsg, value only for sendmsg.
 */
185

Danny Mayer's avatar
Danny Mayer committed
186
187
188
189

struct msghdr {
        void	*msg_name;              /* optional address */
        u_int   msg_namelen;            /* size of address */
Danny Mayer's avatar
Danny Mayer committed
190
        WSABUF  *msg_iov;		/* scatter/gather array */
Danny Mayer's avatar
Danny Mayer committed
191
192
193
194
        u_int   msg_iovlen;             /* # elements in msg_iov */
        void	*msg_control;           /* ancillary data, see below */
        u_int   msg_controllen;         /* ancillary data buffer len */
        int     msg_flags;              /* flags on received message */
195
	int	msg_totallen;		/* total length of this message */
Danny Mayer's avatar
Danny Mayer committed
196
} msghdr;
197
	
198
199
200
201
202
/*%
 * The size to raise the receive buffer to.
 */
#define RCVBUFSIZE (32*1024)

203
/*
204
205
 * The number of times a send operation is repeated if the result
 * is WSAEINTR.
206
207
208
 */
#define NRETRIES 10

Danny Mayer's avatar
Danny Mayer committed
209
210
211
212
213
214
struct isc_socket {
	/* Not locked. */
	unsigned int		magic;
	isc_socketmgr_t	       *manager;
	isc_mutex_t		lock;
	isc_sockettype_t	type;
215
216
217
218
219
	OVERLAPPED		overlapped;
	/* Pointers to scatter/gather buffers */
	WSABUF			iov[ISC_SOCKET_MAXSCATTERGATHER];
	WSAEVENT		hEvent;		/* Event Handle */
	long			wait_type;	/* Events to wait on */
220
221
222
	WSAEVENT		hAlert;		/* Alert Event Handle */
	DWORD			evthread_id;	/* Event Thread Id for socket */

Danny Mayer's avatar
Danny Mayer committed
223
224
225
	/* Locked by socket lock. */
	ISC_LINK(isc_socket_t)	link;
	unsigned int		references;
226
	SOCKET			fd;
Danny Mayer's avatar
Danny Mayer committed
227
228
	int			pf;

229
230
231
232
233
#ifdef ISC_SOCKET_NAMES   
	char			name[16];
	void *			tag;
#endif

Danny Mayer's avatar
Danny Mayer committed
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
	ISC_LIST(isc_socketevent_t)		send_list;
	ISC_LIST(isc_socketevent_t)		recv_list;
	ISC_LIST(isc_socket_newconnev_t)	accept_list;
	isc_socket_connev_t		       *connect_ev;

	/*
	 * Internal events.  Posted when a descriptor is readable or
	 * writable.  These are statically allocated and never freed.
	 * They will be set to non-purgable before use.
	 */
	intev_t			readable_ev;
	intev_t			writable_ev;

	isc_sockaddr_t		address;  /* remote address */

249
	unsigned int		pending_close : 1,
Danny Mayer's avatar
Danny Mayer committed
250
				pending_accept : 1,
251
252
				iocp : 1,	/* I/O Completion Port */
				listener : 1,	/* listener socket */
Danny Mayer's avatar
Danny Mayer committed
253
254
				connected : 1,
				connecting : 1, /* connect pending */
255
256
257
258
				bound : 1,	/* bound to local addr */
				pending_free: 1;
	unsigned int		pending_recv;
	unsigned int		pending_send;
Danny Mayer's avatar
Danny Mayer committed
259
260
};

261
262
263
264
265
266
267
268
/*
 * I/O Completion ports Info structures
 */

static HANDLE hHeapHandle = NULL;
static int iocp_total = 0;
typedef struct IoCompletionInfo {
	OVERLAPPED			overlapped;
269
	isc_socketevent_t		*dev;
270
	int				request_type;
271
	struct msghdr			messagehdr;
272
273
274
275
} IoCompletionInfo;

/*
 * Define a maximum number of I/O Completion Port worker threads
276
277
 * to handle the load on the Completion Port. The actual number
 * used is the number of CPU's + 1.
278
279
280
281
282
283
284
285
286
287
288
 */
#define MAX_IOCPTHREADS 20

/*
 * event_change structure to handle adds and deletes from the list of
 * events in the Wait
 */
typedef struct event_change event_change_t;

struct event_change {
	isc_socket_t			*sock;
289
290
	WSAEVENT			hEvent;
	DWORD				evthread_id;
291
292
293
294
295
296
297
	SOCKET				fd;
	unsigned int			action;
	ISC_LINK(event_change_t)	link;
};

/*
 * Note: We are using an array here since *WaitForMultiple* wants an array
298
 * WARNING: This value may not be greater than 64 since the
299
 * WSAWaitForMultipleEvents function is limited to 64 events.
300
301
302
303
304
305
306
307
308
309
310
311
312
313
 */

#define MAX_EVENTS 64

/*
 * List of events being waited on and their associated sockets
 */
/*
 * Parallel arrays: aEventList[i] is the event being waited on for
 * aSockList[i].  socket_event_minit() puts an event with no socket in
 * slot 0, so socket entries start at index 1.
 */
typedef struct sock_event_list {
	int		max_event;
	int		total_events;
	isc_socket_t	*aSockList[MAX_EVENTS];
	WSAEVENT	aEventList[MAX_EVENTS];
} sock_event_list;

314
315
316
317
318
319
320
321
322
323
324
325
326
/*
 * Thread Event structure for managing the threads handling events
 */
typedef struct events_thread events_thread_t;

/*
 * Bookkeeping for one event_wait thread; instances are linked on the
 * manager's ev_threads list.
 */
struct events_thread {
	isc_thread_t			thread_handle;	/* Thread's handle */
	DWORD				thread_id;	/* Thread's id */
	sock_event_list			sockev_list;
	isc_socketmgr_t			*manager;
	ISC_LINK(events_thread_t)	link;
};

327
328
329
#define SOCKET_MANAGER_MAGIC	ISC_MAGIC('I', 'O', 'm', 'g')
#define VALID_MANAGER(m)	ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)

Danny Mayer's avatar
Danny Mayer committed
330
331
struct isc_socketmgr {
	/* Not locked. */
332
333
334
	unsigned int			magic;
	isc_mem_t		       *mctx;
	isc_mutex_t			lock;
Danny Mayer's avatar
Danny Mayer committed
335
	/* Locked by manager lock. */
336
337
338
339
340
341
342
343
344
345
346
	ISC_LIST(event_change_t)	event_updates;
	ISC_LIST(isc_socket_t)		socklist;
	int				event_written;
	WSAEVENT			prime_alert;
	isc_boolean_t			bShutdown;
	ISC_LIST(events_thread_t)	ev_threads;
	isc_condition_t			shutdown_ok;
	HANDLE				hIoCompletionPort;
	int				maxIOCPThreads;
	HANDLE				hIOCPThreads[MAX_IOCPTHREADS];
	DWORD				dwIOCPThreadIds[MAX_IOCPTHREADS];
Danny Mayer's avatar
Danny Mayer committed
347
348
349
350
351
352
};

/*
 * send() and recv() iovec counts
 */
#define MAXSCATTERGATHER_SEND	(ISC_SOCKET_MAXSCATTERGATHER)
353
#define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER)
Danny Mayer's avatar
Danny Mayer committed
354

355
static isc_threadresult_t WINAPI event_wait(void *uap);
356
static isc_threadresult_t WINAPI SocketIoThread(LPVOID ThreadContext);
Danny Mayer's avatar
Danny Mayer committed
357
358
static void free_socket(isc_socket_t **);

359
360
361
362
363
364
365
366
367
/* I/O request kinds. */
enum {
	SOCKET_RECV = 0,
	SOCKET_SEND = 1
};

/* Values for event_change.action. */
enum {
	EVENT_ADD = 0,
	EVENT_DELETE = 1
};
Danny Mayer's avatar
Danny Mayer committed
368

369
370
#if defined(ISC_SOCKET_DEBUG)
/*
371
 * This is used to dump the contents of the sock structure
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
 * You should make sure that the sock is locked before
 * dumping it. Since the code uses simple printf() statements
 * it should only be used interactively.
 */
/*
 * Debug helper: print the socket's peer and local addresses, its state
 * flags, and the event pointers queued on its recv, send and accept
 * lists, all to stdout via printf().
 */
void
sock_dump(isc_socket_t *sock) {
	isc_socketevent_t *ldev;
	isc_socket_newconnev_t *ndev;
	isc_sockaddr_t addr;
	char socktext[256];


	isc_socket_getpeername(sock, &addr);
	isc_sockaddr_format(&addr, socktext, sizeof(socktext));
	printf("Remote Socket: %s\n", socktext);
	isc_socket_getsockname(sock, &addr);
	isc_sockaddr_format(&addr, socktext, sizeof(socktext));
	printf("This Socket: %s\n", socktext);

	printf("\n\t\tSock Dump\n");
	/* SOCKET is wider than unsigned int on 64-bit; cast to match %u. */
	printf("\t\tfd: %u\n", (unsigned int) sock->fd);
	printf("\t\treferences: %d\n", sock->references);
	printf("\t\tpending_accept: %d\n", sock->pending_accept);
	printf("\t\tpending_close: %d\n", sock->pending_close);
	printf("\t\tconnecting: %d\n", sock->connecting);
	printf("\t\tconnected: %d\n", sock->connected);
	printf("\t\tbound: %d\n", sock->bound);
	printf("\t\tiocp: %d\n", sock->iocp);
	printf("\t\tsocket type: %d\n", sock->type);

	printf("\n\t\tSock Recv List\n");
	ldev = ISC_LIST_HEAD(sock->recv_list);
	while (ldev != NULL) {
		printf("\t\tdev: %p\n", (void *) ldev);
		ldev = ISC_LIST_NEXT(ldev, ev_link);
	}
	printf("\n\t\tSock Send List\n");
	ldev = ISC_LIST_HEAD(sock->send_list);
	while (ldev != NULL) {
		printf("\t\tdev: %p\n", (void *) ldev);
		ldev = ISC_LIST_NEXT(ldev, ev_link);
	}
	printf("\n\t\tSock Accept List\n");
	ndev = ISC_LIST_HEAD(sock->accept_list);
	while (ndev != NULL) {
		/* Print the accept entry itself (was printing ldev). */
		printf("\t\tdev: %p\n", (void *) ndev);
		ndev = ISC_LIST_NEXT(ndev, ev_link);
	}
}
#endif

/*  This function will add an entry to the I/O completion port
 *  that will signal the I/O thread to exit (gracefully)
 */
static void
signal_iocompletionport_exit(isc_socketmgr_t *manager) {
	char strbuf[ISC_STRERRORSIZE];
	int errval;
	int n;

	REQUIRE(VALID_MANAGER(manager));

	/*
	 * Post one all-zero completion packet per worker thread; a
	 * failed post is fatal.
	 */
	n = 0;
	while (n < manager->maxIOCPThreads) {
		n++;
		if (PostQueuedCompletionStatus(manager->hIoCompletionPort,
					       0, 0, 0))
			continue;

		errval = GetLastError();
		isc__strerror(errval, strbuf, sizeof(strbuf));
		FATAL_ERROR(__FILE__, __LINE__,
			isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
			ISC_MSG_FAILED,
			"Can't request service thread to exit: %s"),
			strbuf);
	}
}

/*
 * Create the worker threads for the I/O Completion Port
 */
/*
 * Start 'total_threads' worker threads running SocketIoThread, with the
 * manager as their argument.  Handles and thread ids are stored in
 * manager->hIOCPThreads[] and manager->dwIOCPThreadIds[].  Failure to
 * create any thread is fatal.
 */
void
iocompletionport_createthreads(int total_threads, isc_socketmgr_t *manager) {
	int errval;
	char strbuf[ISC_STRERRORSIZE];
	int i;

	INSIST(total_threads > 0);
	/* Both per-thread arrays hold exactly MAX_IOCPTHREADS entries. */
	INSIST(total_threads <= MAX_IOCPTHREADS);
	REQUIRE(VALID_MANAGER(manager));
	/*
	 * We need at least one
	 */
	for (i = 0; i < total_threads; i++) {
		manager->hIOCPThreads[i] = CreateThread(NULL, 0, SocketIoThread,
						manager, 0,
						&manager->dwIOCPThreadIds[i]);
		if (manager->hIOCPThreads[i] == NULL) {
			errval = GetLastError();
			isc__strerror(errval, strbuf, sizeof(strbuf));
			FATAL_ERROR(__FILE__, __LINE__,
				isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
				ISC_MSG_FAILED,
				"Can't create IOCP thread: %s"),
				strbuf);
		}
	}
}

477
/*
478
 *  Create/initialise the I/O completion port
479
 */
480
481
482
483
/*
 * Create the private heap, the I/O completion port and its worker
 * threads for this manager.  Any failure is fatal.
 */
void
iocompletionport_init(isc_socketmgr_t *manager) {
	int errval;
	char strbuf[ISC_STRERRORSIZE];

	REQUIRE(VALID_MANAGER(manager));
	/*
	 * Create a private heap to handle the socket overlapped structure
	 * The minimum number of structures is 10, there is no maximum
	 */
	hHeapHandle = HeapCreate(0, 10*sizeof(IoCompletionInfo), 0);
	if (hHeapHandle == NULL) {
		/*
		 * Later allocations from this heap (for example in
		 * notify_eventlist()) cannot work without it.
		 */
		errval = GetLastError();
		isc__strerror(errval, strbuf, sizeof(strbuf));
		FATAL_ERROR(__FILE__, __LINE__,
				isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
				ISC_MSG_FAILED,
				"HeapCreate() failed "
				"during initialization: %s"),
				strbuf);
		exit(1);
	}

	/* Number of CPUs + 1, capped at the size of the thread arrays. */
	manager->maxIOCPThreads = min(isc_os_ncpus() + 1,
					MAX_IOCPTHREADS);

	/* Now Create the Completion Port */
	manager->hIoCompletionPort = CreateIoCompletionPort(
				     INVALID_HANDLE_VALUE, NULL,
				     0, manager->maxIOCPThreads);
	if (manager->hIoCompletionPort == NULL) {
		errval = GetLastError();
		isc__strerror(errval, strbuf, sizeof(strbuf));
		FATAL_ERROR(__FILE__, __LINE__,
				isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
				ISC_MSG_FAILED,
				"CreateIoCompletionPort() failed "
				"during initialization: %s"),
				strbuf);
		exit(1);
	}

	/*
	 * Worker threads for servicing the I/O
	 */
	iocompletionport_createthreads(manager->maxIOCPThreads, manager);
}
515

516

517
518
/*
 * Ask every completion-port service thread to exit.  Does nothing when
 * the manager has no completion port.
 */
void
iocompletionport_exit(isc_socketmgr_t *manager) {
	REQUIRE(VALID_MANAGER(manager));

	if (manager->hIoCompletionPort == NULL)
		return;

	signal_iocompletionport_exit(manager);
}
527

528
/*
529
530
 * Add sockets in here and pass the sock data in as part of the
 * information needed.
531
 */
532
void
533
534
iocompletionport_update(isc_socket_t *sock) {
	HANDLE hiocp;
535
536

	REQUIRE(sock != NULL);
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
	if(sock->iocp == 0) {
		sock->iocp = 1;
		hiocp = CreateIoCompletionPort((HANDLE) sock->fd,
			sock->manager->hIoCompletionPort, (DWORD) sock,
			sock->manager->maxIOCPThreads);
		InterlockedIncrement(&iocp_total);

	}
}

/*
 * Reset an event list to empty, then create a fresh event in slot 0
 * and put it in the non-signalled state.
 */
void
socket_event_minit(sock_event_list *evlist) {
	int slot;

	REQUIRE(evlist != NULL);

	/* Initialize the Event List */
	evlist->max_event = 0;
	evlist->total_events = 0;
	slot = 0;
	while (slot < MAX_EVENTS) {
		evlist->aSockList[slot] = NULL;
		evlist->aEventList[slot] = (WSAEVENT) 0;
		slot++;
	}

	/* Slot 0 counts towards max_event but not total_events. */
	evlist->aEventList[0] = WSACreateEvent();
	evlist->max_event += 1;
	(void) WSAResetEvent(evlist->aEventList[0]);
}
/*
 * Event Thread Initialization
 */
/*
 * Allocate an events_thread_t, link it on manager->ev_threads and start
 * an event_wait thread for it.
 *
 * On success *evthreadp is set to the new entry and ISC_R_SUCCESS is
 * returned.  Returns ISC_R_NOMEMORY if the allocation fails, and
 * ISC_R_UNEXPECTED if the thread cannot be started; in that case the
 * entry is unlinked and released, and *evthreadp is left untouched.
 */
isc_result_t
event_thread_create(events_thread_t **evthreadp, isc_socketmgr_t *manager) {
	events_thread_t *evthread;

	REQUIRE(VALID_MANAGER(manager));
	REQUIRE(evthreadp != NULL && *evthreadp == NULL);

	evthread = isc_mem_get(manager->mctx, sizeof(*evthread));
	if (evthread == NULL)
		return (ISC_R_NOMEMORY);
	socket_event_minit(&evthread->sockev_list);
	ISC_LINK_INIT(evthread, link);
	evthread->manager = manager;
	/*
	 * locate_available_thread() compares thread_id, so give it a
	 * defined value before the entry becomes visible on the list.
	 * NOTE(review): presumably event_wait() stores the real id once
	 * it is running -- confirm.
	 */
	evthread->thread_id = 0;

	ISC_LIST_APPEND(manager->ev_threads, evthread, link);

	/*
	 * Start up the event wait thread.
	 */
	if (isc_thread_create(event_wait, evthread, &evthread->thread_handle) !=
	    ISC_R_SUCCESS) {
		/*
		 * Take the entry back off the list before freeing it so
		 * the list never points at released memory, and close the
		 * event created by socket_event_minit().
		 */
		ISC_LIST_UNLINK(manager->ev_threads, evthread, link);
		if (evthread->sockev_list.aEventList[0] != WSA_INVALID_EVENT)
			(void) WSACloseEvent(evthread->sockev_list.aEventList[0]);
		isc_mem_put(manager->mctx, evthread, sizeof(*evthread));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "isc_thread_create() %s",
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"));
		return (ISC_R_UNEXPECTED);
	}
	*evthreadp = evthread;
	return (ISC_R_SUCCESS);
}
/*
598
599
600
 * Locate a thread with space for additional events or create one if
 * necessary. The manager is locked at this point so the information
 * cannot be changed by another thread while we are searching.
601
 */
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
/*
 * Wake an event_wait thread, other than the calling one, that still
 * has room in its event array, so that it processes the event change
 * list.  If there is none, try to create a new thread and wake that
 * instead.
 */
void
locate_available_thread(isc_socketmgr_t *manager) {
	const DWORD self = GetCurrentThreadId();
	events_thread_t *candidate;

	for (candidate = ISC_LIST_HEAD(manager->ev_threads);
	     candidate != NULL;
	     candidate = ISC_LIST_NEXT(candidate, link))
	{
		if (candidate->thread_id == self)
			continue;
		if (candidate->sockev_list.max_event >= MAX_EVENTS)
			continue;

		/* Found one with space: alert it and stop looking. */
		WSASetEvent(candidate->sockev_list.aEventList[0]);
		return;
	}

	/*
	 * Every other thread is full, so try to start a new one
	 * ('candidate' is NULL here, as event_thread_create() requires).
	 * If that fails the change simply stays queued; the next
	 * event_wait thread to process the list will retry, and will
	 * come back here if it has no space either.
	 */
	if (event_thread_create(&candidate, manager) != ISC_R_SUCCESS)
		return;

	WSASetEvent(candidate->sockev_list.aEventList[0]);
}

/*
 * Append the socket named by 'evchange' to this thread's event list.
 * Returns ISC_TRUE when the socket was added.  When the list is full,
 * another thread is asked to take the event and ISC_FALSE is returned.
 */
isc_boolean_t
socket_eventlist_add(event_change_t *evchange, sock_event_list *evlist,
		     isc_socketmgr_t *manager) {
	isc_socket_t *sock;
	int slot;

	REQUIRE(evchange != NULL);

	sock = evchange->sock;
	REQUIRE(sock != NULL);
	REQUIRE(sock->hEvent != NULL);
	REQUIRE(evlist != NULL);

	slot = evlist->max_event;
	if (slot >= MAX_EVENTS) {
		locate_available_thread(manager);
		return (ISC_FALSE);
	}

	evlist->aEventList[slot] = sock->hEvent;
	evlist->aSockList[slot] = sock;
	evlist->max_event += 1;
	evlist->total_events += 1;

	/* Record this list's slot-0 event and the owning thread. */
	sock->hAlert = evlist->aEventList[0];
	sock->evthread_id = GetCurrentThreadId();
	return (ISC_TRUE);
}
661

662
/*
663
664
 * Note that the eventLock is locked before calling this function.
 * All Events and associated sockets are closed here.
665
 */
666
667
/*
 * Remove the event named by 'evchange' from this thread's event list,
 * close the event and, if a close is pending, the socket too.  Returns
 * ISC_FALSE when the change belongs to a different thread or the event
 * is not on this list.
 */
isc_boolean_t
socket_eventlist_delete(event_change_t *evchange, sock_event_list *evlist) {
	isc_boolean_t release_sock = ISC_FALSE;
	WSAEVENT target;
	int found = -1;
	int last;
	int idx;

	REQUIRE(evchange != NULL);
	/*  Make sure this is the right thread from which to delete the event */
	if (evchange->evthread_id != GetCurrentThreadId())
		return (ISC_FALSE);

	REQUIRE(evlist != NULL);
	REQUIRE(evchange->hEvent != NULL);
	target = evchange->hEvent;

	/* Socket events start at slot 1. */
	for (idx = 1; idx < evlist->max_event; idx++) {
		if (evlist->aEventList[idx] != target)
			continue;
		found = idx;
		break;
	}
	if (found < 1)
		return (ISC_FALSE);

	/* Close the gap by moving the later entries down one slot. */
	last = evlist->max_event - 1;
	for (idx = found; idx < last; idx++) {
		evlist->aEventList[idx] = evlist->aEventList[idx + 1];
		evlist->aSockList[idx] = evlist->aSockList[idx + 1];
	}
	evlist->aEventList[last] = 0;
	evlist->aSockList[last] = NULL;

	/* Cleanup */
	WSACloseEvent(target);

	LOCK(&evchange->sock->lock);
	if (evchange->sock->pending_close) {
		evchange->sock->pending_close = 0;
		closesocket(evchange->fd);
	}
	/* Free only when a free is pending and no I/O is outstanding. */
	if (evchange->sock->pending_recv == 0 &&
	    evchange->sock->pending_send == 0 &&
	    evchange->sock->pending_free)
	{
		evchange->sock->pending_free = 0;
		release_sock = ISC_TRUE;
	}
	UNLOCK(&evchange->sock->lock);

	if (release_sock)
		free_socket(&evchange->sock);

	evlist->max_event -= 1;
	evlist->total_events -= 1;

	return (ISC_TRUE);
}
725

726
727
/*
 * Get the event changes off of the list and apply the
728
729
730
731
732
733
734
735
736
737
738
 * requested changes. The manager lock is taken out at
 * the start of this function to prevent other event_wait
 * threads processing the same information at the same
 * time. The queue may not be empty on exit since other
 * threads may be involved in processing the queue.
 *
 * The deletes are done first in order that there be space
 * available for the events being added in the same thread
 * in case the event list is almost full. This reduces the
 * probability of having to create another thread which would
 * increase overhead costs.
739
740
741
742
 */
/*
 * Make one pass over the manager's event_updates queue, applying every
 * entry whose action equals 'action' to this thread's event list.
 * Entries that were applied are dequeued and released; all others stay
 * queued.  The caller holds the manager lock.
 */
static void
eventlist_apply_pass(sock_event_list *evlist, isc_socketmgr_t *manager,
		     unsigned int action)
{
	event_change_t *cur;
	event_change_t *following;
	isc_boolean_t applied;

	for (cur = ISC_LIST_HEAD(manager->event_updates);
	     cur != NULL;
	     cur = following)
	{
		/* Fetch the successor first: 'cur' may be freed below. */
		following = ISC_LIST_NEXT(cur, link);
		if (cur->action != action)
			continue;

		if (action == EVENT_DELETE)
			applied = socket_eventlist_delete(cur, evlist);
		else
			applied = socket_eventlist_add(cur, evlist, manager);

		/* Drop the entry only if this thread's list was updated. */
		if (!applied)
			continue;
		ISC_LIST_DEQUEUE(manager->event_updates, cur, link);
		HeapFree(hHeapHandle, 0, cur);
		manager->event_written--;
	}
}

/*
 * Apply the queued event changes to this thread's event list while
 * holding the manager lock.  Deletes go first so that the slots they
 * free are available to adds processed in the same call.  The queue
 * may be non-empty on return.  Always returns ISC_R_SUCCESS.
 */
isc_result_t
process_eventlist(sock_event_list *evlist, isc_socketmgr_t *manager) {
	REQUIRE(evlist != NULL);

	LOCK(&manager->lock);
	eventlist_apply_pass(evlist, manager, EVENT_DELETE);
	eventlist_apply_pass(evlist, manager, EVENT_ADD);
	UNLOCK(&manager->lock);

	return (ISC_R_SUCCESS);
}
800

801
802
803
804
805
/*
 * Add the event list changes to the queue and notify the
 * event loop
 */
static void
806
notify_eventlist(isc_socket_t *sock, isc_socketmgr_t *manager,
807
808
		 unsigned int action)
{
809
810

	event_change_t *evchange;
811
812
813

	REQUIRE(VALID_MANAGER(manager));
	REQUIRE(sock != NULL);
814

815
816
817
818
	evchange = HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY,
			     sizeof(event_change_t));
	evchange->sock = sock;
	evchange->action = action;
819
	evchange->hEvent = sock->hEvent;
820
	evchange->fd = sock->fd;
821
	evchange->evthread_id = sock->evthread_id;
822

823
824
	LOCK(&manager->lock);
	ISC_LIST_APPEND(manager->event_updates, evchange, link);
825
	sock->manager->event_written++;
826
827
828
829
830
831
832
	UNLOCK(&manager->lock);

	/* Alert the Wait List */
	if (sock->hAlert != NULL)
		WSASetEvent(sock->hAlert);
	else
		WSASetEvent(manager->prime_alert);
833
}
834

835
836
837
838
839
840
841
/*
 * Note that the socket is already locked before calling this function
 */
/*
 * Create an event for the socket, select the network events in 'type'
 * on it, and queue an EVENT_ADD request so an event_wait thread starts
 * waiting on it.
 *
 * Returns ISC_R_SUCCESS, or ISC_R_UNEXPECTED if the event cannot be
 * created or selected; on failure the socket is left unchanged.
 */
isc_result_t
socket_event_add(isc_socket_t *sock, long type) {
	int stat;
	WSAEVENT hEvent;
	char strbuf[ISC_STRERRORSIZE];
	const char *msg;

	REQUIRE(sock != NULL);

	hEvent = WSACreateEvent();
	if (hEvent == WSA_INVALID_EVENT) {
		stat = WSAGetLastError();
		isc__strerror(stat, strbuf, sizeof(strbuf));
		msg = isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
				     ISC_MSG_FAILED, "failed");
		UNEXPECTED_ERROR(__FILE__, __LINE__, "WSACreateEvent: %s: %s",
				 msg, strbuf);
		return (ISC_R_UNEXPECTED);
	}
	if (WSAEventSelect(sock->fd, hEvent, type) != 0) {
		/* Capture the error before any further Winsock call. */
		stat = WSAGetLastError();
		isc__strerror(stat, strbuf, sizeof(strbuf));
		msg = isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
				     ISC_MSG_FAILED, "failed");
		UNEXPECTED_ERROR(__FILE__, __LINE__, "WSAEventSelect: %s: %s",
				 msg, strbuf);
		/* The event was never handed to anyone; don't leak it. */
		(void) WSACloseEvent(hEvent);
		return (ISC_R_UNEXPECTED);
	}
	sock->hEvent = hEvent;

	sock->wait_type = type;
	notify_eventlist(sock, sock->manager, EVENT_ADD);
	return (ISC_R_SUCCESS);
}
872

Danny Mayer's avatar
Danny Mayer committed
873
/*
874
 * Note that the socket is not locked before calling this function
Danny Mayer's avatar
Danny Mayer committed
875
 */
876
877
878
879
880
/*
 * Mark the socket as having a close pending and queue an EVENT_DELETE
 * request for its event, then clear the socket's event bookkeeping.
 */
void
socket_event_delete(isc_socket_t *sock) {
	REQUIRE(sock != NULL);
	REQUIRE(sock->hEvent != NULL);

	sock->wait_type = 0;
	sock->pending_close = 1;

	/*
	 * notify_eventlist() copies hEvent and evthread_id and reads
	 * hAlert, so the fields may only be cleared after it returns.
	 */
	notify_eventlist(sock, sock->manager, EVENT_DELETE);

	sock->hEvent = NULL;
	sock->hAlert = NULL;
	sock->evthread_id = 0;
}
889

890
891
892
893
894
895
896
897
898
/*
 * Routine to cleanup and then close the socket.
 * Only close the socket here if it is NOT associated
 * with an event, otherwise the WSAWaitForMultipleEvents
 * may fail due to the fact that the Wait should not
 * be running while closing an event or a socket.
 */
void
socket_close(isc_socket_t *sock) {
	REQUIRE(sock != NULL);

	sock->pending_close = 0;

	/*
	 * With no event attached the descriptor is closed right here;
	 * otherwise the close is handed off through an event delete
	 * request.
	 */
	if (sock->hEvent == NULL)
		closesocket(sock->fd);
	else
		socket_event_delete(sock);

	if (!sock->iocp)
		return;

	sock->iocp = 0;
	InterlockedDecrement(&iocp_total);
}
913

Danny Mayer's avatar
Danny Mayer committed
914
915
916
/*
 * Initialize socket services
 */
Andreas Gustafsson's avatar
Andreas Gustafsson committed
917
BOOL InitSockets() {
Danny Mayer's avatar
Danny Mayer committed
918
919
920
921
922
923
	WORD wVersionRequested;
	WSADATA wsaData;
	int err;

	/* Need Winsock 2.0 or better */
	wVersionRequested = MAKEWORD(2, 0);
924

Danny Mayer's avatar
Danny Mayer committed
925
	err = WSAStartup(wVersionRequested, &wsaData);
Andreas Gustafsson's avatar
Andreas Gustafsson committed
926
	if ( err != 0 ) {
Danny Mayer's avatar
Danny Mayer committed
927
		/* Tell the user that we could not find a usable Winsock DLL */
928
		return(FALSE);
Danny Mayer's avatar
Danny Mayer committed
929
	}
930
	return(TRUE);
Danny Mayer's avatar
Danny Mayer committed
931
932
}

Andreas Gustafsson's avatar
Andreas Gustafsson committed
933
int
934
internal_sendmsg(isc_socket_t *sock, IoCompletionInfo *lpo,
935
936
		 struct msghdr *messagehdr, int flags, int *Error)
{
937
	int Result;
Danny Mayer's avatar
Danny Mayer committed
938
939
	DWORD BytesSent;
	DWORD Flags = flags;
940
941
942
	int total_sent;

	*Error = 0;
943
944
945
946
947
	Result = WSASendTo((SOCKET) sock->fd, messagehdr->msg_iov,
			   messagehdr->msg_iovlen, &BytesSent,
			   Flags, messagehdr->msg_name,
			   messagehdr->msg_namelen, (LPOVERLAPPED) lpo,
			   NULL);
948
949

	total_sent = (int) BytesSent;
950

951
952
953
954
	/* Check for errors.*/
	if (Result == SOCKET_ERROR) {

		*Error = WSAGetLastError();
955
956

		switch (*Error) {
957
958
959
960
961
962
		case WSA_IO_INCOMPLETE :
		case WSA_WAIT_IO_COMPLETION :
		case WSA_IO_PENDING :
			sock->pending_send++;
		case NO_ERROR :
			break;
963

964
965
966
967
968
969
970
		default :
			return (-1);
			break;
		}
	} else
		sock->pending_send++;
	if (lpo != NULL)
971
972
973
		return (0);
	else
		return (total_sent);
Danny Mayer's avatar
Danny Mayer committed
974
975
976
}

int
977
internal_recvmsg(isc_socket_t *sock, IoCompletionInfo *lpo,
978
979
		 struct msghdr *messagehdr, int flags, int *Error)
{
980
981
982
	DWORD Flags = 0;
	DWORD NumBytes = 0;
	int total_bytes = 0;
Danny Mayer's avatar
Danny Mayer committed
983
	int Result;
Andreas Gustafsson's avatar
Andreas Gustafsson committed
984

985
986
	*Error = 0;
	Result = WSARecvFrom((SOCKET) sock->fd,
987
988
989
990
991
992
993
994
			     messagehdr->msg_iov,
			     messagehdr->msg_iovlen,
			     &NumBytes,
			     &Flags,
			     messagehdr->msg_name,
			     (int *)&(messagehdr->msg_namelen),
			     (LPOVERLAPPED) lpo,
			     NULL);
Andreas Gustafsson's avatar
Andreas Gustafsson committed
995

996
	total_bytes = (int) NumBytes;
Andreas Gustafsson's avatar
Andreas Gustafsson committed
997
998
999

	/* Check for errors. */
	if (Result == SOCKET_ERROR) {
1000
1001

		*Error = WSAGetLastError();
1002
1003

		switch (*Error) {
1004
1005
1006
1007
1008
1009
		case WSA_IO_INCOMPLETE:
		case WSA_WAIT_IO_COMPLETION:
		case WSA_IO_PENDING:
			sock->pending_recv++;
		case NO_ERROR:
			break;
Danny Mayer's avatar
Danny Mayer committed
1010

1011
1012
1013
1014
1015
1016
		default :
			return (-1);
			break;
		}
	} else
		sock->pending_recv++;
1017
1018

	/* Return the flags received in header */
1019
	messagehdr->msg_flags = Flags;
1020
	if (lpo != NULL)
1021
1022
1023
		return (-1);
	else
		return (total_bytes);
1024
}
Danny Mayer's avatar
Danny Mayer committed
1025
1026

static void
1027
1028
1029
manager_log(isc_socketmgr_t *sockmgr, isc_logcategory_t *category,
	    isc_logmodule_t *module, int level, const char *fmt, ...)
{
Danny Mayer's avatar
Danny Mayer committed
1030
1031
1032
	char msgbuf[2048];
	va_list ap;

1033
	if (!isc_log_wouldlog(isc_lctx, level))
Danny Mayer's avatar
Danny Mayer committed
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
		return;

	va_start(ap, fmt);
	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
	va_end(ap);

	isc_log_write(isc_lctx, category, module, level,
		      "sockmgr %p: %s", sockmgr, msgbuf);
}

1044
1045
1046
1047
1048
static void
socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
	   isc_msgcat_t *msgcat, int msgset, int message,
	   const char *fmt, ...) ISC_FORMAT_PRINTF(9, 10);

/*
 * Format a printf-style message and write it to the ISC log through
 * isc_log_iwrite(), prefixed with the socket pointer and, when 'address'
 * is non-NULL, the formatted address.  Nothing is formatted when
 * isc_log_wouldlog() says 'level' would not be logged.
 */
static void
socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
	   isc_msgcat_t *msgcat, int msgset, int message,
	   const char *fmt, ...)
{
	va_list args;
	char text[2048];
	char peer[256];

	if (! isc_log_wouldlog(isc_lctx, level))
		return;

	va_start(args, fmt);
	vsnprintf(text, sizeof(text), fmt, args);
	va_end(args);

	if (address != NULL) {
		isc_sockaddr_format(address, peer, sizeof(peer));
		isc_log_iwrite(isc_lctx, category, module, level,
			       msgcat, msgset, message,
			       "socket %p %s: %s", sock, peer, text);
		return;
	}

	isc_log_iwrite(isc_lctx, category, module, level,
		       msgcat, msgset, message,
		       "socket %p: %s", sock, text);
}
1078

Danny Mayer's avatar
Danny Mayer committed
1079
/*
1080
 * Make an fd SOCKET non-blocking.
Danny Mayer's avatar
Danny Mayer committed
1081
1082
 */
static isc_result_t
1083
make_nonblock(SOCKET fd) {
Danny Mayer's avatar
Danny Mayer committed
1084
1085
	int ret;
	unsigned long flags = 1;
1086
	char strbuf[ISC_STRERRORSIZE];
Danny Mayer's avatar
Danny Mayer committed
1087
1088

	/* Set the socket to non-blocking */
1089
	ret = ioctlsocket(fd, FIONBIO, &flags);
Danny Mayer's avatar
Danny Mayer committed
1090
1091

	if (ret == -1) {
1092
		isc__strerror(errno, strbuf, sizeof(strbuf));
Danny Mayer's avatar
Danny Mayer committed
1093
1094
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "ioctlsocket(%d, FIOBIO, %d): %s",
1095
				 fd, flags, strbuf);
Danny Mayer's avatar
Danny Mayer committed
1096
1097
1098
1099
1100
1101

		return (ISC_R_UNEXPECTED);
	}

	return (ISC_R_SUCCESS);
}
1102

Danny Mayer's avatar
Danny Mayer committed
1103
/*
1104
1105
1106
1107
1108
1109
1110
1111
 * Windows 2000 systems incorrectly cause UDP sockets using WASRecvFrom
 * to not work correctly, returning a WSACONNRESET error when a WSASendTo
 * fails with an "ICMP port unreachable" response and preventing the
 * socket from using the WSARecvFrom in subsequent operations.
 * The function below fixes this, but requires that Windows 2000
 * Service Pack 2 or later be installed on the system.  NT 4.0
 * systems are not affected by this and work correctly.
 * See Microsoft Knowledge Base Article Q263823 for details of this.
Danny Mayer's avatar
Danny Mayer committed
1112
 */
1113
1114
1115
1116
1117
1118
1119
1120
1121
isc_result_t
connection_reset_fix(SOCKET fd) {
	DWORD dwBytesReturned = 0;
	BOOL  bNewBehavior = FALSE;
	DWORD status;

	if(isc_win32os_majorversion() < 5)
		return (ISC_R_SUCCESS); /*  NT 4.0 has no problem */

1122
	/* disable bad behavior using IOCTL: SIO_UDP_CONNRESET */
1123
1124
1125
1126
1127
	status = WSAIoctl(fd, SIO_UDP_CONNRESET, &bNewBehavior,
			  sizeof(bNewBehavior), NULL, 0,
			  &dwBytesReturned, NULL, NULL);
	if (status != SOCKET_ERROR)
		return (ISC_R_SUCCESS);
1128
1129
1130
1131
1132
	else {
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "WSAIoctl(SIO_UDP_CONNRESET, oldBehaviour) %s",
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"));
1133
		return (ISC_R_UNEXPECTED);
1134
	}
Danny Mayer's avatar
Danny Mayer committed
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
}

/*
 * SEND constructor: fill in 'iov' and attach it to 'msg'.
 *
 * When the event carries a buffer list, the used region of each buffer
 * is sent, minus the dev->n bytes already written.  When the list is
 * empty, the unsent remainder of dev->region is sent as a single vector.
 *
 * For UDP sockets the destination is taken from dev->address; for other
 * socket types no name is attached.  On return msg->msg_totallen holds
 * the number of bytes described by the vectors.  'cmsg' is not referenced
 * by this function.
 */
static void
build_msghdr_send(isc_socket_t *sock, isc_socketevent_t *dev,
		  struct msghdr *msg, char *cmsg, WSABUF *iov)
{
	unsigned int niov = 0;
	size_t total = 0;
	size_t to_skip;
	isc_buffer_t *buf;
	isc_region_t region;

	memset(msg, 0, sizeof(*msg));

	if (sock->type == isc_sockettype_udp) {
		msg->msg_name = (void *)&dev->address.type.sa;
		msg->msg_namelen = dev->address.length;
	} else {
		msg->msg_name = NULL;
		msg->msg_namelen = 0;
	}

	buf = ISC_LIST_HEAD(dev->bufferlist);

	if (buf == NULL) {
		/*
		 * Single buffer I/O.  Start after what has already been
		 * written from this region.
		 */
		total = dev->region.length - dev->n;
		iov[0].buf = (void *)(dev->region.base + dev->n);
		iov[0].len = total;
		niov = 1;
	} else {
		/*
		 * Multibuffer I/O.  Step over the buffers that have been
		 * written completely; 'to_skip' is left holding the offset
		 * into the first buffer that still has unsent data.
		 */
		to_skip = dev->n;
		for (; buf != NULL; buf = ISC_LIST_NEXT(buf, link)) {
			REQUIRE(ISC_BUFFER_VALID(buf));
			if (to_skip < isc_buffer_usedlength(buf))
				break;
			to_skip -= isc_buffer_usedlength(buf);
		}

		/* One vector per remaining non-empty buffer. */
		for (; buf != NULL; buf = ISC_LIST_NEXT(buf, link)) {
			INSIST(niov < MAXSCATTERGATHER_SEND);

			isc_buffer_usedregion(buf, &region);
			if (region.length == 0)
				continue;

			iov[niov].buf = (void *)(region.base + to_skip);
			iov[niov].len = region.length - to_skip;
			total += (region.length - to_skip);
			to_skip = 0;	/* only the first vector is offset */
			niov++;
		}

		INSIST(to_skip == 0);
	}

	msg->msg_iov = iov;
	msg->msg_iovlen = niov;
	msg->msg_totallen = total;
}

/*
 * RECV constructor: fill in 'iov' and attach it to 'msg'.
 *
 * When the event carries a buffer list, the available region of each
 * buffer is offered for reading.  When the list is empty, the unread
 * remainder of dev->region is offered as a single vector.
 *
 * For UDP sockets dev->address is cleared and attached as the place to
 * store the sender's address; otherwise no name is attached and
 * dev->address is copied from sock->address.  On return
 * msg->msg_totallen holds the number of bytes described by the vectors.
 * 'cmsg' is not referenced by this function.
 */
static void
build_msghdr_recv(isc_socket_t *sock, isc_socketevent_t *dev,
		  struct msghdr *msg, char *cmsg, WSABUF *iov)
{
	unsigned int niov;
	size_t total = 0;
	isc_buffer_t *buf;
	isc_region_t space;

	memset(msg, 0, sizeof(struct msghdr));

	if (sock->type == isc_sockettype_udp) {
		memset(&dev->address, 0, sizeof(dev->address));
		msg->msg_name = (void *)&dev->address.type.sa;
		msg->msg_namelen = sizeof(dev->address.type);
	} else { /* TCP */
		msg->msg_name = NULL;
		msg->msg_namelen = 0;
		dev->address = sock->address;
	}

	buf = ISC_LIST_HEAD(dev->bufferlist);

	if (buf == NULL) {
		/*
		 * Single buffer I/O.  Start after what has already been
		 * read into this region.
		 */
		total = dev->region.length - dev->n;
		iov[0].buf = (void *)(dev->region.base + dev->n);
		iov[0].len = total;
		niov = 1;
	} else {
		/*
		 * Multibuffer I/O.  Step over leading buffers that have
		 * no room left.
		 */
		for (; buf != NULL; buf = ISC_LIST_NEXT(buf, link)) {
			REQUIRE(ISC_BUFFER_VALID(buf));
			if (isc_buffer_availablelength(buf) != 0)
				break;
		}

		/* One vector per remaining buffer that has room. */
		niov = 0;
		for (; buf != NULL; buf = ISC_LIST_NEXT(buf, link)) {
			INSIST(niov < MAXSCATTERGATHER_RECV);

			isc_buffer_availableregion(buf, &space);
			if (space.length == 0)
				continue;

			iov[niov].buf = (void *)(space.base);
			iov[niov].len = space.length;
			total += space.length;
			niov++;
		}
	}

	msg->msg_iov = iov;
	msg->msg_iovlen = niov;
	msg->msg_totallen = total;
}

static void
set_dev_address(isc_sockaddr_t *address, isc_socket_t *sock,
1301
1302
		isc_socketevent_t *dev)
{
Danny Mayer's avatar
Danny Mayer committed
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
	if (sock->type == isc_sockettype_udp) {
		if (address != NULL)
			dev->address = *address;
		else
			dev->address = sock->address;
	} else if (sock->type == isc_sockettype_tcp) {
		INSIST(address == NULL);
		dev->address = sock->address;
	}
}

1314
1315
1316
1317
1318
1319
1320
1321
1322
/*
 * Destructor installed on socket events by allocate_socketevent().
 * Insists that the event's buffer list is empty, then calls the
 * destructor saved in the event's 'destroy' member.
 */
static void
destroy_socketevent(isc_event_t *event) {
	isc_socketevent_t *sockev = (isc_socketevent_t *)event;

	INSIST(ISC_LIST_EMPTY(sockev->bufferlist));

	sockev->destroy(event);
}

Danny Mayer's avatar
Danny Mayer committed
1323
1324
static isc_socketevent_t *
allocate_socketevent(isc_socket_t *sock, isc_eventtype_t eventtype,
1325
1326
		     isc_taskaction_t action, const void *arg)
{
Danny Mayer's avatar
Danny Mayer committed
1327
1328
1329
1330
1331
	isc_socketevent_t *ev;

	ev = (isc_socketevent_t *)isc_event_allocate(sock->manager->mctx,
						     sock, eventtype,
						     action, arg,
Mark Andrews's avatar
Mark Andrews committed
1332
						     sizeof(*ev));
Danny Mayer's avatar
Danny Mayer committed
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
	if (ev == NULL)
		return (NULL);

	ev->result = ISC_R_UNEXPECTED;
	ISC_LINK_INIT(ev, ev_link);
	ISC_LIST_INIT(ev->bufferlist);
	ev->region.base = NULL;
	ev->n = 0;
	ev->offset = 0;
	ev->attributes = 0;
1343
1344
	ev->destroy = ev->ev_destroy;
	ev->ev_destroy = destroy_socketevent;
Danny Mayer's avatar
Danny Mayer committed
1345
1346
1347
1348
1349
1350
1351
1352
1353

	return (ev);
}

#if defined(ISC_SOCKET_DEBUG)
/*
 * Debugging aid (compiled only when ISC_SOCKET_DEBUG is defined): print
 * the contents of 'msg' and each of its I/O vectors to stdout.
 *
 * Every argument is converted explicitly to the type its conversion
 * specifier expects, so the calls are well defined whatever the
 * underlying widths of SOCKET, the msghdr length fields and WSABUF.len.
 */
static void
dump_msg(struct msghdr *msg, isc_socket_t *sock) {
	unsigned int i;

	printf("MSGHDR %p, Socket #: %u\n",
	       (void *)msg, (unsigned int)sock->fd);
	printf("\tname %p, namelen %d\n",
	       (void *)msg->msg_name, (int)msg->msg_namelen);
	printf("\tiov %p, iovlen %d\n",
	       (void *)msg->msg_iov, (int)msg->msg_iovlen);
	for (i = 0; i < (unsigned int)msg->msg_iovlen; i++)
		printf("\t\t%u\tbase %p, len %lu\n", i,
		       (void *)msg->msg_iov[i].buf,
		       (unsigned long)msg->msg_iov[i].len);
}
#endif

static int
1365
completeio_recv(isc_socket_t *sock, isc_socketevent_t *dev,
1366
1367
		struct msghdr *messagehdr, int cc, int recv_errno)
{
Danny Mayer's avatar
Danny Mayer committed
1368
1369
1370
1371
	size_t actual_count;
	isc_buffer_t *buffer;

#define SOFT_OR_HARD(_system, _isc) \
1372
	if (recv_errno == _system) { \
Danny Mayer's avatar
Danny Mayer committed
1373
1374
1375
1376
1377
1378
		if (sock->connected) { \
			dev->result = _isc; \
			return (DOIO_HARD); \
		} \
		return (DOIO_SOFT); \
	}
1379

Danny Mayer's avatar
Danny Mayer committed
1380
#define ALWAYS_HARD(_system, _isc) \