socket.c 93.6 KB
Newer Older
Danny Mayer's avatar
Danny Mayer committed
1
/*
Mark Andrews's avatar
Mark Andrews committed
2
3
 * Copyright (C) 2004  Internet Systems Consortium, Inc. ("ISC")
 * Copyright (C) 2000-2003  Internet Software Consortium.
Danny Mayer's avatar
Danny Mayer committed
4
5
6
7
8
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
Mark Andrews's avatar
Mark Andrews committed
9
10
11
12
13
14
15
 * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
 * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
 * PERFORMANCE OF THIS SOFTWARE.
Danny Mayer's avatar
Danny Mayer committed
16
17
 */

18
/* $Id: socket.c,v 1.31 2004/05/03 23:54:38 marka Exp $ */
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51

/* This code has been rewritten to take advantage of Windows Sockets
 * I/O Completion Ports and Events. I/O Completion Ports is ONLY
 * available on Windows NT, Windows 2000 and Windows XP series of
 * the Windows Operating Systems. It CANNOT run on Windows 95, Windows 98
 * or the follow-ons to those Systems.
 *
 * This code is by nature multithreaded and takes advantage of various
 * features to pass on information through the completion port for
 * when I/O is completed.  All sends and receives are completed through
 * the completion port. Due to an implementation bug in Windows 2000,
 * Service Pack 2 must be installed on the system for this code to run correctly.
 * For details on this problem see Knowledge base article Q263823.
 * The code checks for this. The number of Completion Port Worker threads
 * used is the total number of CPU's + 1. This increases the likelihood that
 * a Worker Thread is available for processing a completed request.
 *
 * All accepts and connects are accomplished through the WSAEventSelect()
 * function and the event_wait loop. Events are added to and deleted from
 * each event_wait thread via a common event_update stack owned by the socket
 * manager. If the event_wait thread runs out of array space in the events
 * array it will look for another event_wait thread to add the event. If it
 * fails to find another one it will create a new thread to handle the
 * outstanding event.
 *
 * A future enhancement is to use AcceptEx to take advantage of Overlapped
 * I/O which allows for enhanced performance of TCP connections.
 * This will also reduce the number of events that are waited on by the
 * event_wait threads to just the connect sockets and reduce the number
 * of additional threads required.
 *
 * XXXPDM 5 August, 2002
 */
Danny Mayer's avatar
Danny Mayer committed
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68

#define MAKE_EXTERNAL 1
#include <config.h>

#include <sys/types.h>

#ifndef _WINSOCKAPI_
#define _WINSOCKAPI_   /* Prevent inclusion of winsock.h in windows.h */
#endif

#include <errno.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <io.h>
#include <fcntl.h>
69
#include <process.h>
Danny Mayer's avatar
Danny Mayer committed
70
71
72
73
74
75
76
77
78
79

#include <isc/buffer.h>
#include <isc/bufferlist.h>
#include <isc/condition.h>
#include <isc/list.h>
#include <isc/log.h>
#include <isc/mem.h>
#include <isc/msgs.h>
#include <isc/mutex.h>
#include <isc/net.h>
80
#include <isc/os.h>
Danny Mayer's avatar
Danny Mayer committed
81
82
83
84
#include <isc/platform.h>
#include <isc/print.h>
#include <isc/region.h>
#include <isc/socket.h>
85
86
#include <isc/strerror.h>
#include <isc/syslog.h>
Danny Mayer's avatar
Danny Mayer committed
87
88
89
#include <isc/task.h>
#include <isc/thread.h>
#include <isc/util.h>
90
#include <isc/win32os.h>
Danny Mayer's avatar
Danny Mayer committed
91

92
#include "errno2result.h"
93

94
95
96
97
98
99
100
101
102
103
104
/*
 * Define this macro to control the behavior of connection
 * resets on UDP sockets.  See Microsoft KnowledgeBase Article Q263823
 * for details.
 * NOTE: This requires that Windows 2000 systems install Service Pack 2
 * or later.
 */
#ifndef SIO_UDP_CONNRESET 
#define SIO_UDP_CONNRESET _WSAIOW(IOC_VENDOR,12) 
#endif

Danny Mayer's avatar
Danny Mayer committed
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
/*
 * Some systems define the socket length argument as an int, some as size_t,
 * some as socklen_t.  This is here so it can be easily changed if needed.
 */
#ifndef ISC_SOCKADDR_LEN_T
#define ISC_SOCKADDR_LEN_T unsigned int
#endif

/*
 * Define what the possible "soft" errors can be.  These are non-fatal returns
 * of various network related functions, like recv() and so on.
 *
 * For some reason, BSDI (and perhaps others) will sometimes return <0
 * from recv() but will have errno==0.  This is broken, but we have to
 * work around it here.
 */
121
#define SOFT_ERROR(e)	((e) == WSAEINTR || \
			 (e) == WSA_IO_PENDING || \
			 (e) == WSAEWOULDBLOCK || \
			 (e) == EWOULDBLOCK || \
			 (e) == EINTR || \
			 (e) == EAGAIN || \
			 (e) == 0)

#define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)

/*
 * DLVL(90)  --  Function entry/exit and other tracing.
 * DLVL(70)  --  Socket "correctness" -- including returning of events, etc.
 * DLVL(60)  --  Socket data send/receive
 * DLVL(50)  --  Event tracing, including receiving/sending completion events.
 * DLVL(20)  --  Socket creation/destruction.
 */
#define TRACE_LEVEL		90
#define CORRECTNESS_LEVEL	70
#define IOEVENT_LEVEL		60
#define EVENT_LEVEL		50
#define CREATION_LEVEL		20

#define TRACE		DLVL(TRACE_LEVEL)
#define CORRECTNESS	DLVL(CORRECTNESS_LEVEL)
#define IOEVENT		DLVL(IOEVENT_LEVEL)
#define EVENT		DLVL(EVENT_LEVEL)
#define CREATION	DLVL(CREATION_LEVEL)

typedef isc_event_t intev_t;

152
153
#define SOCKET_MAGIC		ISC_MAGIC('I', 'O', 'i', 'o')
#define VALID_SOCKET(t)		ISC_MAGIC_VALID(t, SOCKET_MAGIC)
Danny Mayer's avatar
Danny Mayer committed
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181

/*
 * IPv6 control information.  If the socket is an IPv6 socket we want
 * to collect the destination address and interface so the client can
 * set them on outgoing packets.
 */
#ifdef ISC_PLATFORM_HAVEIPV6
#ifndef USE_CMSG
#define USE_CMSG	1
#endif
#endif

/*
 * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
 * a setsockopt() like interface to request timestamps, and if the OS
 * doesn't do it for us, call gettimeofday() on every UDP receive?
 */

/*
 * We really  don't want to try and use these control messages. Win32
 * doesn't have this mechanism
 */
#undef USE_CMSG

/*
 * Message header for recvmsg and sendmsg calls.
 * Used value-result for recvmsg, value only for sendmsg.
 */
182

Danny Mayer's avatar
Danny Mayer committed
183
184
185
186

struct msghdr {
        void	*msg_name;              /* optional address */
        u_int   msg_namelen;            /* size of address */
Danny Mayer's avatar
Danny Mayer committed
187
        WSABUF  *msg_iov;		/* scatter/gather array */
Danny Mayer's avatar
Danny Mayer committed
188
189
190
191
192
        u_int   msg_iovlen;             /* # elements in msg_iov */
        void	*msg_control;           /* ancillary data, see below */
        u_int   msg_controllen;         /* ancillary data buffer len */
        int     msg_flags;              /* flags on received message */
} msghdr;
193
	
194
195
196
197
198
/*
 * The number of times a send operation is repeated if the result is EINTR.
 */
#define NRETRIES 10

Danny Mayer's avatar
Danny Mayer committed
199
200
201
202
203
204
struct isc_socket {
	/* Not locked. */
	unsigned int		magic;
	isc_socketmgr_t	       *manager;
	isc_mutex_t		lock;
	isc_sockettype_t	type;
205
206
207
208
209
210
	OVERLAPPED		overlapped;
	/* Pointers to scatter/gather buffers */
	WSABUF			iov[ISC_SOCKET_MAXSCATTERGATHER];
	size_t			totalBytes;
	WSAEVENT		hEvent;		/* Event Handle */
	long			wait_type;	/* Events to wait on */
211
212
213
	WSAEVENT		hAlert;		/* Alert Event Handle */
	DWORD			evthread_id;	/* Event Thread Id for socket */

Danny Mayer's avatar
Danny Mayer committed
214
215
216
217

	/* Locked by socket lock. */
	ISC_LINK(isc_socket_t)	link;
	unsigned int		references;
218
	SOCKET			fd;
Danny Mayer's avatar
Danny Mayer committed
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
	int			pf;

	ISC_LIST(isc_socketevent_t)		send_list;
	ISC_LIST(isc_socketevent_t)		recv_list;
	ISC_LIST(isc_socket_newconnev_t)	accept_list;
	isc_socket_connev_t		       *connect_ev;

	/*
	 * Internal events.  Posted when a descriptor is readable or
	 * writable.  These are statically allocated and never freed.
	 * They will be set to non-purgable before use.
	 */
	intev_t			readable_ev;
	intev_t			writable_ev;

	isc_sockaddr_t		address;  /* remote address */

236
	unsigned int		pending_close : 1,
Danny Mayer's avatar
Danny Mayer committed
237
				pending_accept : 1,
238
239
				iocp : 1,	/* I/O Completion Port */
				listener : 1,	/* listener socket */
Danny Mayer's avatar
Danny Mayer committed
240
241
				connected : 1,
				connecting : 1, /* connect pending */
242
243
244
245
				bound : 1,	/* bound to local addr */
				pending_free: 1;
	unsigned int		pending_recv;
	unsigned int		pending_send;
Danny Mayer's avatar
Danny Mayer committed
246
247
};

248
249
250
251
252
253
254
255
/*
 * I/O Completion ports Info structures
 */

static HANDLE hHeapHandle = NULL;
static int iocp_total = 0;
typedef struct IoCompletionInfo {
	OVERLAPPED			overlapped;
256
	isc_socketevent_t		*dev;
257
	int				request_type;
258
	struct msghdr			messagehdr;
259
260
261
262
} IoCompletionInfo;

/*
 * Define a maximum number of I/O Completion Port worker threads
 * to handle the load on the Completion Port. The actual number
 * used is the number of CPU's + 1.
 */
#define MAX_IOCPTHREADS 20

/*
 * event_change structure to handle adds and deletes from the list of
 * events in the Wait
 */
typedef struct event_change event_change_t;

struct event_change {
	isc_socket_t			*sock;
276
277
	WSAEVENT			hEvent;
	DWORD				evthread_id;
278
279
280
281
282
283
284
	SOCKET				fd;
	unsigned int			action;
	ISC_LINK(event_change_t)	link;
};

/*
 * Note: We are using an array here since *WaitForMultiple* wants an array.
 * WARNING: This value may not be greater than 64 since the
 * WSAWaitForMultipleEvents function is limited to 64 events.
 */

#define MAX_EVENTS 64

/*
 * List of events being waited on and their associated sockets
 */
/*
 * Wait table owned by one event thread.  aSockList[i] is the socket
 * whose event handle is aEventList[i].  Slot 0 is reserved for the
 * thread's own alert event (created by socket_event_minit()), so socket
 * entries start at index 1.
 */
typedef struct sock_event_list {
	int max_event;		/* number of slots in use, including slot 0 */
	int total_events;	/* number of socket events (slot 0 excluded) */
	isc_socket_t			*aSockList[MAX_EVENTS];
	WSAEVENT			aEventList[MAX_EVENTS];
} sock_event_list;

301
302
303
304
305
306
307
308
309
310
311
312
313
/*
 * Thread Event structure for managing the threads handling events
 */
typedef struct events_thread events_thread_t;

/*
 * Bookkeeping for one event_wait thread; instances are linked on the
 * manager's ev_threads list by event_thread_create().
 */
struct events_thread {
	isc_thread_t			thread_handle;	/* Thread's handle */
	DWORD				thread_id;	/* Thread's id */
	sock_event_list			sockev_list;	/* this thread's wait table */
	isc_socketmgr_t			*manager;	/* owning socket manager */
	ISC_LINK(events_thread_t)	link;
};

314
315
316
#define SOCKET_MANAGER_MAGIC	ISC_MAGIC('I', 'O', 'm', 'g')
#define VALID_MANAGER(m)	ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)

Danny Mayer's avatar
Danny Mayer committed
317
318
struct isc_socketmgr {
	/* Not locked. */
319
320
321
	unsigned int			magic;
	isc_mem_t		       *mctx;
	isc_mutex_t			lock;
Danny Mayer's avatar
Danny Mayer committed
322
	/* Locked by manager lock. */
323
324
325
326
327
328
329
330
331
332
333
	ISC_LIST(event_change_t)	event_updates;
	ISC_LIST(isc_socket_t)		socklist;
	int				event_written;
	WSAEVENT			prime_alert;
	isc_boolean_t			bShutdown;
	ISC_LIST(events_thread_t)	ev_threads;
	isc_condition_t			shutdown_ok;
	HANDLE				hIoCompletionPort;
	int				maxIOCPThreads;
	HANDLE				hIOCPThreads[MAX_IOCPTHREADS];
	DWORD				dwIOCPThreadIds[MAX_IOCPTHREADS];
Danny Mayer's avatar
Danny Mayer committed
334
335
336
337
338
339
340
341
342
343
};

#define CLOSED		0	/* this one must be zero */
#define MANAGED		1
#define CLOSE_PENDING	2

/*
 * send() and recv() iovec counts
 */
#define MAXSCATTERGATHER_SEND	(ISC_SOCKET_MAXSCATTERGATHER)
344
#define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER)
Danny Mayer's avatar
Danny Mayer committed
345

346
static isc_threadresult_t WINAPI event_wait(void *uap);
347
static isc_threadresult_t WINAPI SocketIoThread(LPVOID ThreadContext);
Danny Mayer's avatar
Danny Mayer committed
348
349
static void free_socket(isc_socket_t **);

350
351
352
353
354
355
356
357
358
/*
 * Kinds of I/O request.
 * NOTE(review): presumably the values stored in
 * IoCompletionInfo.request_type -- confirm against the I/O thread code.
 */
enum {
	SOCKET_RECV,
	SOCKET_SEND,
};

/*
 * Values for event_change.action, dispatched on by process_eventlist().
 */
enum {
	EVENT_ADD,
	EVENT_DELETE
};
Danny Mayer's avatar
Danny Mayer committed
359

360
361
#if defined(ISC_SOCKET_DEBUG)
/*
 * This is used to dump the contents of the sock structure.
 * You should make sure that the sock is locked before
 * dumping it. Since the code uses simple printf() statements
 * it should only be used interactively.
 */
void
sock_dump(isc_socket_t *sock) {
	isc_socketevent_t *ldev;
	isc_socket_newconnev_t *ndev;
	isc_sockaddr_t addr;
	char socktext[256];

	/* Peer and local addresses first. */
	isc_socket_getpeername(sock, &addr);
	isc_sockaddr_format(&addr, socktext, sizeof(socktext));
	printf("Remote Socket: %s\n", socktext);
	isc_socket_getsockname(sock, &addr);
	isc_sockaddr_format(&addr, socktext, sizeof(socktext));
	printf("This Socket: %s\n", socktext);

	/* Scalar state.  Casts keep the arguments matched to the formats. */
	printf("\n\t\tSock Dump\n");
	printf("\t\tfd: %u\n", (unsigned int) sock->fd);
	printf("\t\treferences: %u\n", sock->references);
	printf("\t\tpending_accept: %d\n", sock->pending_accept);
	printf("\t\tpending_close: %d\n", sock->pending_close);
	printf("\t\tconnecting: %d\n", sock->connecting);
	printf("\t\tconnected: %d\n", sock->connected);
	printf("\t\tbound: %d\n", sock->bound);
	printf("\t\tiocp: %d\n", sock->iocp);
	printf("\t\tsocket type: %d\n", sock->type);

	/* Queued events, one pointer per line. */
	printf("\n\t\tSock Recv List\n");
	ldev = ISC_LIST_HEAD(sock->recv_list);
	while (ldev != NULL) {
		printf("\t\tdev: %p\n", (void *) ldev);
		ldev = ISC_LIST_NEXT(ldev, ev_link);
	}
	printf("\n\t\tSock Send List\n");
	ldev = ISC_LIST_HEAD(sock->send_list);
	while (ldev != NULL) {
		printf("\t\tdev: %p\n", (void *) ldev);
		ldev = ISC_LIST_NEXT(ldev, ev_link);
	}
	printf("\n\t\tSock Accept List\n");
	ndev = ISC_LIST_HEAD(sock->accept_list);
	while (ndev != NULL) {
		/* Print the accept entry itself, not the stale send cursor. */
		printf("\t\tdev: %p\n", (void *) ndev);
		ndev = ISC_LIST_NEXT(ndev, ev_link);
	}
}
#endif

/*  This function will add an entry to the I/O completion port
 *  that will signal the I/O thread to exit (gracefully)
 */
static void
signal_iocompletionport_exit(isc_socketmgr_t *manager) {
	char strbuf[ISC_STRERRORSIZE];
	int errval;
	int posted = 0;

	REQUIRE(VALID_MANAGER(manager));

	/*
	 * Queue one empty completion packet (0 bytes, key 0, no
	 * OVERLAPPED) for each worker thread.
	 */
	while (posted < manager->maxIOCPThreads) {
		BOOL ok = PostQueuedCompletionStatus(manager->hIoCompletionPort,
						     0, 0, 0);
		if (!ok) {
			errval = GetLastError();
			isc__strerror(errval, strbuf, sizeof(strbuf));
			FATAL_ERROR(__FILE__, __LINE__,
				isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
				ISC_MSG_FAILED,
				"Can't request service thread to exit: %s"),
				strbuf);
		}
		posted++;
	}
}

/*
 * Create the worker threads for the I/O Completion Port
 */
void
iocompletionport_createthreads(int total_threads, isc_socketmgr_t *manager) {
	int errval;
	char strbuf[ISC_STRERRORSIZE];
	int i;

	INSIST(total_threads > 0);
	/*
	 * The handle and id arrays in the manager hold MAX_IOCPTHREADS
	 * entries; refuse to write past them.
	 */
	INSIST(total_threads <= MAX_IOCPTHREADS);
	REQUIRE(VALID_MANAGER(manager));
	/*
	 * We need at least one
	 */
	for (i = 0; i < total_threads; i++) {
		/* Each worker runs SocketIoThread with the manager as context. */
		manager->hIOCPThreads[i] = CreateThread(NULL, 0, SocketIoThread,
						manager, 0,
						&manager->dwIOCPThreadIds[i]);
		if (manager->hIOCPThreads[i] == NULL) {
			errval = GetLastError();
			isc__strerror(errval, strbuf, sizeof(strbuf));
			FATAL_ERROR(__FILE__, __LINE__,
				isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
				ISC_MSG_FAILED,
				"Can't create IOCP thread: %s"),
				strbuf);
		}
	}
}

468
/*
469
 *  Create/initialise the I/O completion port
470
 */
471
472
473
474
void
iocompletionport_init(isc_socketmgr_t *manager) {
	int errval;
	char strbuf[ISC_STRERRORSIZE];
475
476

	REQUIRE(VALID_MANAGER(manager));
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
	/*
	 * Create a private heap to handle the socket overlapped structure
	 * The miniumum number of structures is 10, there is no maximum
	 */
	hHeapHandle = HeapCreate(0, 10*sizeof(IoCompletionInfo), 0);
	manager->maxIOCPThreads = min(isc_os_ncpus() + 1,
					MAX_IOCPTHREADS);

	/* Now Create the Completion Port */
	manager->hIoCompletionPort = CreateIoCompletionPort(
				     INVALID_HANDLE_VALUE, NULL,
				     0, manager->maxIOCPThreads);
	if (manager->hIoCompletionPort == NULL) {
		errval = GetLastError();
		isc__strerror(errval, strbuf, sizeof(strbuf));
		FATAL_ERROR(__FILE__, __LINE__,
				isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
				ISC_MSG_FAILED,
				"CreateIoCompletionPort() failed "
				"during initialization: %s"),
				strbuf);
		exit(1);
	}
	
	/*
	 * Worker threads for servicing the I/O
 	 */
	iocompletionport_createthreads(manager->maxIOCPThreads, manager);
}
	
507

508
509
/*
 * Ask the completion port worker threads to exit, provided a
 * completion port was ever created.
 */
void
iocompletionport_exit(isc_socketmgr_t *manager) {

	REQUIRE(VALID_MANAGER(manager));

	/* Nothing to signal without a port. */
	if (manager->hIoCompletionPort == NULL)
		return;

	signal_iocompletionport_exit(manager);
}
518

519
/*
520
521
 * Add sockets in here and pass the sock data in as part of the
 * information needed.
522
 */
523
void
524
525
iocompletionport_update(isc_socket_t *sock) {
	HANDLE hiocp;
526
527

	REQUIRE(sock != NULL);
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
	if(sock->iocp == 0) {
		sock->iocp = 1;
		hiocp = CreateIoCompletionPort((HANDLE) sock->fd,
			sock->manager->hIoCompletionPort, (DWORD) sock,
			sock->manager->maxIOCPThreads);
		InterlockedIncrement(&iocp_total);

	}
}

/*
 * Reset a wait table to "empty" and install the owning thread's alert
 * event in slot 0, left in the non-signalled state.
 */
void
socket_event_minit(sock_event_list *evlist) {
	int slot;

	REQUIRE(evlist != NULL);

	/* Clear every slot and both counters. */
	evlist->total_events = 0;
	evlist->max_event = 0;
	slot = 0;
	while (slot < MAX_EVENTS) {
		evlist->aEventList[slot] = (WSAEVENT) 0;
		evlist->aSockList[slot] = NULL;
		slot++;
	}

	/* Slot 0 carries the alert event; it counts towards max_event only. */
	evlist->aEventList[0] = WSACreateEvent();
	evlist->max_event += 1;
	(void) WSAResetEvent(evlist->aEventList[0]);
}
/*
 * Event Thread Initialization
 */
/*
 * Allocate an events_thread, link it on the manager's ev_threads list
 * and start an event_wait thread for it.
 *
 * On success *evthreadp receives the new entry and ISC_R_SUCCESS is
 * returned.  Returns ISC_R_NOMEMORY if the entry cannot be allocated and
 * ISC_R_UNEXPECTED if the thread cannot be started; in both cases
 * *evthreadp is left untouched and nothing remains linked or allocated.
 */
isc_result_t
event_thread_create(events_thread_t **evthreadp, isc_socketmgr_t *manager) {
	events_thread_t *evthread;

	REQUIRE(VALID_MANAGER(manager));
	REQUIRE(evthreadp != NULL && *evthreadp == NULL);

	evthread = isc_mem_get(manager->mctx, sizeof(*evthread));
	if (evthread == NULL)
		return (ISC_R_NOMEMORY);

	socket_event_minit(&evthread->sockev_list);
	ISC_LINK_INIT(evthread, link);
	evthread->manager = manager;
	/*
	 * Give thread_id a defined value before the entry becomes visible
	 * on the list: locate_available_thread() compares against it.
	 * NOTE(review): the real id is presumably stored by event_wait().
	 */
	evthread->thread_id = 0;

	ISC_LIST_APPEND(manager->ev_threads, evthread, link);

	/*
	 * Start up the event wait thread.
	 */
	if (isc_thread_create(event_wait, evthread, &evthread->thread_handle) !=
	    ISC_R_SUCCESS) {
		/*
		 * Undo everything: the entry must not stay on the list once
		 * freed, and the alert event from socket_event_minit() would
		 * otherwise leak.
		 */
		ISC_LIST_UNLINK(manager->ev_threads, evthread, link);
		WSACloseEvent(evthread->sockev_list.aEventList[0]);
		isc_mem_put(manager->mctx, evthread, sizeof(*evthread));
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "isc_thread_create() %s",
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"));
		return (ISC_R_UNEXPECTED);
	}
	*evthreadp = evthread;
	return (ISC_R_SUCCESS);
}
/*
589
590
591
 * Locate a thread with space for additional events or create one if
 * necessary. The manager is locked at this point so the information
 * cannot be changed by another thread while we are searching.
592
 */
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
void
locate_available_thread(isc_socketmgr_t *manager) {
	events_thread_t *evthread;
	DWORD threadid = GetCurrentThreadId();

	evthread = ISC_LIST_HEAD(manager->ev_threads);
	while (evthread != NULL) {
		/*
		 * We need to find a thread with space to add an event
		 * If we find it, alert it to process the event change
		 * list
		 */
		if(threadid != evthread->thread_id &&
			evthread->sockev_list.max_event < MAX_EVENTS) {
			WSASetEvent(evthread->sockev_list.aEventList[0]);
			return;
		}
		evthread = ISC_LIST_NEXT(evthread, link);
	}
	/*
	 * We need to create a new thread as other threads are full.
	 * If we succeed in creating the thread, alert it to
	 * process the event change list since it will have space.
	 * If we are unable to create one, the event will stay on the
	 * list and the next event_wait thread will try again to add
	 * the event. It will call here again if it has no space.
	 */
	if (event_thread_create(&evthread, manager) == ISC_R_SUCCESS) {
		WSASetEvent(evthread->sockev_list.aEventList[0]);
	}

}

/*
 * Try to place the socket named by an EVENT_ADD request into the calling
 * thread's wait table.
 *
 * Returns ISC_TRUE when the socket was stored (the socket then records
 * this thread's alert event and id).  Returns ISC_FALSE when the table
 * is full, after asking locate_available_thread() to find or start
 * another thread.
 */
isc_boolean_t
socket_eventlist_add(event_change_t *evchange, sock_event_list *evlist,
		     isc_socketmgr_t *manager) {
	isc_socket_t *sock;
	int slot;

	REQUIRE(evchange != NULL);
	sock = evchange->sock;
	REQUIRE(sock != NULL);
	REQUIRE(sock->hEvent != NULL);
	REQUIRE(evlist != NULL);

	slot = evlist->max_event;
	if (slot < MAX_EVENTS) {
		/* Take the next free slot. */
		evlist->aEventList[slot] = sock->hEvent;
		evlist->aSockList[slot] = sock;
		evlist->max_event = slot + 1;
		evlist->total_events += 1;

		/* Remember which thread now waits on this socket. */
		sock->evthread_id = GetCurrentThreadId();
		sock->hAlert = evlist->aEventList[0];
		return (ISC_TRUE);
	}

	/* No room here; hand the request to some other thread. */
	locate_available_thread(manager);
	return (ISC_FALSE);
}
652

653
/*
654
655
 * Note that the eventLock is locked before calling this function.
 * All Events and associated sockets are closed here.
656
 */
657
658
isc_boolean_t
socket_eventlist_delete(event_change_t *evchange, sock_event_list *evlist) {
659
660
	int i;
	WSAEVENT hEvent;
661
	int iEvent = -1;
662

663
664
	REQUIRE(evchange != NULL);
	/*  Make sure this is the right thread from which to delete the event */
665
	if (evchange->evthread_id != GetCurrentThreadId())
666
		return (ISC_FALSE);
667

668
669
670
	REQUIRE(evlist != NULL);
	REQUIRE(evchange->hEvent != NULL);
	hEvent = evchange->hEvent;
671

672
673
674
675
676
677
678
	/* Find the Event */
	for (i = 1; i < evlist->max_event; i++) {
		if (evlist->aEventList[i] == hEvent) {
			iEvent = i;
			break;
		}
	}
679

680
681
682
	/* Actual event start at 1 */
	if (iEvent < 1)
		return (ISC_FALSE);
683

684
	for(i = iEvent; i < (evlist->max_event - 1); i++) {
685
686
687
		evlist->aEventList[i] = evlist->aEventList[i + 1];
		evlist->aSockList[i] = evlist->aSockList[i + 1];
	}
688

689
690
691
692
693
694
695
	evlist->aEventList[evlist->max_event - 1] = 0;
	evlist->aSockList[evlist->max_event - 1] = NULL;

	/* Cleanup */
	WSACloseEvent(hEvent);
	if (evchange->fd >= 0)
		closesocket(evchange->fd);
696
697
	evlist->max_event--;
	evlist->total_events--;
698

699
	return (ISC_TRUE);
700
}
701

702
703
/*
 * Get the event changes off of the list and apply the
 * requested changes. The manager lock is taken out at
 * the start of this function to prevent other event_wait
 * threads processing the same information at the same
 * time. The queue may not be empty on exit since other
 * threads may be involved in processing the queue.
 *
 * The deletes are done first in order that there be space
 * available for the events being added in the same thread
 * in case the event list is almost full. This reduces the
 * probability of having to create another thread which would
 * increase overhead costs.
 *
 * Always returns ISC_R_SUCCESS.
 */
isc_result_t
process_eventlist(sock_event_list *evlist, isc_socketmgr_t *manager) {
	event_change_t *cur;
	event_change_t *following;

	REQUIRE(evlist != NULL);

	LOCK(&manager->lock);

	/*
	 * Pass 1: deletes.  A request is dequeued and freed only when
	 * this thread's table was actually updated.
	 */
	for (cur = ISC_LIST_HEAD(manager->event_updates);
	     cur != NULL;
	     cur = following) {
		/* Fetch the successor first; cur may be freed below. */
		following = ISC_LIST_NEXT(cur, link);
		if (cur->action != EVENT_DELETE)
			continue;
		if (!socket_eventlist_delete(cur, evlist))
			continue;
		ISC_LIST_DEQUEUE(manager->event_updates, cur, link);
		HeapFree(hHeapHandle, 0, cur);
		manager->event_written--;
	}

	/*
	 * Pass 2: adds, under the same rule.
	 */
	for (cur = ISC_LIST_HEAD(manager->event_updates);
	     cur != NULL;
	     cur = following) {
		following = ISC_LIST_NEXT(cur, link);
		if (cur->action != EVENT_ADD)
			continue;
		if (!socket_eventlist_add(cur, evlist, manager))
			continue;
		ISC_LIST_DEQUEUE(manager->event_updates, cur, link);
		HeapFree(hHeapHandle, 0, cur);
		manager->event_written--;
	}

	UNLOCK(&manager->lock);
	return (ISC_R_SUCCESS);
}
776

777
778
779
780
781
/*
 * Add the event list changes to the queue and notify the
 * event loop
 */
static void
782
notify_eventlist(isc_socket_t *sock, isc_socketmgr_t *manager,
783
784
		 unsigned int action)
{
785
786

	event_change_t *evchange;
787
788
789

	REQUIRE(VALID_MANAGER(manager));
	REQUIRE(sock != NULL);
790
791
792
793
794
	
	evchange = HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY,
			     sizeof(event_change_t));
	evchange->sock = sock;
	evchange->action = action;
795
	evchange->hEvent = sock->hEvent;
796
	evchange->fd = sock->fd;
797
	evchange->evthread_id = sock->evthread_id;
798

799
800
	LOCK(&manager->lock);
	ISC_LIST_APPEND(manager->event_updates, evchange, link);
801
	sock->manager->event_written++;
802
803
804
805
806
807
808
	UNLOCK(&manager->lock);

	/* Alert the Wait List */
	if (sock->hAlert != NULL)
		WSASetEvent(sock->hAlert);
	else
		WSASetEvent(manager->prime_alert);
809
}
810

811
812
813
814
815
816
817
/*
 * Note that the socket is already locked before calling this function
 */
/*
 * Create an event for the socket, bind the requested network events
 * ("type", a WSAEventSelect() event mask) to it and queue an EVENT_ADD
 * request for the event threads.
 *
 * Returns ISC_R_SUCCESS, or ISC_R_UNEXPECTED if the event cannot be
 * created or selected; on failure the socket is left unchanged and no
 * event handle is left open.
 */
isc_result_t
socket_event_add(isc_socket_t *sock, long type) {
	int stat;
	WSAEVENT hEvent;
	char strbuf[ISC_STRERRORSIZE];
	const char *msg;

	REQUIRE(sock != NULL);

	hEvent = WSACreateEvent();
	if (hEvent == WSA_INVALID_EVENT) {
		stat = WSAGetLastError();
		isc__strerror(stat, strbuf, sizeof(strbuf));
		msg = isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
				     ISC_MSG_FAILED, "failed");
		UNEXPECTED_ERROR(__FILE__, __LINE__, "WSACreateEvent: %s: %s",
				 msg, strbuf);
		return (ISC_R_UNEXPECTED);
	}
	if (WSAEventSelect(sock->fd, hEvent, type) != 0) {
		/* Capture the error before any other Winsock call. */
		stat = WSAGetLastError();
		/* The event was never published; close it so it is not leaked. */
		WSACloseEvent(hEvent);
		isc__strerror(stat, strbuf, sizeof(strbuf));
		msg = isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
				     ISC_MSG_FAILED, "failed");
		UNEXPECTED_ERROR(__FILE__, __LINE__, "WSAEventSelect: %s: %s",
				 msg, strbuf);
		return (ISC_R_UNEXPECTED);
	}
	sock->hEvent = hEvent;

	sock->wait_type = type;
	notify_eventlist(sock, sock->manager, EVENT_ADD);
	return (ISC_R_SUCCESS);
}
848

Danny Mayer's avatar
Danny Mayer committed
849
/*
850
 * Note that the socket is not locked before calling this function
Danny Mayer's avatar
Danny Mayer committed
851
 */
852
853
854
855
856
void
socket_event_delete(isc_socket_t *sock) {

	REQUIRE(sock != NULL);
	REQUIRE(sock->hEvent != NULL);
Danny Mayer's avatar
Danny Mayer committed
857

858
859
860
	if (sock->hEvent != NULL) {
		sock->wait_type = 0;
		sock->pending_close = 1;
861
		notify_eventlist(sock, sock->manager, EVENT_DELETE);
862
		sock->hEvent = NULL;
863
864
		sock->hAlert = NULL;
		sock->evthread_id = 0;
865
866
	}
}
867

868
869
870
871
872
873
874
875
876
/*
 * Routine to cleanup and then close the socket.
 * Only close the socket here if it is NOT associated
 * with an event, otherwise the WSAWaitForMultipleEvents
 * may fail due to the fact that the Wait should not
 * be running while closing an event or a socket.
 */
void
socket_close(isc_socket_t *sock) {

	REQUIRE(sock != NULL);

	sock->pending_close = 1;

	/*
	 * With an event attached, closing is left to the event thread
	 * (via the queued delete); otherwise close the descriptor now.
	 */
	if (sock->hEvent == NULL)
		closesocket(sock->fd);
	else
		socket_event_delete(sock);

	/* Drop the completion port accounting for this socket, if any. */
	if (!sock->iocp)
		return;
	sock->iocp = 0;
	InterlockedDecrement(&iocp_total);
}
891

Danny Mayer's avatar
Danny Mayer committed
892
893
894
/*
 * Initialize socket services
 */
Andreas Gustafsson's avatar
Andreas Gustafsson committed
895
BOOL InitSockets() {
Danny Mayer's avatar
Danny Mayer committed
896
897
898
899
900
901
902
903
	WORD wVersionRequested;
	WSADATA wsaData;
	int err;

	/* Need Winsock 2.0 or better */
	wVersionRequested = MAKEWORD(2, 0);
 
	err = WSAStartup(wVersionRequested, &wsaData);
Andreas Gustafsson's avatar
Andreas Gustafsson committed
904
	if ( err != 0 ) {
Danny Mayer's avatar
Danny Mayer committed
905
		/* Tell the user that we could not find a usable Winsock DLL */
906
		return(FALSE);
Danny Mayer's avatar
Danny Mayer committed
907
	}
908
	return(TRUE);
Danny Mayer's avatar
Danny Mayer committed
909
910
}

Andreas Gustafsson's avatar
Andreas Gustafsson committed
911
int
912
internal_sendmsg(isc_socket_t *sock, IoCompletionInfo *lpo,
913
914
		 struct msghdr *messagehdr, int flags, int *Error)
{
915
	int Result;
Danny Mayer's avatar
Danny Mayer committed
916
917
	DWORD BytesSent;
	DWORD Flags = flags;
918
919
920
921
	int total_sent;

	*Error = 0;
	Result = WSASendTo((SOCKET) sock->fd, 
922
923
			messagehdr->msg_iov, 
			messagehdr->msg_iovlen, 
924
925
			&BytesSent,
			Flags,
926
927
			messagehdr->msg_name,
			messagehdr->msg_namelen,
928
929
930
931
			(LPOVERLAPPED) lpo,
			NULL);

	total_sent = (int) BytesSent;
Danny Mayer's avatar
Danny Mayer committed
932
    
933
934
935
936
937
938
	/* Check for errors.*/
	if (Result == SOCKET_ERROR) {

		*Error = WSAGetLastError();
        
	        switch (*Error) {
939
940
941
942
943
944
		case WSA_IO_INCOMPLETE :
		case WSA_WAIT_IO_COMPLETION :
		case WSA_IO_PENDING :
			sock->pending_send++;
		case NO_ERROR :
			break;
945

946
947
948
949
950
951
952
		default :
			return (-1);
			break;
		}
	} else
		sock->pending_send++;
	if (lpo != NULL)
953
954
955
		return (0);
	else
		return (total_sent);
Danny Mayer's avatar
Danny Mayer committed
956
957
958
}

int
959
internal_recvmsg(isc_socket_t *sock, IoCompletionInfo *lpo,
960
961
		 struct msghdr *messagehdr, int flags, int *Error)
{
962
963
964
	DWORD Flags = 0;
	DWORD NumBytes = 0;
	int total_bytes = 0;
Danny Mayer's avatar
Danny Mayer committed
965
	int Result;
Andreas Gustafsson's avatar
Andreas Gustafsson committed
966

967
968
	*Error = 0;
	Result = WSARecvFrom((SOCKET) sock->fd,
969
970
971
972
973
974
975
976
			     messagehdr->msg_iov,
			     messagehdr->msg_iovlen,
			     &NumBytes,
			     &Flags,
			     messagehdr->msg_name,
			     (int *)&(messagehdr->msg_namelen),
			     (LPOVERLAPPED) lpo,
			     NULL);
Andreas Gustafsson's avatar
Andreas Gustafsson committed
977

978
	total_bytes = (int) NumBytes;
Andreas Gustafsson's avatar
Andreas Gustafsson committed
979
980
981

	/* Check for errors. */
	if (Result == SOCKET_ERROR) {
982
983

		*Error = WSAGetLastError();
Danny Mayer's avatar
Danny Mayer committed
984
        
985
	        switch (*Error) {
986
987
988
989
990
991
		case WSA_IO_INCOMPLETE:
		case WSA_WAIT_IO_COMPLETION:
		case WSA_IO_PENDING:
			sock->pending_recv++;
		case NO_ERROR:
			break;
Danny Mayer's avatar
Danny Mayer committed
992

993
994
995
996
997
998
		default :
			return (-1);
			break;
		}
	} else
		sock->pending_recv++;
999
1000

	/* Return the flags received in header */
1001
	messagehdr->msg_flags = Flags;
1002
	if (lpo != NULL)
1003
1004
1005
1006
		return (-1);
	else
		return (total_bytes);
} 
Danny Mayer's avatar
Danny Mayer committed
1007
1008

static void
1009
1010
1011
manager_log(isc_socketmgr_t *sockmgr, isc_logcategory_t *category,
	    isc_logmodule_t *module, int level, const char *fmt, ...)
{
Danny Mayer's avatar
Danny Mayer committed
1012
1013
1014
	char msgbuf[2048];
	va_list ap;

1015
	if (!isc_log_wouldlog(isc_lctx, level))
Danny Mayer's avatar
Danny Mayer committed
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
		return;

	va_start(ap, fmt);
	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
	va_end(ap);

	isc_log_write(isc_lctx, category, module, level,
		      "sockmgr %p: %s", sockmgr, msgbuf);
}

1026
1027
1028
1029
1030
static void
socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
	   isc_msgcat_t *msgcat, int msgset, int message,
	   const char *fmt, ...) ISC_FORMAT_PRINTF(9, 10);

/*
 * Format a printf-style message and write it through isc_log_iwrite(),
 * prefixed with the address of 'sock' and, when 'address' is non-NULL,
 * with the text produced by isc_sockaddr_format() for that address.
 * Nothing is formatted or written when isc_log_wouldlog() says 'level'
 * would not be logged.
 */
static void
socket_log(isc_socket_t *sock, isc_sockaddr_t *address,
	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
	   isc_msgcat_t *msgcat, int msgset, int message,
	   const char *fmt, ...)
{
	char text[2048];
	char peer[256];
	va_list args;

	if (! isc_log_wouldlog(isc_lctx, level))
		return;

	va_start(args, fmt);
	vsnprintf(text, sizeof(text), fmt, args);
	va_end(args);

	if (address != NULL) {
		isc_sockaddr_format(address, peer, sizeof(peer));
		isc_log_iwrite(isc_lctx, category, module, level,
			       msgcat, msgset, message,
			       "socket %p %s: %s", sock, peer, text);
		return;
	}

	isc_log_iwrite(isc_lctx, category, module, level,
		       msgcat, msgset, message,
		       "socket %p: %s", sock, text);
}
1060

Danny Mayer's avatar
Danny Mayer committed
1061
/*
1062
 * Make an fd SOCKET non-blocking.
Danny Mayer's avatar
Danny Mayer committed
1063
1064
 */
static isc_result_t
1065
make_nonblock(SOCKET fd) {
Danny Mayer's avatar
Danny Mayer committed
1066
1067
	int ret;
	unsigned long flags = 1;
1068
	char strbuf[ISC_STRERRORSIZE];
Danny Mayer's avatar
Danny Mayer committed
1069
1070

	/* Set the socket to non-blocking */
1071
	ret = ioctlsocket(fd, FIONBIO, &flags);
Danny Mayer's avatar
Danny Mayer committed
1072
1073

	if (ret == -1) {
1074
		isc__strerror(errno, strbuf, sizeof(strbuf));
Danny Mayer's avatar
Danny Mayer committed
1075
1076
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "ioctlsocket(%d, FIOBIO, %d): %s",
1077
				 fd, flags, strbuf);
Danny Mayer's avatar
Danny Mayer committed
1078
1079
1080
1081
1082
1083

		return (ISC_R_UNEXPECTED);
	}

	return (ISC_R_SUCCESS);
}
1084

Danny Mayer's avatar
Danny Mayer committed
1085
/*
1086
1087
1088
1089
1090
1091
1092
1093
 * Windows 2000 systems incorrectly cause UDP sockets using WASRecvFrom
 * to not work correctly, returning a WSACONNRESET error when a WSASendTo
 * fails with an "ICMP port unreachable" response and preventing the
 * socket from using the WSARecvFrom in subsequent operations.
 * The function below fixes this, but requires that Windows 2000
 * Service Pack 2 or later be installed on the system.  NT 4.0
 * systems are not affected by this and work correctly.
 * See Microsoft Knowledge Base Article Q263823 for details of this.
Danny Mayer's avatar
Danny Mayer committed
1094
 */
1095
1096
1097
1098
1099
1100
1101
1102
1103
isc_result_t
connection_reset_fix(SOCKET fd) {
	DWORD dwBytesReturned = 0;
	BOOL  bNewBehavior = FALSE;
	DWORD status;

	if(isc_win32os_majorversion() < 5)
		return (ISC_R_SUCCESS); /*  NT 4.0 has no problem */

1104
	/* disable bad behavior using IOCTL: SIO_UDP_CONNRESET */
1105
1106
1107
1108
1109
	status = WSAIoctl(fd, SIO_UDP_CONNRESET, &bNewBehavior,
			  sizeof(bNewBehavior), NULL, 0,
			  &dwBytesReturned, NULL, NULL);
	if (status != SOCKET_ERROR)
		return (ISC_R_SUCCESS);
1110
1111
1112
1113
1114
	else {
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "WSAIoctl(SIO_UDP_CONNRESET, oldBehaviour) %s",
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"));
1115
		return (ISC_R_UNEXPECTED);
1116
	}
Danny Mayer's avatar
Danny Mayer committed
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
}

/*
 * Construct an iov array and attach it to the msghdr passed in.  This is
 * the SEND constructor, which will use the used region of the buffer
 * (if using a buffer list) or will use the internal region (if a single
 * buffer I/O is requested).
 *
 * Nothing can be NULL, and the done event must list at least one buffer
 * on the buffer linked list for this function to be meaningful.
 *
 * If write_countp != NULL, *write_countp will hold the number of bytes
 * this transaction can send.
 */
static void
build_msghdr_send(isc_socket_t *sock, isc_socketevent_t *dev,
		  struct msghdr *msg, char *cmsg,
1134
1135
		  WSABUF *iov, size_t *write_countp)
{
Danny Mayer's avatar
Danny Mayer committed
1136
1137
1138
1139
1140
1141
	unsigned int iovcount;
	isc_buffer_t *buffer;
	isc_region_t used;
	size_t write_count;
	size_t skip_count;

Mark Andrews's avatar
Mark Andrews committed
1142
	memset(msg, 0, sizeof(*msg));
Danny Mayer's avatar
Danny Mayer committed
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187

	if (sock->type == isc_sockettype_udp) {
		msg->msg_name = (void *)&dev->address.type.sa;
		msg->msg_namelen = dev->address.length;
	} else {
		msg->msg_name = NULL;
		msg->msg_namelen = 0;
	}

	buffer = ISC_LIST_HEAD(dev->bufferlist);
	write_count = 0;
	iovcount = 0;

	/*
	 * Single buffer I/O?  Skip what we've done so far in this region.
	 */
	if (buffer == NULL) {
		write_count = dev->region.length - dev->n;
		iov[0].buf = (void *)(dev->region.base + dev->n);
		iov[0].len = write_count;
		iovcount = 1;

		goto config;
	}

	/*
	 * Multibuffer I/O.
	 * Skip the data in the buffer list that we have already written.
	 */
	skip_count = dev->n;
	while (buffer != NULL) {
		REQUIRE(ISC_BUFFER_VALID(buffer));
		if (skip_count < isc_buffer_usedlength(buffer))
			break;
		skip_count -= isc_buffer_usedlength(buffer);
		buffer = ISC_LIST_NEXT(buffer, link);
	}

	while (buffer != NULL) {
		INSIST(iovcount < MAXSCATTERGATHER_SEND);

		isc_buffer_usedregion(buffer, &used);

		if (used.length > 0) {
			iov[iovcount].buf = (void *)(used.base
1188
							  + skip_count);
Danny Mayer's avatar
Danny Mayer committed
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
			iov[iovcount].len = used.length - skip_count;
			write_count += (used.length - skip_count);
			skip_count = 0;
			iovcount++;
		}
		buffer = ISC_LIST_NEXT(buffer, link);
	}

	INSIST(skip_count == 0);

 config:
	msg->msg_iov = iov;
	msg->msg_iovlen = iovcount;

	if (write_countp != NULL)
		*write_countp = write_count;
}

/*
 * Construct an iov array and attach it to the msghdr passed in.  This is
 * the RECV constructor, which will use the available region of the buffer
 * (if using a buffer list) or will use the internal region (if a single
 * buffer I/O is requested).
 *
 * Nothing can be NULL, and the done event must list at least one buffer
 * on the buffer linked list for this function to be meaningful.
 *
 * If read_countp != NULL, *read_countp will hold the number of bytes
 * this transaction can receive.
 */
static void
build_msghdr_recv(isc_socket_t *sock, isc_socketevent_t *dev,
		  struct msghdr *msg, char *cmsg,
1222
1223
		  WSABUF *iov, size_t *read_countp)
{
Danny Mayer's avatar
Danny Mayer committed
1224
1225
1226
1227
1228
	unsigned int iovcount;
	isc_buffer_t *buffer;
	isc_region_t available;
	size_t read_count;

Mark Andrews's avatar
Mark Andrews committed
1229
	memset(msg, 0, sizeof(struct msghdr));
Danny Mayer's avatar
Danny Mayer committed
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251

	if (sock->type == isc_sockettype_udp) {
		memset(&dev->address, 0, sizeof(dev->address));
		msg->msg_name = (void *)&dev->address.type.sa;
		msg->msg_namelen = sizeof(dev->address.type);
	} else { /* TCP */
		msg->msg_name = NULL;
		msg->msg_namelen = 0;
		dev->address = sock->address;
	}

	buffer = ISC_LIST_HEAD(dev->bufferlist);
	read_count = 0;

	/*
	 * Single buffer I/O?  Skip what we've done so far in this region.
	 */
	if (buffer == NULL) {
		read_count = dev->region.length - dev->n;
		iov[0].buf = (void *)(dev->region.base + dev->n);
		iov[0].len = read_count;
		iovcount = 1;
Andreas Gustafsson's avatar
Andreas Gustafsson committed
1252
	} else {
1253
1254
1255
1256
		/*
		 * Multibuffer I/O.
		 * Skip empty buffers.
		 */
Danny Mayer's avatar
Danny Mayer committed
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
		while (buffer != NULL) {
			REQUIRE(ISC_BUFFER_VALID(buffer));
			if (isc_buffer_availablelength(buffer) != 0)
				break;
			buffer = ISC_LIST_NEXT(buffer, link);
		}

		iovcount = 0;
		while (buffer != NULL) {
			INSIST(iovcount < MAXSCATTERGATHER_RECV);

			isc_buffer_availableregion(buffer, &available);

			if (available.length > 0) {
				iov[iovcount].buf = (void *)(available.base);
				iov[iovcount].len = available.length;
				read_count += available.length;
				iovcount++;
			}
			buffer = ISC_LIST_NEXT(buffer, link);
		}
	}

	/*
	 * If needed, set up to receive that one extra byte.  Note that
	 * we know there is at least one iov left, since we stole it
	 * at the top of this function.
	 */

	msg->msg_iov = iov;
	msg->msg_iovlen = iovcount;

	if (read_countp != NULL)
		*read_countp = read_count;
}

static void
set_dev_address(isc_sockaddr_t *address, isc_socket_t *sock,
1295
1296
		isc_socketevent_t *dev)
{
Danny Mayer's avatar
Danny Mayer committed
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
	if (sock->type == isc_sockettype_udp) {
		if (address != NULL)
			dev->address = *address;
		else
			dev->address = sock->address;
	} else if (sock->type == isc_sockettype_tcp) {
		INSIST(address == NULL);
		dev->address = sock->address;
	}
}

static isc_socketevent_t *
allocate_socketevent(isc_socket_t *sock, isc_eventtype_t eventtype,
1310
1311
		     isc_taskaction_t action, const void *arg)
{
Danny Mayer's avatar
Danny Mayer committed
1312
1313
1314
1315
1316
	isc_socketevent_t *ev;

	ev = (isc_socketevent_t *)isc_event_allocate(sock->manager->mctx,
						     sock, eventtype,
						     action, arg,
Mark Andrews's avatar
Mark Andrews committed
1317
						     sizeof(*ev));
Danny Mayer's avatar
Danny Mayer committed
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
	if (ev == NULL)
		return (NULL);

	ev->result = ISC_R_UNEXPECTED;
	ISC_LINK_INIT(ev, ev_link);
	ISC_LIST_INIT(ev->bufferlist);
	ev->region.base = NULL;
	ev->n = 0;
	ev->offset = 0;
	ev->attributes = 0;

	return (ev);
}

#if defined(ISC_SOCKET_DEBUG)
/*
 * Debugging aid, compiled only when ISC_SOCKET_DEBUG is defined: print
 * the name and iov fields of 'msg', followed by the base pointer and
 * length of every iov entry, to stdout.
 *
 * Every argument is cast to the exact type its conversion specifier
 * expects, so the output does not depend on how the msghdr and WSABUF
 * members happen to be declared.
 */
static void
dump_msg(struct msghdr *msg, isc_socket_t *sock) {
	unsigned int i;

	printf("MSGHDR %p, Socket #: %u\n", (void *)msg,
	       (unsigned int)sock->fd);
	printf("\tname %p, namelen %d\n", (void *)msg->msg_name,
	       (int)msg->msg_namelen);
	printf("\tiov %p, iovlen %d\n", (void *)msg->msg_iov,
	       (int)msg->msg_iovlen);
	for (i = 0; i < (unsigned int)msg->msg_iovlen; i++) {
		printf("\t\t%u\tbase %p, len %lu\n", i,
		       (void *)msg->msg_iov[i].buf,
		       (unsigned long)msg->msg_iov[i].len);
	}
}
#endif

/*
 * Status codes returned by completeio_recv() (and presumably by the
 * other I/O completion helpers in this file - confirm against them).
 */
#define DOIO_SUCCESS		0	/* i/o ok, event sent */
#define DOIO_SOFT		1	/* i/o ok, soft error, no event sent */
#define DOIO_HARD		2	/* i/o error, event sent */
#define DOIO_EOF		3	/* EOF, no event sent */

static int
1353
completeio_recv(isc_socket_t *sock, isc_socketevent_t *dev,
1354
1355
		struct msghdr *messagehdr, int cc, int recv_errno)
{
Danny Mayer's avatar
Danny Mayer committed
1356
1357
1358
1359
	size_t actual_count;
	isc_buffer_t *buffer;

#define SOFT_OR_HARD(_system, _isc) \
1360
	if (recv_errno == _system) { \
Danny Mayer's avatar
Danny Mayer committed
1361
1362
1363
1364
1365
1366
		if (sock->connected) { \
			dev->result = _isc; \
			return (DOIO_HARD); \
		} \
		return (DOIO_SOFT); \
	}
1367

Danny Mayer's avatar
Danny Mayer committed
1368
#define ALWAYS_HARD(_system, _isc) \
1369
	if (recv_errno == _system) { \
Danny Mayer's avatar
Danny Mayer committed
1370
1371
1372
1373
		dev->result = _isc; \
		return (DOIO_HARD); \
	}

1374
1375
1376
1377
1378
	if (recv_errno != 0) {

		if (SOFT_ERROR(recv_errno))
			return (DOIO_SOFT);

1379
1380
1381
1382
1383
		SOFT_OR_HARD(WSAECONNREFUSED, ISC_R_CONNREFUSED);
		SOFT_OR_HARD(WSAENETUNREACH, ISC_R_NETUNREACH);
		SOFT_OR_HARD(WSAEHOSTUNREACH, ISC_R_HOSTUNREACH);
		SOFT_OR_HARD(WSAECONNRESET, ISC_R_CONNECTIONRESET);
		SOFT_OR_HARD(WSAENETRESET, ISC_R_CONNECTIONRESET);
1384
		SOFT_OR_HARD(WSAECONNABORTED, ISC_R_CONNECTIONRESET);
1385
1386
		SOFT_OR_HARD(WSAEDISCON, ISC_R_CONNECTIONRESET);
		SOFT_OR_HARD(WSAENETDOWN, ISC_R_NETDOWN);
1387
1388
1389
1390
		ALWAYS_HARD(ERROR_OPERATION_ABORTED, ISC_R_CONNECTIONRESET);
		ALWAYS_HARD(ERROR_PORT_UNREACHABLE, ISC_R_HOSTUNREACH);
		ALWAYS_HARD(ERROR_HOST_UNREACHABLE, ISC_R_HOSTUNREACH);
		ALWAYS_HARD(ERROR_NETWORK_UNREACHABLE, ISC_R_NETUNREACH);
1391
		ALWAYS_HARD(WSAENOBUFS, ISC_R_NORESOURCES);
Danny Mayer's avatar
Danny Mayer committed
1392
1393
1394
1395

#undef SOFT_OR_HARD
#undef ALWAYS_HARD

1396
		dev->result = isc__errno2result(recv_errno);
Danny Mayer's avatar
Danny Mayer committed
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
		return (DOIO_HARD);
	}

	/*
	 * On TCP, zero length reads indicate EOF, while on
	 * UDP, zero length reads are perfectly valid, although
	 * strange.
	 */
	if ((sock->type == isc_sockettype_tcp) && (cc == 0))
		return (DOIO_EOF);

1408
	if (sock->type == isc_sockettype_udp) {
1409
		dev->address.length = messagehdr->msg_namelen;
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
		if (isc_sockaddr_getport(&dev->address) == 0) {
			if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
				socket_log(sock, &dev->address, IOEVENT,
					   isc_msgcat, ISC_MSGSET_SOCKET,
					   ISC_MSG_ZEROPORT, 
					   "dropping source port zero packet");
			}
			return (DOIO_SOFT);
		}
	}
Danny Mayer's avatar
Danny Mayer committed
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461

	socket_log(sock, &dev->address, IOEVENT,
		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PKTRECV,
		   "packet received correctly");

	/*
	 * Overflow bit detection.  If we received MORE bytes than we should,
	 * this indicates an overflow situation.  Set the flag in the
	 * dev entry and adjust how much we read by one.
	 */
#ifdef ISC_NET_RECVOVERFLOW
	if ((sock->type == isc_sockettype_udp) && ((size_t)cc > read_count)) {
		dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
		cc--;
	}
#endif

	/*
	 * update the buffers (if any) and the i/o count
	 */
	dev->n += cc;
	actual_count = cc;
	buffer = ISC_LIST_HEAD(dev->bufferlist);
	while (buffer != NULL && actual_count > 0) {
		REQUIRE(ISC_BUFFER_VALID(buffer));
		if (isc_buffer_availablelength(buffer) <= actual_count) {
			actual_count -= isc_buffer_availablelength(buffer);
			isc_buffer_add(buffer,
				       isc_buffer_availablelength(buffer));
		} else {
			isc_buffer_add(buffer, actual_count);
			actual_count = 0;
			break;
		}
		buffer = ISC_LIST_NEXT(buffer, link);
		if (buffer == NULL) {
			INSIST(actual_count == 0);
		}
	}

	/*
	 * If we read less than we expected, update counters,
1462
	 * and let the upper layer handle it.
Danny Mayer's avatar
Danny Mayer committed
1463
	 */
1464
	if (((size_t)cc != sock->totalBytes) && (dev->n < dev->minimum))
Danny Mayer's avatar
Danny Mayer committed
1465
1466
1467
1468
1469
1470
1471
1472
		return (DOIO_SOFT);

	/*
	 * Full reads are posted, or partials if partials are ok.
	 */
	dev->result = ISC_R_SUCCESS;
	return (DOIO_SUCCESS);
}