socket.c 103 KB
Newer Older
Danny Mayer's avatar
Danny Mayer committed
1
/*
Tinderbox User's avatar
Tinderbox User committed
2
 * Copyright (C) 2004-2014  Internet Systems Consortium, Inc. ("ISC")
Mark Andrews's avatar
Mark Andrews committed
3
 * Copyright (C) 2000-2003  Internet Software Consortium.
Danny Mayer's avatar
Danny Mayer committed
4
 *
Automatic Updater's avatar
Automatic Updater committed
5
 * Permission to use, copy, modify, and/or distribute this software for any
Danny Mayer's avatar
Danny Mayer committed
6 7 8
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
Mark Andrews's avatar
Mark Andrews committed
9 10 11 12 13 14 15
 * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
 * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
 * PERFORMANCE OF THIS SOFTWARE.
Danny Mayer's avatar
Danny Mayer committed
16 17
 */

Mark Andrews's avatar
Mark Andrews committed
18
/* $Id$ */
19

20 21
/*
 * This code uses functions which are only available on Server 2003 and
 * higher, and Windows XP and higher.
 *
 * This code is by nature multithreaded and takes advantage of various
 * features to pass on information through the completion port for
 * when I/O is completed.  All sends, receives, accepts, and connects are
 * completed through the completion port.
 *
 * The number of Completion Port Worker threads used is the total number
 * of CPUs + 1. This increases the likelihood that a Worker Thread is
 * available for processing a completed request.
 *
 * XXXPDM 5 August, 2002
 */
Danny Mayer's avatar
Danny Mayer committed
34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50

#define MAKE_EXTERNAL 1
#include <config.h>

#include <sys/types.h>

#ifndef _WINSOCKAPI_
#define _WINSOCKAPI_   /* Prevent inclusion of winsock.h in windows.h */
#endif

#include <errno.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <io.h>
#include <fcntl.h>
51
#include <process.h>
Danny Mayer's avatar
Danny Mayer committed
52

53
#include <isc/app.h>
Danny Mayer's avatar
Danny Mayer committed
54 55 56 57 58 59 60 61 62
#include <isc/buffer.h>
#include <isc/bufferlist.h>
#include <isc/condition.h>
#include <isc/list.h>
#include <isc/log.h>
#include <isc/mem.h>
#include <isc/msgs.h>
#include <isc/mutex.h>
#include <isc/net.h>
63
#include <isc/once.h>
64
#include <isc/os.h>
Danny Mayer's avatar
Danny Mayer committed
65 66 67 68
#include <isc/platform.h>
#include <isc/print.h>
#include <isc/region.h>
#include <isc/socket.h>
69
#include <isc/stats.h>
70 71
#include <isc/strerror.h>
#include <isc/syslog.h>
Danny Mayer's avatar
Danny Mayer committed
72 73 74
#include <isc/task.h>
#include <isc/thread.h>
#include <isc/util.h>
75
#include <isc/win32os.h>
Danny Mayer's avatar
Danny Mayer committed
76

77 78
#include <mswsock.h>

79
#include "errno2result.h"
80

Evan Hunt's avatar
Evan Hunt committed
81 82 83 84 85 86 87
/*
 * Set by the -T dscp option on the command line. If set to a value
 * other than -1, we check to make sure DSCP values match it, and
 * assert if not.
 */
int isc_dscp_check_value = -1;

88 89 90 91 92 93 94 95 96 97 98
/*
 * How in the world can Microsoft exist with APIs like this?
 * We can't actually call this directly, because it turns out
 * no library exports this function.  Instead, we need to
 * issue a runtime call to get the address.
 */
LPFN_CONNECTEX ISCConnectEx;
LPFN_ACCEPTEX ISCAcceptEx;
LPFN_GETACCEPTEXSOCKADDRS ISCGetAcceptExSockaddrs;

/*
Francis Dupont's avatar
Francis Dupont committed
99
 * Run expensive internal consistency checks.
100 101 102 103 104 105 106 107
 */
#ifdef ISC_SOCKET_CONSISTENCY_CHECKS
#define CONSISTENT(sock) consistent(sock)
#else
#define CONSISTENT(sock) do {} while (0)
#endif
static void consistent(isc_socket_t *sock);

108 109 110 111 112 113 114
/*
 * Define this macro to control the behavior of connection
 * resets on UDP sockets.  See Microsoft KnowledgeBase Article Q263823
 * for details.
 * NOTE: This requires that Windows 2000 systems install Service Pack 2
 * or later.
 */
115 116
#ifndef SIO_UDP_CONNRESET
#define SIO_UDP_CONNRESET _WSAIOW(IOC_VENDOR,12)
117 118
#endif

Danny Mayer's avatar
Danny Mayer committed
119 120 121 122 123 124 125 126 127 128 129 130
/*
 * Some systems define the socket length argument as an int, some as size_t,
 * some as socklen_t.  This is here so it can be easily changed if needed.
 */
#ifndef ISC_SOCKADDR_LEN_T
#define ISC_SOCKADDR_LEN_T unsigned int
#endif

/*
 * Define what the possible "soft" errors can be.  These are non-fatal returns
 * of various network related functions, like recv() and so on.
 */
131
/* NB: the argument is expanded (and so evaluated) once per comparison. */
#define SOFT_ERROR(e) \
	((e) == WSAEINTR \
	 || (e) == WSAEWOULDBLOCK \
	 || (e) == EWOULDBLOCK \
	 || (e) == EINTR \
	 || (e) == EAGAIN \
	 || (e) == 0)

138 139 140 141 142 143 144
/*
 * Pending errors are not really errors and should be
 * kept separate
 */
#define PENDING_ERROR(e) ((e) == WSA_IO_PENDING || (e) == 0)

/*
 * Status codes describing the outcome of an I/O attempt.
 */
#define DOIO_SUCCESS	0	/* i/o ok, event sent */
#define DOIO_SOFT	1	/* i/o ok, soft error, no event sent */
#define DOIO_HARD	2	/* i/o error, event sent */
#define DOIO_EOF	3	/* EOF, no event sent */
#define DOIO_PENDING	4	/* status when i/o is in process */
#define DOIO_NEEDMORE	5	/* IO was processed, but we need more
				 * due to minimum */
150

Danny Mayer's avatar
Danny Mayer committed
151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173
#define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)

/*
 * DLVL(90)  --  Function entry/exit and other tracing.
 * DLVL(70)  --  Socket "correctness" -- including returning of events, etc.
 * DLVL(60)  --  Socket data send/receive
 * DLVL(50)  --  Event tracing, including receiving/sending completion events.
 * DLVL(20)  --  Socket creation/destruction.
 */
#define TRACE_LEVEL		90
#define CORRECTNESS_LEVEL	70
#define IOEVENT_LEVEL		60
#define EVENT_LEVEL		50
#define CREATION_LEVEL		20

#define TRACE		DLVL(TRACE_LEVEL)
#define CORRECTNESS	DLVL(CORRECTNESS_LEVEL)
#define IOEVENT		DLVL(IOEVENT_LEVEL)
#define EVENT		DLVL(EVENT_LEVEL)
#define CREATION	DLVL(CREATION_LEVEL)

typedef isc_event_t intev_t;

174 175 176 177 178 179 180 181 182 183 184 185 186
/*
 * Socket State
 */
enum {
	SOCK_INITIALIZED,	/* socket structure has been set up */
	SOCK_OPEN,		/* opened, no I/O requested yet */
	SOCK_DATA,		/* sending or receiving data */
	SOCK_LISTEN,		/* TCP: listening for connects */
	SOCK_ACCEPT,		/* TCP: waiting to accept */
	SOCK_CONNECT,		/* TCP: connect in progress */
	SOCK_CLOSED,		/* socket has been closed */
};

187 188
#define SOCKET_MAGIC		ISC_MAGIC('I', 'O', 'i', 'o')
#define VALID_SOCKET(t)		ISC_MAGIC_VALID(t, SOCKET_MAGIC)
Danny Mayer's avatar
Danny Mayer committed
189 190 191 192 193 194 195 196 197 198 199 200 201 202

/*
 * IPv6 control information.  If the socket is an IPv6 socket we want
 * to collect the destination address and interface so the client can
 * set them on outgoing packets.
 */
#ifdef ISC_PLATFORM_HAVEIPV6
#ifndef USE_CMSG
#define USE_CMSG	1
#endif
#endif

/*
 * We really  don't want to try and use these control messages. Win32
203
 * doesn't have this mechanism before XP.
Danny Mayer's avatar
Danny Mayer committed
204 205 206 207 208 209 210 211
 */
#undef USE_CMSG

/*
 * Message header for recvmsg and sendmsg calls.
 * Used value-result for recvmsg, value only for sendmsg.
 */
struct msghdr {
212 213
	SOCKADDR_STORAGE to_addr;	/* UDP send/recv address */
	int      to_addr_len;		/* length of the address */
Automatic Updater's avatar
Automatic Updater committed
214 215 216 217
	WSABUF  *msg_iov;		/* scatter/gather array */
	u_int   msg_iovlen;             /* # elements in msg_iov */
	void	*msg_control;           /* ancillary data, see below */
	u_int   msg_controllen;         /* ancillary data buffer len */
218
	u_int	msg_totallen;		/* total length of this message */
Danny Mayer's avatar
Danny Mayer committed
219
} msghdr;
Automatic Updater's avatar
Automatic Updater committed
220

221 222
/*
 * The size to raise the receive buffer to.
223 224 225
 */
#define RCVBUFSIZE (32*1024)

226
/*
227 228
 * The number of times a send operation is repeated if the result
 * is WSAEINTR.
229 230 231
 */
#define NRETRIES 10

Danny Mayer's avatar
Danny Mayer committed
232 233 234 235 236 237
struct isc_socket {
	/* Not locked. */
	unsigned int		magic;
	isc_socketmgr_t	       *manager;
	isc_mutex_t		lock;
	isc_sockettype_t	type;
238

239 240
	/* Pointers to scatter/gather buffers */
	WSABUF			iov[ISC_SOCKET_MAXSCATTERGATHER];
241

Danny Mayer's avatar
Danny Mayer committed
242 243
	/* Locked by socket lock. */
	ISC_LINK(isc_socket_t)	link;
244 245 246
	unsigned int		references; /* EXTERNAL references */
	SOCKET			fd;	/* file handle */
	int			pf;	/* protocol family */
247 248 249
	char			name[16];
	void *			tag;

250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265
	/*
	 * Each recv() call uses this buffer.  It is a per-socket receive
	 * buffer that allows us to decouple the system recv() from the
	 * recv_list done events.  This means the items on the recv_list
	 * can be removed without having to cancel pending system recv()
	 * calls.  It also allows us to read-ahead in some cases.
	 */
	struct {
		SOCKADDR_STORAGE	from_addr;	   // UDP send/recv address
		int		from_addr_len;	   // length of the address
		char		*base;		   // the base of the buffer
		char		*consume_position; // where to start copying data from next
		unsigned int	len;		   // the actual size of this buffer
		unsigned int	remaining;	   // the number of bytes remaining
	} recvbuf;

Danny Mayer's avatar
Danny Mayer committed
266 267 268 269 270 271 272
	ISC_LIST(isc_socketevent_t)		send_list;
	ISC_LIST(isc_socketevent_t)		recv_list;
	ISC_LIST(isc_socket_newconnev_t)	accept_list;
	isc_socket_connev_t		       *connect_ev;

	isc_sockaddr_t		address;  /* remote address */

273
	unsigned int		listener : 1,	/* listener socket */
Danny Mayer's avatar
Danny Mayer committed
274
				connected : 1,
275
				pending_connect : 1, /* connect pending */
276 277
				bound : 1,	/* bound to local addr */
				dupped : 1;     /* created by isc_socket_dup() */
278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296
	unsigned int		pending_iocp;	/* Should equal the counters below. Debug. */
	unsigned int		pending_recv;  /* Number of outstanding recv() calls. */
	unsigned int		pending_send;  /* Number of outstanding send() calls. */
	unsigned int		pending_accept; /* Number of outstanding accept() calls. */
	unsigned int		state; /* Socket state. Debugging and consistency checking. */
	int			state_lineno;  /* line which last touched state */
};

/* Record a new socket state together with the source line that set it. */
#define _set_state(sock, _state) \
	do { \
		(sock)->state = (_state); \
		(sock)->state_lineno = __LINE__; \
	} while (0)

/*
 * Buffer structure
 */
typedef struct buflist buflist_t;

/* One node in a linked list of data buffers. */
struct buflist {
	void			*buf;		/* the buffer memory */
	unsigned int		buflen;		/* size of buf */
	ISC_LINK(buflist_t)	link;		/* list linkage */
};

299 300 301 302 303 304
/*
 * I/O Completion ports Info structures
 */

static HANDLE hHeapHandle = NULL;
typedef struct IoCompletionInfo {
305 306 307 308 309 310 311 312 313
	OVERLAPPED		overlapped;
	isc_socketevent_t	*dev;  /* send()/recv() done event */
	isc_socket_connev_t	*cdev; /* connect() done event */
	isc_socket_newconnev_t	*adev; /* accept() done event */
	void			*acceptbuffer;
	DWORD			received_bytes;
	int			request_type;
	struct msghdr		messagehdr;
	ISC_LIST(buflist_t)	bufferlist;	/*%< list of buffers */
314 315 316 317
} IoCompletionInfo;

/*
 * Define a maximum number of I/O Completion Port worker threads
318 319
 * to handle the load on the Completion Port. The actual number
 * used is the number of CPU's + 1.
320 321 322
 */
#define MAX_IOCPTHREADS 20

323 324 325
#define SOCKET_MANAGER_MAGIC	ISC_MAGIC('I', 'O', 'm', 'g')
#define VALID_MANAGER(m)	ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)

Danny Mayer's avatar
Danny Mayer committed
326 327
struct isc_socketmgr {
	/* Not locked. */
328 329 330
	unsigned int			magic;
	isc_mem_t		       *mctx;
	isc_mutex_t			lock;
331 332
	isc_stats_t		       *stats;

Danny Mayer's avatar
Danny Mayer committed
333
	/* Locked by manager lock. */
334 335 336 337 338 339 340
	ISC_LIST(isc_socket_t)		socklist;
	isc_boolean_t			bShutdown;
	isc_condition_t			shutdown_ok;
	HANDLE				hIoCompletionPort;
	int				maxIOCPThreads;
	HANDLE				hIOCPThreads[MAX_IOCPTHREADS];
	DWORD				dwIOCPThreadIds[MAX_IOCPTHREADS];
341 342 343 344 345 346 347 348 349 350 351 352 353 354

	/*
	 * Debugging.
	 * Modified by InterlockedIncrement() and InterlockedDecrement()
	 */
	LONG				totalSockets;
	LONG				iocp_total;
};

/*
 * Kinds of request; stored in IoCompletionInfo.request_type.
 */
enum {
	SOCKET_RECV,
	SOCKET_SEND,
	SOCKET_ACCEPT,
	SOCKET_CONNECT
};

/*
 * send() and recv() iovec counts
 */
#define MAXSCATTERGATHER_SEND	(ISC_SOCKET_MAXSCATTERGATHER)
361
#define MAXSCATTERGATHER_RECV	(ISC_SOCKET_MAXSCATTERGATHER)
Danny Mayer's avatar
Danny Mayer committed
362

363 364 365 366
static isc_result_t socket_create(isc_socketmgr_t *manager0, int pf,
				  isc_sockettype_t type,
				  isc_socket_t **socketp,
				  isc_socket_t *dup_socket);
367
static isc_threadresult_t WINAPI SocketIoThread(LPVOID ThreadContext);
368 369 370 371 372 373 374 375 376 377 378 379
static void maybe_free_socket(isc_socket_t **, int);
static void free_socket(isc_socket_t **, int);
static isc_boolean_t senddone_is_active(isc_socket_t *sock, isc_socketevent_t *dev);
static isc_boolean_t acceptdone_is_active(isc_socket_t *sock, isc_socket_newconnev_t *dev);
static isc_boolean_t connectdone_is_active(isc_socket_t *sock, isc_socket_connev_t *dev);
static void send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev);
static void send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev);
static void send_acceptdone_event(isc_socket_t *sock, isc_socket_newconnev_t **adev);
static void send_connectdone_event(isc_socket_t *sock, isc_socket_connev_t **cdev);
static void send_recvdone_abort(isc_socket_t *sock, isc_result_t result);
static void queue_receive_event(isc_socket_t *sock, isc_task_t *task, isc_socketevent_t *dev);
static void queue_receive_request(isc_socket_t *sock);
380 381

/*
382
 * This is used to dump the contents of the sock structure
383 384 385 386 387 388 389 390
 * You should make sure that the sock is locked before
 * dumping it. Since the code uses simple printf() statements
 * it should only be used interactively.
 */
/*
 * Print the interesting fields of 'sock' and the events queued on its
 * recv, send and accept lists to stdout.
 */
void
sock_dump(isc_socket_t *sock) {
	isc_socketevent_t *ldev;
	isc_socket_newconnev_t *ndev;

#if 0
	isc_sockaddr_t addr;
	char socktext[256];

	isc_socket_getpeername(sock, &addr);
	isc_sockaddr_format(&addr, socktext, sizeof(socktext));
	printf("Remote Socket: %s\n", socktext);
	isc_socket_getsockname(sock, &addr);
	isc_sockaddr_format(&addr, socktext, sizeof(socktext));
	printf("This Socket: %s\n", socktext);
#endif

	printf("\n\t\tSock Dump\n");
	/*
	 * SOCKET is pointer-sized; cast so the argument matches "%u".
	 * NOTE(review): assumes handle values fit in 32 bits -- confirm.
	 */
	printf("\t\tfd: %u\n", (unsigned int)sock->fd);
	printf("\t\treferences: %u\n", sock->references);
	printf("\t\tpending_accept: %u\n", sock->pending_accept);
	printf("\t\tconnecting: %d\n", sock->pending_connect);
	printf("\t\tconnected: %d\n", sock->connected);
	printf("\t\tbound: %d\n", sock->bound);
	printf("\t\tpending_iocp: %u\n", sock->pending_iocp);
	printf("\t\tsocket type: %d\n", sock->type);

	printf("\n\t\tSock Recv List\n");
	ldev = ISC_LIST_HEAD(sock->recv_list);
	while (ldev != NULL) {
		printf("\t\tdev: %p\n", (void *)ldev);
		ldev = ISC_LIST_NEXT(ldev, ev_link);
	}

	printf("\n\t\tSock Send List\n");
	ldev = ISC_LIST_HEAD(sock->send_list);
	while (ldev != NULL) {
		printf("\t\tdev: %p\n", (void *)ldev);
		ldev = ISC_LIST_NEXT(ldev, ev_link);
	}

	printf("\n\t\tSock Accept List\n");
	ndev = ISC_LIST_HEAD(sock->accept_list);
	while (ndev != NULL) {
		/* Print the accept event itself, not the stale send cursor. */
		printf("\t\tdev: %p\n", (void *)ndev);
		ndev = ISC_LIST_NEXT(ndev, ev_link);
	}
}
435 436 437 438 439 440

static void
socket_log(int lineno, isc_socket_t *sock, isc_sockaddr_t *address,
	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
	   isc_msgcat_t *msgcat, int msgset, int message,
	   const char *fmt, ...) ISC_FORMAT_PRINTF(9, 10);
441 442 443 444 445 446 447 448 449 450

/*  This function will add an entry to the I/O completion port
 *  that will signal the I/O thread to exit (gracefully)
 */
static void
signal_iocompletionport_exit(isc_socketmgr_t *manager) {
	int remaining;

	REQUIRE(VALID_MANAGER(manager));

	/*
	 * Post one empty completion packet (zero bytes, zero key, no
	 * OVERLAPPED) for each of the manager's maxIOCPThreads workers.
	 */
	for (remaining = manager->maxIOCPThreads; remaining > 0; remaining--) {
		char strbuf[ISC_STRERRORSIZE];
		int errval;

		if (PostQueuedCompletionStatus(manager->hIoCompletionPort,
					       0, 0, 0))
			continue;

		errval = GetLastError();
		isc__strerror(errval, strbuf, sizeof(strbuf));
		FATAL_ERROR(__FILE__, __LINE__,
			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
					   ISC_MSG_FAILED,
					   "Can't request service thread to exit: %s"),
			    strbuf);
	}
}

/*
 * Create the worker threads for the I/O Completion Port
 */
/*
 * Start 'total_threads' worker threads running SocketIoThread() with
 * 'manager' as their context, recording each handle and thread id in
 * the manager.  'total_threads' must be between 1 and MAX_IOCPTHREADS
 * inclusive, since that is the capacity of the manager's arrays.
 * Failure to create any thread is fatal.
 */
void
iocompletionport_createthreads(int total_threads, isc_socketmgr_t *manager) {
	int errval;
	char strbuf[ISC_STRERRORSIZE];
	int i;

	/* We need at least one thread ... */
	INSIST(total_threads > 0);
	REQUIRE(VALID_MANAGER(manager));
	/* ... and no more than hIOCPThreads[]/dwIOCPThreadIds[] can hold. */
	INSIST(total_threads <= MAX_IOCPTHREADS);

	for (i = 0; i < total_threads; i++) {
		manager->hIOCPThreads[i] = CreateThread(NULL, 0, SocketIoThread,
						manager, 0,
						&manager->dwIOCPThreadIds[i]);
		if (manager->hIOCPThreads[i] == NULL) {
			errval = GetLastError();
			isc__strerror(errval, strbuf, sizeof(strbuf));
			FATAL_ERROR(__FILE__, __LINE__,
				isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
				ISC_MSG_FAILED,
				"Can't create IOCP thread: %s"),
				strbuf);
			exit(1);
		}
	}
}

497
/*
498
 *  Create/initialise the I/O completion port
499
 */
500 501 502 503
void
iocompletionport_init(isc_socketmgr_t *manager) {
	REQUIRE(VALID_MANAGER(manager));

	/*
	 * Private heap for the per-request IoCompletionInfo structures:
	 * initially sized for 10 of them, with no upper limit.
	 */
	hHeapHandle = HeapCreate(0, 10 * sizeof(IoCompletionInfo), 0);
	if (hHeapHandle == NULL) {
		char strbuf[ISC_STRERRORSIZE];
		int errval = GetLastError();

		isc__strerror(errval, strbuf, sizeof(strbuf));
		FATAL_ERROR(__FILE__, __LINE__,
			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
					   ISC_MSG_FAILED,
					   "HeapCreate() failed during "
					   "initialization: %s"),
			    strbuf);
		exit(1);
	}

	/* One worker per CPU plus one, capped at MAX_IOCPTHREADS. */
	manager->maxIOCPThreads = min(isc_os_ncpus() + 1, MAX_IOCPTHREADS);

	/* Create a completion port not yet bound to any handle. */
	manager->hIoCompletionPort =
		CreateIoCompletionPort(INVALID_HANDLE_VALUE, NULL, 0,
				       manager->maxIOCPThreads);
	if (manager->hIoCompletionPort == NULL) {
		char strbuf[ISC_STRERRORSIZE];
		int errval = GetLastError();

		isc__strerror(errval, strbuf, sizeof(strbuf));
		FATAL_ERROR(__FILE__, __LINE__,
			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
					   ISC_MSG_FAILED,
					   "CreateIoCompletionPort() failed "
					   "during initialization: %s"),
			    strbuf);
		exit(1);
	}

	/* Start the worker threads that service the port. */
	iocompletionport_createthreads(manager->maxIOCPThreads, manager);
}
546

547
/*
548 549
 * Associate a socket with an IO Completion Port.  This allows us to queue events for it
 * and have our worker pool of threads process them.
550
 */
551
void
552 553
iocompletionport_update(isc_socket_t *sock) {
	HANDLE hiocp;
554
	char strbuf[ISC_STRERRORSIZE];
555

556
	REQUIRE(VALID_SOCKET(sock));
557

558 559 560 561 562 563
	hiocp = CreateIoCompletionPort((HANDLE)sock->fd,
		sock->manager->hIoCompletionPort, (ULONG_PTR)sock, 0);

	if (hiocp == NULL) {
		DWORD errval = GetLastError();
		isc__strerror(errval, strbuf, sizeof(strbuf));
564 565 566 567
		isc_log_iwrite(isc_lctx,
				ISC_LOGCATEGORY_GENERAL,
				ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
				isc_msgcat, ISC_MSGSET_SOCKET,
Automatic Updater's avatar
Automatic Updater committed
568
				ISC_MSG_TOOMANYHANDLES,
569 570 571 572 573 574 575 576 577 578 579 580 581 582 583
				"iocompletionport_update: failed to open"
				" io completion port: %s",
				strbuf);

		/* XXXMLG temporary hack to make failures detected.
		 * This function should return errors to the caller, not
		 * exit here.
		 */
		FATAL_ERROR(__FILE__, __LINE__,
				isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
				ISC_MSG_FAILED,
				"CreateIoCompletionPort() failed "
				"during initialization: %s"),
				strbuf);
		exit(1);
584 585
	}

586
	InterlockedIncrement(&sock->manager->iocp_total);
587
}
588

589
/*
590 591 592
 * Routine to cleanup and then close the socket.
 * Only close the socket here if it is NOT associated
 * with an event, otherwise the WSAWaitForMultipleEvents
593
 * may fail due to the fact that the Wait should not
594 595
 * be running while closing an event or a socket.
 * The socket is locked before calling this function
596
 */
597 598
void
socket_close(isc_socket_t *sock) {
	REQUIRE(sock != NULL);

	/* Already closed (or never opened): nothing to do. */
	if (sock->fd == INVALID_SOCKET)
		return;

	closesocket(sock->fd);
	sock->fd = INVALID_SOCKET;
	_set_state(sock, SOCK_CLOSED);
	InterlockedDecrement(&sock->manager->totalSockets);
}
609

610 611
static isc_once_t initialise_once = ISC_ONCE_INIT;
static isc_boolean_t initialised = ISC_FALSE;
612

613 614 615 616 617 618 619 620 621 622
static void
initialise(void) {
	WORD wVersionRequested;
	WSADATA wsaData;
	int err;
	SOCKET sock;
	GUID GUIDConnectEx = WSAID_CONNECTEX;
	GUID GUIDAcceptEx = WSAID_ACCEPTEX;
	GUID GUIDGetAcceptExSockaddrs = WSAID_GETACCEPTEXSOCKADDRS;
	DWORD dwBytes;
623

624 625 626 627 628 629 630 631 632 633 634 635
	/* Need Winsock 2.2 or better */
	wVersionRequested = MAKEWORD(2, 2);

	err = WSAStartup(wVersionRequested, &wsaData);
	if (err != 0) {
		char strbuf[ISC_STRERRORSIZE];
		isc__strerror(err, strbuf, sizeof(strbuf));
		FATAL_ERROR(__FILE__, __LINE__, "WSAStartup() %s: %s",
			    isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
					   ISC_MSG_FAILED, "failed"),
			    strbuf);
		exit(1);
636 637
	}
	/*
638 639 640 641
	 * The following APIs do not exist as functions in a library, but we must
	 * ask winsock for them.  They are "extensions" -- but why they cannot be
	 * actual functions is beyond me.  So, ask winsock for the pointers to the
	 * functions we need.
642
	 */
643 644 645 646 647 648 649
	sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
	INSIST(sock != INVALID_SOCKET);
	err = WSAIoctl(sock,  SIO_GET_EXTENSION_FUNCTION_POINTER,
		 &GUIDConnectEx, sizeof(GUIDConnectEx),
		 &ISCConnectEx, sizeof(ISCConnectEx),
		 &dwBytes, NULL, NULL);
	INSIST(err == 0);
650

651 652 653 654 655
	err = WSAIoctl(sock,  SIO_GET_EXTENSION_FUNCTION_POINTER,
		 &GUIDAcceptEx, sizeof(GUIDAcceptEx),
		 &ISCAcceptEx, sizeof(ISCAcceptEx),
		 &dwBytes, NULL, NULL);
	INSIST(err == 0);
656

657 658 659 660 661
	err = WSAIoctl(sock,  SIO_GET_EXTENSION_FUNCTION_POINTER,
		 &GUIDGetAcceptExSockaddrs, sizeof(GUIDGetAcceptExSockaddrs),
		 &ISCGetAcceptExSockaddrs, sizeof(ISCGetAcceptExSockaddrs),
		 &dwBytes, NULL, NULL);
	INSIST(err == 0);
662

663
	closesocket(sock);
664

665
	initialised = ISC_TRUE;
666
}
667

668
/*
669
 * Initialize socket services
670
 */
671 672 673 674 675 676 677 678 679 680 681
void
InitSockets(void) {
	/* Run initialise() at most once, however often we are called. */
	RUNTIME_CHECK(isc_once_do(&initialise_once,
				  initialise) == ISC_R_SUCCESS);

	/* Give up if the one-time initialisation did not complete. */
	if (!initialised) {
		exit(1);
	}
}

/*
 * Issue a WSASendTo() on 'sock' for the buffers and destination address
 * described by 'messagehdr', passing 'lpo' as the overlapped structure.
 *
 * '*Error' is set to 0, or to the WSAGetLastError() value when
 * WSASendTo() reports SOCKET_ERROR.
 *
 * Returns:
 *	-1	WSASendTo() failed with an error other than one of the
 *		"still in progress" codes; the socket counters are not
 *		touched.
 *	 0	the request was accepted and 'lpo' is non-NULL.
 *	 n	the byte count reported by WSASendTo() when 'lpo' is NULL
 *		(0 if the call did not report one).
 *
 * On every non-failure path pending_iocp and pending_send are
 * incremented.
 */
int
internal_sendmsg(isc_socket_t *sock, IoCompletionInfo *lpo,
		 struct msghdr *messagehdr, int flags, int *Error)
{
	int Result;
	/*
	 * Start at zero: WSASendTo() does not necessarily store a count
	 * when it fails or when the operation is still pending.
	 */
	DWORD BytesSent = 0;
	DWORD Flags = flags;

	*Error = 0;
	Result = WSASendTo(sock->fd, messagehdr->msg_iov,
			   messagehdr->msg_iovlen, &BytesSent,
			   Flags, (SOCKADDR *)&messagehdr->to_addr,
			   messagehdr->to_addr_len, (LPWSAOVERLAPPED)lpo,
			   NULL);

	/* Check for errors.*/
	if (Result == SOCKET_ERROR) {
		*Error = WSAGetLastError();

		switch (*Error) {
		case WSA_IO_INCOMPLETE:
		case WSA_WAIT_IO_COMPLETION:
		case WSA_IO_PENDING:
		case NO_ERROR:		/* Strange, but okay */
			/* Not a failure: counted as outstanding below. */
			break;

		default:
			return (-1);
		}
	}

	/* Immediate success and "pending" are accounted identically. */
	sock->pending_iocp++;
	sock->pending_send++;

	if (lpo != NULL)
		return (0);
	return ((int)BytesSent);
}

static void
queue_receive_request(isc_socket_t *sock) {
	DWORD Flags = 0;
	DWORD NumBytes = 0;
	int Result;
	int Error;
731
	int need_retry;
732
	WSABUF iov[1];
733
	IoCompletionInfo *lpo = NULL;
734 735
	isc_result_t isc_result;

736 737 738
 retry:
	need_retry = ISC_FALSE;

739
	/*
740
	 * If we already have a receive pending, do nothing.
741
	 */
742 743 744
	if (sock->pending_recv > 0) {
		if (lpo != NULL)
			HeapFree(hHeapHandle, 0, lpo);
745
		return;
746
	}
747

748 749 750
	/*
	 * If no one is waiting, do nothing.
	 */
751 752 753
	if (ISC_LIST_EMPTY(sock->recv_list)) {
		if (lpo != NULL)
			HeapFree(hHeapHandle, 0, lpo);
754
		return;
755
	}
756

757 758
	INSIST(sock->recvbuf.remaining == 0);
	INSIST(sock->fd != INVALID_SOCKET);
759

760 761
	iov[0].len = sock->recvbuf.len;
	iov[0].buf = sock->recvbuf.base;
762

763 764 765 766 767 768 769
	if (lpo == NULL) {
		lpo = (IoCompletionInfo *)HeapAlloc(hHeapHandle,
						    HEAP_ZERO_MEMORY,
						    sizeof(IoCompletionInfo));
		RUNTIME_CHECK(lpo != NULL);
	} else
		ZeroMemory(lpo, sizeof(IoCompletionInfo));
770
	lpo->request_type = SOCKET_RECV;
771

772
	sock->recvbuf.from_addr_len = sizeof(sock->recvbuf.from_addr);
773

774 775 776 777 778 779
	Error = 0;
	Result = WSARecvFrom((SOCKET)sock->fd, iov, 1,
			     &NumBytes, &Flags,
			     (SOCKADDR *)&sock->recvbuf.from_addr,
			     &sock->recvbuf.from_addr_len,
			     (LPWSAOVERLAPPED)lpo, NULL);
780

781 782 783
	/* Check for errors. */
	if (Result == SOCKET_ERROR) {
		Error = WSAGetLastError();
784

785 786 787 788 789
		switch (Error) {
		case WSA_IO_PENDING:
			sock->pending_iocp++;
			sock->pending_recv++;
			break;
790

791
		/* direct error: no completion event */
792 793 794
		case ERROR_HOST_UNREACHABLE:
		case WSAENETRESET:
		case WSAECONNRESET:
795 796 797
			if (!sock->connected) {
				/* soft error */
				need_retry = ISC_TRUE;
798
				break;
799 800
			}
			/* FALLTHROUGH */
801

802 803
		default:
			isc_result = isc__errno2result(Error);
804
			if (isc_result == ISC_R_UNEXPECTED)
805 806 807 808
				UNEXPECTED_ERROR(__FILE__, __LINE__,
					"WSARecvFrom: Windows error code: %d, isc result %d",
					Error, isc_result);
			send_recvdone_abort(sock, isc_result);
809 810
			HeapFree(hHeapHandle, 0, lpo);
			lpo = NULL;
811 812
			break;
		}
813 814 815 816 817 818 819
	} else {
		/*
		 * The recv() finished immediately, but we will still get
		 * a completion event.  Rather than duplicate code, let
		 * that thread handle sending the data along its way.
		 */
		sock->pending_iocp++;
820
		sock->pending_recv++;
821
	}
822

823 824 825 826 827 828 829
	socket_log(__LINE__, sock, NULL, IOEVENT,
		   isc_msgcat, ISC_MSGSET_SOCKET,
		   ISC_MSG_DOIORECV,
		   "queue_io_request: fd %d result %d error %d",
		   sock->fd, Result, Error);

	CONSISTENT(sock);
830

831 832
	if (need_retry)
		goto retry;
833
}
Danny Mayer's avatar
Danny Mayer committed
834 835

static void
836 837 838
manager_log(isc_socketmgr_t *sockmgr, isc_logcategory_t *category,
	    isc_logmodule_t *module, int level, const char *fmt, ...)
{
Danny Mayer's avatar
Danny Mayer committed
839 840 841
	char msgbuf[2048];
	va_list ap;

842
	if (!isc_log_wouldlog(isc_lctx, level))
Danny Mayer's avatar
Danny Mayer committed
843 844 845 846 847 848 849 850 851 852
		return;

	va_start(ap, fmt);
	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
	va_end(ap);

	isc_log_write(isc_lctx, category, module, level,
		      "sockmgr %p: %s", sockmgr, msgbuf);
}

853
static void
854
socket_log(int lineno, isc_socket_t *sock, isc_sockaddr_t *address,
Danny Mayer's avatar
Danny Mayer committed
855 856
	   isc_logcategory_t *category, isc_logmodule_t *module, int level,
	   isc_msgcat_t *msgcat, int msgset, int message,
857 858
	   const char *fmt, ...)
{
Danny Mayer's avatar
Danny Mayer committed
859 860 861 862
	char msgbuf[2048];
	char peerbuf[256];
	va_list ap;

863 864

	if (!isc_log_wouldlog(isc_lctx, level))
Danny Mayer's avatar
Danny Mayer committed
865 866 867 868 869 870 871 872 873
		return;

	va_start(ap, fmt);
	vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
	va_end(ap);

	if (address == NULL) {
		isc_log_iwrite(isc_lctx, category, module, level,
			       msgcat, msgset, message,
874
			       "socket %p line %d: %s", sock, lineno, msgbuf);
875
	} else {
Mark Andrews's avatar
Mark Andrews committed
876
		isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
877 878
		isc_log_iwrite(isc_lctx, category, module, level,
			       msgcat, msgset, message,
879 880
				   "socket %p line %d peer %s: %s", sock, lineno,
				   peerbuf, msgbuf);
Danny Mayer's avatar
Danny Mayer committed
881
	}
882

Danny Mayer's avatar
Danny Mayer committed
883
}
884

Danny Mayer's avatar
Danny Mayer committed
885
/*
886
 * Make an fd SOCKET non-blocking.
Danny Mayer's avatar
Danny Mayer committed
887 888
 */
static isc_result_t
889
make_nonblock(SOCKET fd) {
Danny Mayer's avatar
Danny Mayer committed
890 891
	int ret;
	unsigned long flags = 1;
892
	char strbuf[ISC_STRERRORSIZE];
Danny Mayer's avatar
Danny Mayer committed
893 894

	/* Set the socket to non-blocking */
895
	ret = ioctlsocket(fd, FIONBIO, &flags);
Danny Mayer's avatar
Danny Mayer committed
896 897

	if (ret == -1) {
898
		isc__strerror(errno, strbuf, sizeof(strbuf));
Danny Mayer's avatar
Danny Mayer committed
899 900
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "ioctlsocket(%d, FIOBIO, %d): %s",
901
				 fd, flags, strbuf);
Danny Mayer's avatar
Danny Mayer committed
902 903 904 905 906 907

		return (ISC_R_UNEXPECTED);
	}

	return (ISC_R_SUCCESS);
}
908

Danny Mayer's avatar
Danny Mayer committed
909
/*
 * Windows 2000 systems incorrectly cause UDP sockets using WSARecvFrom
 * to not work correctly, returning a WSAECONNRESET error when a WSASendTo
 * fails with an "ICMP port unreachable" response and preventing the
 * socket from using the WSARecvFrom in subsequent operations.
 * The function below fixes this, but requires that Windows 2000
 * Service Pack 2 or later be installed on the system.  NT 4.0
 * systems are not affected by this and work correctly.
 * See Microsoft Knowledge Base Article Q263823 for details of this.
 */
919 920 921 922 923 924
isc_result_t
connection_reset_fix(SOCKET fd) {
	BOOL  bNewBehavior = FALSE;
	DWORD dwBytesReturned = 0;
	DWORD status;

	/* NT 4.0 has no problem */
	if (isc_win32os_majorversion() < 5)
		return (ISC_R_SUCCESS);

	/* disable bad behavior using IOCTL: SIO_UDP_CONNRESET */
	status = WSAIoctl(fd, SIO_UDP_CONNRESET, &bNewBehavior,
			  sizeof(bNewBehavior), NULL, 0,
			  &dwBytesReturned, NULL, NULL);
	if (status == SOCKET_ERROR) {
		UNEXPECTED_ERROR(__FILE__, __LINE__,
				 "WSAIoctl(SIO_UDP_CONNRESET, oldBehaviour) %s",
				 isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
						ISC_MSG_FAILED, "failed"));
		return (ISC_R_UNEXPECTED);
	}

	return (ISC_R_SUCCESS);
}

/*
 * Construct an iov array and attach it to the msghdr passed in.  This is
 * the SEND constructor, which will use the used region of the buffer
 * (if using a buffer list) or will use the internal region (if a single
 * buffer I/O is requested).
 *
 * The data that still has to be sent (everything after the first dev->n
 * bytes) is copied into freshly HeapAlloc'ed buflist_t entries, and iov[]
 * points at those copies rather than at the caller's buffers.  Every copy
 * is queued on lpo->bufferlist so that it remains reachable once this
 * function returns; presumably the send completion path drains that list
 * and frees the entries (not visible from here - confirm against caller).
 *
 * On return msg->msg_iov / msg->msg_iovlen describe the iov array and
 * msg->msg_totallen holds the total number of bytes queued.
 *
 * 'cmsg' is accepted but not used by this function.
 *
 * Nothing can be NULL, and the done event must list at least one buffer
 * on the buffer linked list for this function to be meaningful.
 */
static void
build_msghdr_send(isc_socket_t *sock, isc_socketevent_t *dev,
		  struct msghdr *msg, char *cmsg, WSABUF *iov,
		  IoCompletionInfo  *lpo)
{
	unsigned int iovcount;
	isc_buffer_t *buffer;
	buflist_t  *cpbuffer;
	isc_region_t used;
	size_t write_count;
	size_t skip_count;

	memset(msg, 0, sizeof(*msg));

	/* Record the destination address for the send. */
	memmove(&msg->to_addr, &dev->address.type, dev->address.length);
	msg->to_addr_len = dev->address.length;

	buffer = ISC_LIST_HEAD(dev->bufferlist);
	write_count = 0;
	iovcount = 0;

	/*
	 * Single buffer I/O?  Skip what we've done so far in this region.
	 */
	if (buffer == NULL) {
		write_count = dev->region.length - dev->n;
		cpbuffer = HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY, sizeof(buflist_t));
		RUNTIME_CHECK(cpbuffer != NULL);
		cpbuffer->buf = HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY, write_count);
		RUNTIME_CHECK(cpbuffer->buf != NULL);

		/* Sizes are cast so the arguments really match "%d". */
		socket_log(__LINE__, sock, NULL, TRACE,
		   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
		   "alloc_buffer %p %d %p %d", cpbuffer,
		   (int)sizeof(buflist_t), cpbuffer->buf, (int)write_count);

		memmove(cpbuffer->buf,(dev->region.base + dev->n), write_count);
		cpbuffer->buflen = (unsigned int)write_count;
		ISC_LIST_ENQUEUE(lpo->bufferlist, cpbuffer, link);
		iov[0].buf = cpbuffer->buf;
		iov[0].len = (u_long)write_count;
		iovcount = 1;

		goto config;
	}

	/*
	 * Multibuffer I/O.
	 * Skip the data in the buffer list that we have already written.
	 */
	skip_count = dev->n;
	while (buffer != NULL) {
		REQUIRE(ISC_BUFFER_VALID(buffer));
		if (skip_count < isc_buffer_usedlength(buffer))
			break;
		skip_count -= isc_buffer_usedlength(buffer);
		buffer = ISC_LIST_NEXT(buffer, link);
	}

	/*
	 * Copy the remainder of each buffer.  Only the first buffer copied
	 * can have a non-zero skip_count; it is reset after that.
	 */
	while (buffer != NULL) {
		INSIST(iovcount < MAXSCATTERGATHER_SEND);

		isc_buffer_usedregion(buffer, &used);

		if (used.length > 0) {
			int uselen = (int)(used.length - skip_count);
			cpbuffer = HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY, sizeof(buflist_t));
			RUNTIME_CHECK(cpbuffer != NULL);
			cpbuffer->buf = HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY, uselen);
			RUNTIME_CHECK(cpbuffer->buf != NULL);

			/* Log the size of this allocation, not the running total. */
			socket_log(__LINE__, sock, NULL, TRACE,
			   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
			   "alloc_buffer %p %d %p %d", cpbuffer,
			   (int)sizeof(buflist_t), cpbuffer->buf, uselen);

			memmove(cpbuffer->buf,(used.base + skip_count), uselen);
			cpbuffer->buflen = uselen;
			/*
			 * Keep the copy reachable, exactly as the single
			 * buffer path does; otherwise the buflist_t is lost
			 * as soon as cpbuffer is reassigned.
			 */
			ISC_LIST_ENQUEUE(lpo->bufferlist, cpbuffer, link);
			iov[iovcount].buf = cpbuffer->buf;
			iov[iovcount].len = (u_long)uselen;
			write_count += uselen;
			skip_count = 0;
			iovcount++;
		}
		buffer = ISC_LIST_NEXT(buffer, link);
	}

	INSIST(skip_count == 0);

 config:
	msg->msg_iov = iov;
	msg->msg_iovlen = iovcount;
	msg->msg_totallen = (u_int)write_count;
}

static void
set_dev_address(isc_sockaddr_t *address, isc_socket_t *sock,
1049 1050
		isc_socketevent_t *dev)
{
Danny Mayer's avatar
Danny Mayer committed
1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061
	if (sock->type == isc_sockettype_udp) {
		if (address != NULL)
			dev->address = *address;
		else
			dev->address = sock->address;
	} else if (sock->type == isc_sockettype_tcp) {
		INSIST(address == NULL);
		dev->address = sock->address;
	}
}

1062 1063 1064 1065 1066 1067 1068 1069 1070
/*
 * Destructor for socket events.  The event's buffer list must already
 * be empty; the event is then handed to the destroy routine stored in
 * the socket event itself.
 */
static void
destroy_socketevent(isc_event_t *event) {
	isc_socketevent_t *sockev;

	sockev = (isc_socketevent_t *)event;
	INSIST(ISC_LIST_EMPTY(sockev->bufferlist));

	(sockev->destroy)(event);
}

Danny Mayer's avatar
Danny Mayer committed
1071
static isc_socketevent_t *
Evan Hunt's avatar
Evan Hunt committed
1072 1073
allocate_socketevent(isc_mem_t *mctx, isc_socket_t *sock,
		     isc_eventtype_t eventtype, isc_taskaction_t action,
Mark Andrews's avatar