rbtdb.c 283 KB
Newer Older
Bob Halley's avatar
Bob Halley committed
1
/*
2
 * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
3
 *
4 5 6
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 8 9
 *
 * See the COPYRIGHT file distributed with this work for additional
 * information regarding copyright ownership.
Bob Halley's avatar
Bob Halley committed
10 11
 */

12
/*! \file */
David Lawrence's avatar
David Lawrence committed
13

Mark Andrews's avatar
Mark Andrews committed
14
/* #define inline */
15

16
#include <inttypes.h>
17
#include <stdbool.h>
Mark Andrews's avatar
Mark Andrews committed
18

Mark Andrews's avatar
Mark Andrews committed
19
#include <isc/atomic.h>
20
#include <isc/crc64.h>
21
#include <isc/event.h>
22
#include <isc/file.h>
23 24
#include <isc/hash.h>
#include <isc/heap.h>
Evan Hunt's avatar
Evan Hunt committed
25
#include <isc/hex.h>
26
#include <isc/mem.h>
27
#include <isc/mutex.h>
28
#include <isc/once.h>
29
#include <isc/platform.h>
30
#include <isc/print.h>
31
#include <isc/random.h>
32
#include <isc/refcount.h>
Bob Halley's avatar
Bob Halley committed
33
#include <isc/rwlock.h>
34
#include <isc/serial.h>
35 36
#include <isc/socket.h>
#include <isc/stdio.h>
37
#include <isc/string.h>
38
#include <isc/task.h>
39
#include <isc/time.h>
Michael Graff's avatar
Michael Graff committed
40
#include <isc/util.h>
Bob Halley's avatar
Bob Halley committed
41

42
#include <dns/callbacks.h>
43 44
#include <dns/db.h>
#include <dns/dbiterator.h>
45
#include <dns/events.h>
46
#include <dns/fixedname.h>
47
#include <dns/lib.h>
48
#include <dns/log.h>
49
#include <dns/masterdump.h>
50
#include <dns/nsec.h>
51
#include <dns/nsec3.h>
52
#include <dns/rbt.h>
53
#include <dns/rdata.h>
Bob Halley's avatar
Bob Halley committed
54 55
#include <dns/rdataset.h>
#include <dns/rdatasetiter.h>
56
#include <dns/rdataslab.h>
57
#include <dns/rdatastruct.h>
58
#include <dns/result.h>
59
#include <dns/stats.h>
60
#include <dns/time.h>
61 62
#include <dns/view.h>
#include <dns/zone.h>
63
#include <dns/zonekey.h>
Bob Halley's avatar
Bob Halley committed
64

Evan Hunt's avatar
Evan Hunt committed
65
#ifndef WIN32
66
#include <sys/mman.h>
67
#else /* ifndef WIN32 */
Evan Hunt's avatar
Evan Hunt committed
68 69
#define PROT_READ   0x01
#define PROT_WRITE  0x02
70
#define MAP_PRIVATE 0x0002
Evan Hunt's avatar
Evan Hunt committed
71
#define MAP_FAILED  ((void *)-1)
72
#endif /* ifndef WIN32 */
73

Bob Halley's avatar
Bob Halley committed
74 75
#include "rbtdb.h"

76
#define RBTDB_MAGIC ISC_MAGIC('R', 'B', 'D', '4')
77

78 79 80 81 82
#define CHECK(op)                            \
	do {                                 \
		result = (op);               \
		if (result != ISC_R_SUCCESS) \
			goto failure;        \
83 84 85
	} while (0)

/*
Evan Hunt's avatar
Evan Hunt committed
86
 * This is the map file header for RBTDB images.  It is populated, and then
87 88 89 90 91 92 93
 * written, as the LAST thing done to the file.  Writing this last (with
 * zeros in the header area initially) will ensure that the header is only
 * valid when the RBTDB image is also valid.
 */
typedef struct rbtdb_file_header rbtdb_file_header_t;

/* Header length, always the same size regardless of structure size */
94
#define RBTDB_HEADER_LENGTH 1024
95 96

struct rbtdb_file_header {
Evan Hunt's avatar
Evan Hunt committed
97 98
	char version1[32];
	uint32_t ptrsize;
99
	unsigned int bigendian : 1;
Evan Hunt's avatar
Evan Hunt committed
100 101 102
	uint64_t tree;
	uint64_t nsec;
	uint64_t nsec3;
103 104

	char version2[32]; /* repeated; must match version1 */
105 106
};

107
/*%
108 109 110
 * Note that "impmagic" is not the first four bytes of the struct, so
 * ISC_MAGIC_VALID cannot be used.
 */
111 112
#define VALID_RBTDB(rbtdb) \
	((rbtdb) != NULL && (rbtdb)->common.impmagic == RBTDB_MAGIC)
Bob Halley's avatar
Bob Halley committed
113

114 115
typedef uint32_t rbtdb_serial_t;
typedef uint32_t rbtdb_rdatatype_t;
Bob Halley's avatar
Bob Halley committed
116

117
#define RBTDB_RDATATYPE_BASE(type) ((dns_rdatatype_t)((type)&0xFFFF))
Evan Hunt's avatar
Evan Hunt committed
118
#define RBTDB_RDATATYPE_EXT(type)  ((dns_rdatatype_t)((type) >> 16))
119 120 121
#define RBTDB_RDATATYPE_VALUE(base, ext)              \
	((rbtdb_rdatatype_t)(((uint32_t)ext) << 16) | \
	 (((uint32_t)base) & 0xffff))
Bob Halley's avatar
Bob Halley committed
122

123
#define RBTDB_RDATATYPE_SIGNSEC \
124
	RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_nsec)
125
#define RBTDB_RDATATYPE_SIGNSEC3 \
126
	RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_nsec3)
Bob Halley's avatar
Bob Halley committed
127
#define RBTDB_RDATATYPE_SIGNS \
128
	RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_ns)
Bob Halley's avatar
Bob Halley committed
129
#define RBTDB_RDATATYPE_SIGCNAME \
130
	RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_cname)
131
#define RBTDB_RDATATYPE_SIGDNAME \
132
	RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_dname)
133
#define RBTDB_RDATATYPE_SIGDS \
134
	RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_ds)
135 136
#define RBTDB_RDATATYPE_SIGSOA \
	RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_soa)
137
#define RBTDB_RDATATYPE_NCACHEANY RBTDB_RDATATYPE_VALUE(0, dns_rdatatype_any)
Bob Halley's avatar
Bob Halley committed
138

Evan Hunt's avatar
Evan Hunt committed
139
#define RBTDB_INITLOCK(l)    isc_rwlock_init((l), 0, 0)
140
#define RBTDB_DESTROYLOCK(l) isc_rwlock_destroy(l)
Evan Hunt's avatar
Evan Hunt committed
141 142
#define RBTDB_LOCK(l, t)     RWLOCK((l), (t))
#define RBTDB_UNLOCK(l, t)   RWUNLOCK((l), (t))
143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164

/*
 * Since node locking is sensitive to both performance and memory footprint,
 * we need some trick here.  If we have both high-performance rwlock and
 * high performance and small-memory reference counters, we use rwlock for
 * node lock and isc_refcount for node references.  In this case, we don't have
 * to protect the access to the counters by locks.
 * Otherwise, we simply use ordinary mutex lock for node locking, and use
 * simple integers as reference counters which is protected by the lock.
 * In most cases, we can simply use wrapper macros such as NODE_LOCK and
 * NODE_UNLOCK.  In some other cases, however, we need to protect reference
 * counters first and then protect other parts of a node as read-only data.
 * Special additional macros, NODE_STRONGLOCK(), NODE_WEAKLOCK(), etc, are also
 * provided for these special cases.  When we can use the efficient backend
 * routines, we should only protect the "other members" by NODE_WEAKLOCK(read).
 * Otherwise, we should use NODE_STRONGLOCK() to protect the entire critical
 * section including the access to the reference counter.
 * Note that we cannot use NODE_LOCK()/NODE_UNLOCK() wherever the protected
 * section is also protected by NODE_STRONGLOCK().
 */
typedef isc_rwlock_t nodelock_t;

Evan Hunt's avatar
Evan Hunt committed
165
#define NODE_INITLOCK(l)    isc_rwlock_init((l), 0, 0)
166
#define NODE_DESTROYLOCK(l) isc_rwlock_destroy(l)
Evan Hunt's avatar
Evan Hunt committed
167 168 169 170
#define NODE_LOCK(l, t)	    RWLOCK((l), (t))
#define NODE_UNLOCK(l, t)   RWUNLOCK((l), (t))
#define NODE_TRYUPGRADE(l)  isc_rwlock_tryupgrade(l)
#define NODE_DOWNGRADE(l)   isc_rwlock_downgrade(l)
171

172 173
/*%
 * Whether to rate-limit updating the LRU to avoid possible thread contention.
174 175
 * Updating LRU requires write locking, so we don't do it every time the
 * record is touched - only after some time passes.
176 177
 */
#ifndef DNS_RBTDB_LIMITLRUUPDATE
178 179 180 181 182 183 184
#define DNS_RBTDB_LIMITLRUUPDATE 1
#endif

/*% Time after which we update LRU for glue records, 5 minutes */
#define DNS_RBTDB_LRUUPDATE_GLUE 300
/*% Time after which we update LRU for all other records, 10 minutes */
#define DNS_RBTDB_LRUUPDATE_REGULAR 600
185

186
/*
187
 * Allow clients with a virtual time of up to 5 minutes in the past to see
188 189 190 191
 * records that would have otherwise have expired.
 */
#define RBTDB_VIRTUAL 300

192
struct noqname {
Evan Hunt's avatar
Evan Hunt committed
193 194 195
	dns_name_t name;
	void *neg;
	void *negsig;
196
	dns_rdatatype_t type;
197 198
};

Bob Halley's avatar
Bob Halley committed
199
typedef struct rdatasetheader {
Automatic Updater's avatar
Automatic Updater committed
200 201 202
	/*%
	 * Locked by the owning node's lock.
	 */
Evan Hunt's avatar
Evan Hunt committed
203 204
	rbtdb_serial_t serial;
	dns_ttl_t rdh_ttl;
205
	rbtdb_rdatatype_t type;
206
	atomic_uint_least16_t attributes;
Evan Hunt's avatar
Evan Hunt committed
207 208 209 210 211 212 213
	dns_trust_t trust;
	struct noqname *noqname;
	struct noqname *closest;
	unsigned int is_mmapped : 1;
	unsigned int next_is_relative : 1;
	unsigned int node_is_relative : 1;
	unsigned int resign_lsb : 1;
Automatic Updater's avatar
Automatic Updater committed
214 215 216 217 218
	/*%<
	 * We don't use the LIST macros, because the LIST structure has
	 * both head and tail pointers, and is doubly linked.
	 */

219
	struct rdatasetheader *next;
Automatic Updater's avatar
Automatic Updater committed
220 221 222 223 224 225 226
	/*%<
	 * If this is the top header for an rdataset, 'next' points
	 * to the top header for the next rdataset (i.e., the next type).
	 * Otherwise, it points up to the header whose down pointer points
	 * at this header.
	 */

227
	struct rdatasetheader *down;
Automatic Updater's avatar
Automatic Updater committed
228 229 230 231 232
	/*%<
	 * Points to the header for the next older version of
	 * this rdataset.
	 */

233
	atomic_uint_fast32_t count;
Automatic Updater's avatar
Automatic Updater committed
234 235 236
	/*%<
	 * Monotonously increased every time this rdataset is bound so that
	 * it is used as the base of the starting point in DNS responses
Mark Andrews's avatar
Mark Andrews committed
237
	 * when the "cyclic" rrset-order is required.
Automatic Updater's avatar
Automatic Updater committed
238 239
	 */

240
	dns_rbtnode_t *node;
Evan Hunt's avatar
Evan Hunt committed
241
	isc_stdtime_t last_used;
242
	ISC_LINK(struct rdatasetheader) link;
Automatic Updater's avatar
Automatic Updater committed
243

244
	unsigned int heap_index;
Automatic Updater's avatar
Automatic Updater committed
245 246 247
	/*%<
	 * Used for TTL-based cache cleaning.
	 */
248
	isc_stdtime_t resign;
249 250 251 252 253
	/*%<
	 * Case vector.  If the bit is set then the corresponding
	 * character in the owner name needs to be AND'd with 0x20,
	 * rendering that character upper case.
	 */
254
	unsigned char upper[32];
Bob Halley's avatar
Bob Halley committed
255 256
} rdatasetheader_t;

257 258
typedef ISC_LIST(rdatasetheader_t) rdatasetheaderlist_t;
typedef ISC_LIST(dns_rbtnode_t) rbtnodelist_t;
259

260
#define RDATASET_ATTR_NONEXISTENT 0x0001
261
/*%< May be potentially served as stale data. */
Evan Hunt's avatar
Evan Hunt committed
262 263 264 265 266 267 268 269 270 271 272
#define RDATASET_ATTR_STALE	     0x0002
#define RDATASET_ATTR_IGNORE	     0x0004
#define RDATASET_ATTR_RETAIN	     0x0008
#define RDATASET_ATTR_NXDOMAIN	     0x0010
#define RDATASET_ATTR_RESIGN	     0x0020
#define RDATASET_ATTR_STATCOUNT	     0x0040
#define RDATASET_ATTR_OPTOUT	     0x0080
#define RDATASET_ATTR_NEGATIVE	     0x0100
#define RDATASET_ATTR_PREFETCH	     0x0200
#define RDATASET_ATTR_CASESET	     0x0400
#define RDATASET_ATTR_ZEROTTL	     0x0800
273
#define RDATASET_ATTR_CASEFULLYLOWER 0x1000
274
/*%< Ancient - awaiting cleanup. */
275
#define RDATASET_ATTR_ANCIENT 0x2000
276

Michael Graff's avatar
Michael Graff committed
277 278 279 280 281 282 283
/*
 * XXX
 * When the cache will pre-expire data (due to memory low or other
 * situations) before the rdataset's TTL has expired, it MUST
 * respect the RETAIN bit and not expire the data until its TTL is
 * expired.
 */
284

285
#undef IGNORE /* WIN32 winbase.h defines this. */
286

287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338
#define EXISTS(header)                                 \
	((atomic_load_acquire(&(header)->attributes) & \
	  RDATASET_ATTR_NONEXISTENT) == 0)
#define NONEXISTENT(header)                            \
	((atomic_load_acquire(&(header)->attributes) & \
	  RDATASET_ATTR_NONEXISTENT) != 0)
#define IGNORE(header)                                 \
	((atomic_load_acquire(&(header)->attributes) & \
	  RDATASET_ATTR_IGNORE) != 0)
#define RETAIN(header)                                 \
	((atomic_load_acquire(&(header)->attributes) & \
	  RDATASET_ATTR_RETAIN) != 0)
#define NXDOMAIN(header)                               \
	((atomic_load_acquire(&(header)->attributes) & \
	  RDATASET_ATTR_NXDOMAIN) != 0)
#define STALE(header)                                                          \
	((atomic_load_acquire(&(header)->attributes) & RDATASET_ATTR_STALE) != \
	 0)
#define RESIGN(header)                                 \
	((atomic_load_acquire(&(header)->attributes) & \
	  RDATASET_ATTR_RESIGN) != 0)
#define OPTOUT(header)                                 \
	((atomic_load_acquire(&(header)->attributes) & \
	  RDATASET_ATTR_OPTOUT) != 0)
#define NEGATIVE(header)                               \
	((atomic_load_acquire(&(header)->attributes) & \
	  RDATASET_ATTR_NEGATIVE) != 0)
#define PREFETCH(header)                               \
	((atomic_load_acquire(&(header)->attributes) & \
	  RDATASET_ATTR_PREFETCH) != 0)
#define CASESET(header)                                \
	((atomic_load_acquire(&(header)->attributes) & \
	  RDATASET_ATTR_CASESET) != 0)
#define ZEROTTL(header)                                \
	((atomic_load_acquire(&(header)->attributes) & \
	  RDATASET_ATTR_ZEROTTL) != 0)
#define CASEFULLYLOWER(header)                         \
	((atomic_load_acquire(&(header)->attributes) & \
	  RDATASET_ATTR_CASEFULLYLOWER) != 0)
#define ANCIENT(header)                                \
	((atomic_load_acquire(&(header)->attributes) & \
	  RDATASET_ATTR_ANCIENT) != 0)
#define STATCOUNT(header)                              \
	((atomic_load_acquire(&(header)->attributes) & \
	  RDATASET_ATTR_STATCOUNT) != 0)

#define RDATASET_ATTR_GET(header, attribute) \
	(atomic_load_acquire(&(header)->attributes) & attribute)
#define RDATASET_ATTR_SET(header, attribute) \
	atomic_fetch_or_release(&(header)->attributes, attribute)
#define RDATASET_ATTR_CLR(header, attribute) \
	atomic_fetch_and_release(&(header)->attributes, ~(attribute))
339

340
#define ACTIVE(header, now)             \
341 342 343
	(((header)->rdh_ttl > (now)) || \
	 ((header)->rdh_ttl == (now) && ZEROTTL(header)))

344
#define DEFAULT_NODE_LOCK_COUNT	   53 /*%< Should be prime. */
345
#define RBTDB_GLUE_TABLE_INIT_SIZE 2U
346 347 348 349 350 351 352 353 354 355 356 357 358

/*%
 * Number of buckets for cache DB entries (locks, LRU lists, TTL heaps).
 * There is a tradeoff issue about configuring this value: if this is too
 * small, it may cause heavier contention between threads; if this is too large,
 * LRU purge algorithm won't work well (entries tend to be purged prematurely).
 * The default value should work well for most environments, but this can
 * also be configurable at compilation time via the
 * DNS_RBTDB_CACHE_NODE_LOCK_COUNT variable.  This value must be larger than
 * 1 due to the assumption of overmem_purge().
 */
#ifdef DNS_RBTDB_CACHE_NODE_LOCK_COUNT
#if DNS_RBTDB_CACHE_NODE_LOCK_COUNT <= 1
359
#error "DNS_RBTDB_CACHE_NODE_LOCK_COUNT must be larger than 1"
360
#else /* if DNS_RBTDB_CACHE_NODE_LOCK_COUNT <= 1 */
361
#define DEFAULT_CACHE_NODE_LOCK_COUNT DNS_RBTDB_CACHE_NODE_LOCK_COUNT
362 363
#endif /* if DNS_RBTDB_CACHE_NODE_LOCK_COUNT <= 1 */
#else  /* ifdef DNS_RBTDB_CACHE_NODE_LOCK_COUNT */
364
#define DEFAULT_CACHE_NODE_LOCK_COUNT 97
365
#endif /* DNS_RBTDB_CACHE_NODE_LOCK_COUNT */
Bob Halley's avatar
Bob Halley committed
366 367

typedef struct {
368
	nodelock_t lock;
Automatic Updater's avatar
Automatic Updater committed
369
	/* Protected in the refcount routines. */
370
	isc_refcount_t references;
Automatic Updater's avatar
Automatic Updater committed
371
	/* Locked by lock. */
372
	bool exiting;
373 374 375
} rbtdb_nodelock_t;

typedef struct rbtdb_changed {
376
	dns_rbtnode_t *node;
Evan Hunt's avatar
Evan Hunt committed
377
	bool dirty;
378
	ISC_LINK(struct rbtdb_changed) link;
379 380
} rbtdb_changed_t;

381
typedef ISC_LIST(rbtdb_changed_t) rbtdb_changedlist_t;
382

383
typedef enum { dns_db_insecure, dns_db_partial, dns_db_secure } dns_db_secure_t;
384

385 386
typedef struct dns_rbtdb dns_rbtdb_t;

387
/* Reason for expiring a record from cache */
388
typedef enum { expire_lru, expire_ttl, expire_flush } expire_t;
389

390 391 392 393
typedef struct rbtdb_glue rbtdb_glue_t;

typedef struct rbtdb_glue_table_node {
	struct rbtdb_glue_table_node *next;
Evan Hunt's avatar
Evan Hunt committed
394 395
	dns_rbtnode_t *node;
	rbtdb_glue_t *glue_list;
396 397
} rbtdb_glue_table_node_t;

398 399 400 401 402 403
typedef enum {
	rdataset_ttl_fresh,
	rdataset_ttl_stale,
	rdataset_ttl_ancient
} rdataset_ttl_t;

404
typedef struct rbtdb_version {
Automatic Updater's avatar
Automatic Updater committed
405
	/* Not locked */
406
	rbtdb_serial_t serial;
Evan Hunt's avatar
Evan Hunt committed
407
	dns_rbtdb_t *rbtdb;
Automatic Updater's avatar
Automatic Updater committed
408 409 410 411 412
	/*
	 * Protected in the refcount routines.
	 * XXXJT: should we change the lock policy based on the refcount
	 * performance?
	 */
413
	isc_refcount_t references;
Automatic Updater's avatar
Automatic Updater committed
414
	/* Locked by database lock. */
Evan Hunt's avatar
Evan Hunt committed
415 416 417
	bool writer;
	bool commit_ok;
	rbtdb_changedlist_t changed_list;
418 419 420
	rdatasetheaderlist_t resigned_list;
	ISC_LINK(struct rbtdb_version) link;
	dns_db_secure_t secure;
Evan Hunt's avatar
Evan Hunt committed
421
	bool havensec3;
422
	/* NSEC3 parameters */
Evan Hunt's avatar
Evan Hunt committed
423 424 425 426
	dns_hash_t hash;
	uint8_t flags;
	uint16_t iterations;
	uint8_t salt_length;
427
	unsigned char salt[DNS_NSEC3_SALTSIZE];
428 429

	/*
430
	 * records and xfrsize are covered by rwlock.
431
	 */
432
	isc_rwlock_t rwlock;
Evan Hunt's avatar
Evan Hunt committed
433
	uint64_t records;
434
	uint64_t xfrsize;
435

Evan Hunt's avatar
Evan Hunt committed
436 437 438
	isc_rwlock_t glue_rwlock;
	size_t glue_table_size;
	size_t glue_table_nodecount;
439
	rbtdb_glue_table_node_t **glue_table;
440 441
} rbtdb_version_t;

442
typedef ISC_LIST(rbtdb_version_t) rbtdb_versionlist_t;
443

444
struct dns_rbtdb {
Automatic Updater's avatar
Automatic Updater committed
445
	/* Unlocked. */
446
	dns_db_t common;
447
	/* Locks the data in this struct */
448
	isc_rwlock_t lock;
449
	/* Locks the tree structure (prevents nodes appearing/disappearing) */
450
	isc_rwlock_t tree_lock;
451
	/* Locks for individual tree nodes */
Evan Hunt's avatar
Evan Hunt committed
452
	unsigned int node_lock_count;
453
	rbtdb_nodelock_t *node_locks;
Evan Hunt's avatar
Evan Hunt committed
454 455 456 457 458
	dns_rbtnode_t *origin_node;
	dns_rbtnode_t *nsec3_origin_node;
	dns_stats_t *rrsetstats;     /* cache DB only */
	isc_stats_t *cachestats;     /* cache DB only */
	isc_stats_t *gluecachestats; /* zone DB only */
Automatic Updater's avatar
Automatic Updater committed
459
	/* Locked by lock. */
Evan Hunt's avatar
Evan Hunt committed
460 461 462 463 464 465 466 467
	unsigned int active;
	isc_refcount_t references;
	unsigned int attributes;
	rbtdb_serial_t current_serial;
	rbtdb_serial_t least_serial;
	rbtdb_serial_t next_serial;
	rbtdb_version_t *current_version;
	rbtdb_version_t *future_version;
468
	rbtdb_versionlist_t open_versions;
Evan Hunt's avatar
Evan Hunt committed
469 470 471
	isc_task_t *task;
	dns_dbnode_t *soanode;
	dns_dbnode_t *nsnode;
Automatic Updater's avatar
Automatic Updater committed
472

473 474 475
	/*
	 * Maximum length of time to keep using a stale answer past its
	 * normal TTL expiry.
476 477
	 */
	dns_ttl_t serve_stale_ttl;
478

Automatic Updater's avatar
Automatic Updater committed
479 480 481 482 483
	/*
	 * This is a linked list used to implement the LRU cache.  There will
	 * be node_lock_count linked lists here.  Nodes in bucket 1 will be
	 * placed on the linked list rdatasets[1].
	 */
484
	rdatasetheaderlist_t *rdatasets;
485

Automatic Updater's avatar
Automatic Updater committed
486 487
	/*%
	 * Temporary storage for stale cache nodes and dynamically deleted
488
	 * nodes that await being cleaned up.
Automatic Updater's avatar
Automatic Updater committed
489
	 */
490
	rbtnodelist_t *deadnodes;
Automatic Updater's avatar
Automatic Updater committed
491 492

	/*
493 494 495 496
	 * Heaps.  These are used for TTL based expiry in a cache,
	 * or for zone resigning in a zone DB.  hmctx is the memory
	 * context to use for the heap (which differs from the main
	 * database memory context in the case of a cache).
Automatic Updater's avatar
Automatic Updater committed
497
	 */
Evan Hunt's avatar
Evan Hunt committed
498
	isc_mem_t *hmctx;
499
	isc_heap_t **heaps;
Tinderbox User's avatar
Tinderbox User committed
500

501 502 503
	/*
	 * Base values for the mmap() code.
	 */
Evan Hunt's avatar
Evan Hunt committed
504
	void *mmap_location;
505
	size_t mmap_size;
Automatic Updater's avatar
Automatic Updater committed
506 507

	/* Locked by tree_lock. */
508 509 510
	dns_rbt_t *tree;
	dns_rbt_t *nsec;
	dns_rbt_t *nsec3;
Automatic Updater's avatar
Automatic Updater committed
511 512

	/* Unlocked */
513
	unsigned int quantum;
514
};
Bob Halley's avatar
Bob Halley committed
515

Evan Hunt's avatar
Evan Hunt committed
516
#define RBTDB_ATTR_LOADED  0x01
517
#define RBTDB_ATTR_LOADING 0x02
518

519 520
#define KEEPSTALE(rbtdb) ((rbtdb)->serve_stale_ttl > 0)

521
/*%
522 523 524
 * Search Context
 */
typedef struct {
Evan Hunt's avatar
Evan Hunt committed
525 526 527 528
	dns_rbtdb_t *rbtdb;
	rbtdb_version_t *rbtversion;
	rbtdb_serial_t serial;
	unsigned int options;
529
	dns_rbtnodechain_t chain;
Evan Hunt's avatar
Evan Hunt committed
530 531 532 533 534 535 536 537
	bool copy_name;
	bool need_cleanup;
	bool wild;
	dns_rbtnode_t *zonecut;
	rdatasetheader_t *zonecut_rdataset;
	rdatasetheader_t *zonecut_sigrdataset;
	dns_fixedname_t zonecut_name;
	isc_stdtime_t now;
538 539
} rbtdb_search_t;

540
/*%
541 542 543
 * Load Context
 */
typedef struct {
Evan Hunt's avatar
Evan Hunt committed
544
	dns_rbtdb_t *rbtdb;
545
	isc_stdtime_t now;
546 547
} rbtdb_load_t;

Ondřej Surý's avatar
Ondřej Surý committed
548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599
static void
delete_callback(void *data, void *arg);
static void
rdataset_disassociate(dns_rdataset_t *rdataset);
static isc_result_t
rdataset_first(dns_rdataset_t *rdataset);
static isc_result_t
rdataset_next(dns_rdataset_t *rdataset);
static void
rdataset_current(dns_rdataset_t *rdataset, dns_rdata_t *rdata);
static void
rdataset_clone(dns_rdataset_t *source, dns_rdataset_t *target);
static unsigned int
rdataset_count(dns_rdataset_t *rdataset);
static isc_result_t
rdataset_getnoqname(dns_rdataset_t *rdataset, dns_name_t *name,
		    dns_rdataset_t *neg, dns_rdataset_t *negsig);
static isc_result_t
rdataset_getclosest(dns_rdataset_t *rdataset, dns_name_t *name,
		    dns_rdataset_t *neg, dns_rdataset_t *negsig);
static inline bool
need_headerupdate(rdatasetheader_t *header, isc_stdtime_t now);
static void
update_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, isc_stdtime_t now);
static void
expire_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, bool tree_locked,
	      expire_t reason);
static void
overmem_purge(dns_rbtdb_t *rbtdb, unsigned int locknum_start, isc_stdtime_t now,
	      bool tree_locked);
static isc_result_t
resign_insert(dns_rbtdb_t *rbtdb, int idx, rdatasetheader_t *newheader);
static void
resign_delete(dns_rbtdb_t *rbtdb, rbtdb_version_t *version,
	      rdatasetheader_t *header);
static void
prune_tree(isc_task_t *task, isc_event_t *event);
static void
rdataset_settrust(dns_rdataset_t *rdataset, dns_trust_t trust);
static void
rdataset_expire(dns_rdataset_t *rdataset);
static void
rdataset_clearprefetch(dns_rdataset_t *rdataset);
static void
rdataset_setownercase(dns_rdataset_t *rdataset, const dns_name_t *name);
static void
rdataset_getownercase(const dns_rdataset_t *rdataset, dns_name_t *name);
static isc_result_t
rdataset_addglue(dns_rdataset_t *rdataset, dns_dbversion_t *version,
		 dns_message_t *msg);
static void
free_gluetable(rbtdb_version_t *version);
600 601
static isc_result_t
nodefullname(dns_db_t *db, dns_dbnode_t *node, dns_name_t *name);
602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618

static dns_rdatasetmethods_t rdataset_methods = { rdataset_disassociate,
						  rdataset_first,
						  rdataset_next,
						  rdataset_current,
						  rdataset_clone,
						  rdataset_count,
						  NULL, /* addnoqname */
						  rdataset_getnoqname,
						  NULL, /* addclosest */
						  rdataset_getclosest,
						  rdataset_settrust,
						  rdataset_expire,
						  rdataset_clearprefetch,
						  rdataset_setownercase,
						  rdataset_getownercase,
						  rdataset_addglue };
619 620 621 622 623 624 625 626

static dns_rdatasetmethods_t slab_methods = {
	rdataset_disassociate,
	rdataset_first,
	rdataset_next,
	rdataset_current,
	rdataset_clone,
	rdataset_count,
627 628 629 630 631 632 633 634 635 636
	NULL, /* addnoqname */
	NULL, /* getnoqname */
	NULL, /* addclosest */
	NULL, /* getclosest */
	NULL, /* settrust */
	NULL, /* expire */
	NULL, /* clearprefetch */
	NULL, /* setownercase */
	NULL, /* getownercase */
	NULL  /* addglue */
Bob Halley's avatar
Bob Halley committed
637 638
};

Ondřej Surý's avatar
Ondřej Surý committed
639 640 641 642 643 644 645 646
static void
rdatasetiter_destroy(dns_rdatasetiter_t **iteratorp);
static isc_result_t
rdatasetiter_first(dns_rdatasetiter_t *iterator);
static isc_result_t
rdatasetiter_next(dns_rdatasetiter_t *iterator);
static void
rdatasetiter_current(dns_rdatasetiter_t *iterator, dns_rdataset_t *rdataset);
Bob Halley's avatar
Bob Halley committed
647 648

static dns_rdatasetitermethods_t rdatasetiter_methods = {
649
	rdatasetiter_destroy, rdatasetiter_first, rdatasetiter_next,
Automatic Updater's avatar
Automatic Updater committed
650
	rdatasetiter_current
651 652
};

Bob Halley's avatar
Bob Halley committed
653
typedef struct rbtdb_rdatasetiter {
654
	dns_rdatasetiter_t common;
Evan Hunt's avatar
Evan Hunt committed
655
	rdatasetheader_t *current;
Bob Halley's avatar
Bob Halley committed
656 657
} rbtdb_rdatasetiter_t;

658 659 660 661 662 663 664 665 666
/*
 * Note that these iterators, unless created with either DNS_DB_NSEC3ONLY or
 * DNS_DB_NONSEC3, will transparently move between the last node of the
 * "regular" RBT ("chain" field) and the root node of the NSEC3 RBT
 * ("nsec3chain" field) of the database in question, as if the latter was a
 * successor to the former in lexical order.  The "current" field always holds
 * the address of either "chain" or "nsec3chain", depending on which RBT is
 * being traversed at given time.
 */
Ondřej Surý's avatar
Ondřej Surý committed
667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685
static void
dbiterator_destroy(dns_dbiterator_t **iteratorp);
static isc_result_t
dbiterator_first(dns_dbiterator_t *iterator);
static isc_result_t
dbiterator_last(dns_dbiterator_t *iterator);
static isc_result_t
dbiterator_seek(dns_dbiterator_t *iterator, const dns_name_t *name);
static isc_result_t
dbiterator_prev(dns_dbiterator_t *iterator);
static isc_result_t
dbiterator_next(dns_dbiterator_t *iterator);
static isc_result_t
dbiterator_current(dns_dbiterator_t *iterator, dns_dbnode_t **nodep,
		   dns_name_t *name);
static isc_result_t
dbiterator_pause(dns_dbiterator_t *iterator);
static isc_result_t
dbiterator_origin(dns_dbiterator_t *iterator, dns_name_t *name);
686 687

static dns_dbiteratormethods_t dbiterator_methods = {
688 689 690
	dbiterator_destroy, dbiterator_first, dbiterator_last,
	dbiterator_seek,    dbiterator_prev,  dbiterator_next,
	dbiterator_current, dbiterator_pause, dbiterator_origin
691 692
};

693 694
#define DELETION_BATCH_MAX 64

695
/*
696
 * If 'paused' is true, then the tree lock is not being held.
697
 */
698
typedef struct rbtdb_dbiterator {
Evan Hunt's avatar
Evan Hunt committed
699 700 701 702 703 704 705 706 707
	dns_dbiterator_t common;
	bool paused;
	bool new_origin;
	isc_rwlocktype_t tree_locked;
	isc_result_t result;
	dns_fixedname_t name;
	dns_fixedname_t origin;
	dns_rbtnodechain_t chain;
	dns_rbtnodechain_t nsec3chain;
708
	dns_rbtnodechain_t *current;
Evan Hunt's avatar
Evan Hunt committed
709 710 711 712 713
	dns_rbtnode_t *node;
	dns_rbtnode_t *deletions[DELETION_BATCH_MAX];
	int delcnt;
	bool nsec3only;
	bool nonsec3;
714 715
} rbtdb_dbiterator_t;

Evan Hunt's avatar
Evan Hunt committed
716
#define IS_STUB(rbtdb)	(((rbtdb)->common.attributes & DNS_DBATTR_STUB) != 0)
717
#define IS_CACHE(rbtdb) (((rbtdb)->common.attributes & DNS_DBATTR_CACHE) != 0)
718

Ondřej Surý's avatar
Ondřej Surý committed
719 720 721 722 723 724 725 726 727 728 729
static void
free_rbtdb(dns_rbtdb_t *rbtdb, bool log, isc_event_t *event);
static void
overmem(dns_db_t *db, bool over);
static void
setnsec3parameters(dns_db_t *db, rbtdb_version_t *version);
static void
setownercase(rdatasetheader_t *header, const dns_name_t *name);

static bool
match_header_version(rbtdb_file_header_t *header);
730

731 732 733
/* Pad to 32 bytes */
static char FILE_VERSION[32] = "\0";

734 735 736
/*%
 * 'init_count' is used to initialize 'newheader->count' which inturn
 * is used to determine where in the cycle rrset-order cyclic starts.
Francis Dupont's avatar
Francis Dupont committed
737
 * We don't lock this as we don't care about simultaneous updates.
738 739
 *
 * Note:
740
 *      Both init_count and header->count can be UINT32_MAX.
741
 *      The count on the returned rdataset however can't be as
742 743
 *      that indicates that the database does not implement cyclic
 *      processing.
744
 */
745
static atomic_uint_fast32_t init_count;
746

747 748 749 750 751 752
/*
 * Locking
 *
 * If a routine is going to lock more than one lock in this module, then
 * the locking must be done in the following order:
 *
753
 *      Tree Lock
754
 *
755 756
 *      Node Lock       (Only one from the set may be locked at one time by
 *                       any caller)
757
 *
758
 *      Database Lock
759 760 761 762
 *
 * Failure to follow this hierarchy can result in deadlock.
 */

763 764 765
/*
 * Deleting Nodes
 *
766
 * For zone databases the node for the origin of the zone MUST NOT be deleted.
767 768
 */

Evan Hunt's avatar
Evan Hunt committed
769 770 771 772 773
/*
 * Debugging routines
 */
#ifdef DEBUG
static void
Evan Hunt's avatar
Evan Hunt committed
774 775
hexdump(const char *desc, unsigned char *data, size_t size) {
	char hexdump[BUFSIZ * 2 + 1];
Evan Hunt's avatar
Evan Hunt committed
776 777
	isc_buffer_t b;
	isc_region_t r;
778
	isc_result_t result;
Evan Hunt's avatar
Evan Hunt committed
779
	size_t bytes;
780 781 782 783 784 785 786 787 788 789 790 791 792 793

	fprintf(stderr, "%s: ", desc);
	do {
		isc_buffer_init(&b, hexdump, sizeof(hexdump));
		r.base = data;
		r.length = bytes = (size > BUFSIZ) ? BUFSIZ : size;
		result = isc_hex_totext(&r, 0, "", &b);
		RUNTIME_CHECK(result == ISC_R_SUCCESS);
		isc_buffer_putuint8(&b, 0);
		fprintf(stderr, "%s", hexdump);
		data += bytes;
		size -= bytes;
	} while (size > 0);
	fprintf(stderr, "\n");
Evan Hunt's avatar
Evan Hunt committed
794
}
795
#endif /* ifdef DEBUG */
Evan Hunt's avatar
Evan Hunt committed
796

797 798 799 800 801 802 803 804 805 806 807
/* Fixed RRSet helper macros */

#define DNS_RDATASET_LENGTH 2;

#if DNS_RDATASET_FIXED
#define DNS_RDATASET_ORDER 2
#define DNS_RDATASET_COUNT (count * 4)
#else /* !DNS_RDATASET_FIXED */
#define DNS_RDATASET_ORDER 0
#define DNS_RDATASET_COUNT 0
#endif /* DNS_RDATASET_FIXED */
808

809 810 811 812
/*
 * DB Routines
 */

Bob Halley's avatar
Bob Halley committed
813
static void
Evan Hunt's avatar
Evan Hunt committed
814
attach(dns_db_t *source, dns_db_t **targetp) {
Automatic Updater's avatar
Automatic Updater committed
815
	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)source;
Bob Halley's avatar
Bob Halley committed
816

Automatic Updater's avatar
Automatic Updater committed
817
	REQUIRE(VALID_RBTDB(rbtdb));
Bob Halley's avatar
Bob Halley committed
818

819
	isc_refcount_increment(&rbtdb->references);
Bob Halley's avatar
Bob Halley committed
820

Automatic Updater's avatar
Automatic Updater committed
821
	*targetp = source;
Bob Halley's avatar
Bob Halley committed
822 823 824
}

static void
Evan Hunt's avatar
Evan Hunt committed
825
free_rbtdb_callback(isc_task_t *task, isc_event_t *event) {
Automatic Updater's avatar
Automatic Updater committed
826
	dns_rbtdb_t *rbtdb = event->ev_arg;
827

Automatic Updater's avatar
Automatic Updater committed
828
	UNUSED(task);
829

830
	free_rbtdb(rbtdb, true, event);
831 832
}

833
static void
Evan Hunt's avatar
Evan Hunt committed
834
update_cachestats(dns_rbtdb_t *rbtdb, isc_result_t result) {
835
	INSIST(IS_CACHE(rbtdb));
836

837
	if (rbtdb->cachestats == NULL) {
838
		return;
839
	}
840 841 842 843 844 845 846 847 848 849 850 851

	switch (result) {
	case ISC_R_SUCCESS:
	case DNS_R_CNAME:
	case DNS_R_DNAME:
	case DNS_R_DELEGATION:
	case DNS_R_NCACHENXDOMAIN:
	case DNS_R_NCACHENXRRSET:
		isc_stats_increment(rbtdb->cachestats,
				    dns_cachestatscounter_hits);
		break;
	default:
Tinderbox User's avatar
Tinderbox User committed
852
		isc_stats_increment(rbtdb->cachestats,
853 854 855 856
				    dns_cachestatscounter_misses);
	}
}

857
static bool
Evan Hunt's avatar
Evan Hunt committed
858
do_stats(rdatasetheader_t *header) {
859
	return (EXISTS(header) && STATCOUNT(header));
860 861
}

862
static void
863 864
update_rrsetstats(dns_rbtdb_t *rbtdb, const rbtdb_rdatatype_t htype,
		  const uint_least16_t hattributes, const bool increment) {
865 866 867
	dns_rdatastatstype_t statattributes = 0;
	dns_rdatastatstype_t base = 0;
	dns_rdatastatstype_t type;
868 869 870
	rdatasetheader_t *header = &(rdatasetheader_t){
		.type = htype, .attributes = ATOMIC_VAR_INIT(hattributes)
	};
871

872 873 874 875
	if (!do_stats(header)) {
		return;
	}

876 877 878
	/* At the moment we count statistics only for cache DB */
	INSIST(IS_CACHE(rbtdb));

879
	if (NEGATIVE(header)) {
880
		if (NXDOMAIN(header)) {
881
			statattributes = DNS_RDATASTATSTYPE_ATTR_NXDOMAIN;
882
		} else {
883 884 885
			statattributes = DNS_RDATASTATSTYPE_ATTR_NXRRSET;
			base = RBTDB_RDATATYPE_EXT(header->type);
		}
886
	} else {
887
		base = RBTDB_RDATATYPE_BASE(header->type);
888
	}
889

890
	if (STALE(header)) {
891
		statattributes |= DNS_RDATASTATSTYPE_ATTR_STALE;
892 893 894 895
	}
	if (ANCIENT(header)) {
		statattributes |= DNS_RDATASTATSTYPE_ATTR_ANCIENT;
	}
896

897
	type = DNS_RDATASTATSTYPE_VALUE(base, statattributes);
898
	if (increment) {
899
		dns_rdatasetstats_increment(rbtdb->rrsetstats, type);
900
	} else {
901
		dns_rdatasetstats_decrement(rbtdb->rrsetstats, type);
902
	}
903 904
}

905
static void
Evan Hunt's avatar
Evan Hunt committed
906 907
set_ttl(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, dns_ttl_t newttl) {
	int idx;
Automatic Updater's avatar
Automatic Updater committed
908
	isc_heap_t *heap;
Evan Hunt's avatar
Evan Hunt committed
909
	dns_ttl_t oldttl;
Automatic Updater's avatar
Automatic Updater committed
910

911 912
	if (!IS_CACHE(rbtdb)) {
		header->rdh_ttl = newttl;
913
		return;
914 915 916 917
	}

	oldttl = header->rdh_ttl;
	header->rdh_ttl = newttl;
918

Automatic Updater's avatar
Automatic Updater committed
919 920 921 922 923
	/*
	 * It's possible the rbtdb is not a cache.  If this is the case,
	 * we will not have a heap, and we move on.  If we do, though,
	 * we might need to adjust things.
	 */
924
	if (header->heap_index == 0 || newttl == oldttl) {
Automatic Updater's avatar
Automatic Updater committed
925
		return;
926
	}
Automatic Updater's avatar
Automatic Updater committed
927
	idx = header->node->locknum;
928
	if (rbtdb->heaps == NULL || rbtdb->heaps[idx] == NULL) {
929
		return;
930
	}
Automatic Updater's avatar
Automatic Updater committed
931 932
	heap = rbtdb->heaps[idx];

933
	if (newttl < oldttl) {
Automatic Updater's avatar
Automatic Updater committed
934
		isc_heap_increased(heap, header->heap_index);
935
	} else {
Automatic Updater's avatar
Automatic Updater committed
936
		isc_heap_decreased(heap, header->heap_index);
937
	}
938 939 940
}

/*%
941
 * These functions allow the heap code to rank the priority of each
942
 * element.  It returns true if v1 happens "sooner" than v2.
943
 */
944
static bool
Evan Hunt's avatar
Evan Hunt committed
945
ttl_sooner(void *v1, void *v2) {
Automatic Updater's avatar
Automatic Updater committed
946 947
	rdatasetheader_t *h1 = v1;
	rdatasetheader_t *h2 = v2;
948

949
	return (h1->rdh_ttl < h2->rdh_ttl);
950 951
}

952 953 954 955
/*%
 * Return which RRset should be resigned sooner.  If the RRsets have the
 * same signing time, prefer the other RRset over the SOA RRset.
 */
956
static bool
Evan Hunt's avatar
Evan Hunt committed
957
resign_sooner(void *v1, void *v2) {
Automatic Updater's avatar
Automatic Updater committed
958 959
	rdatasetheader_t *h1 = v1;
	rdatasetheader_t *h2 = v2;
960

961
	return (h1->resign < h2->resign ||
962 963 964
		(h1->resign == h2->resign && h1->resign_lsb < h2->resign_lsb) ||
		(h1->resign == h2->resign &&