rbtdb.c 289 KB
Newer Older
Bob Halley's avatar
Bob Halley committed
1
/*
2
 * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
3
 *
4 5 6
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 8 9
 *
 * See the COPYRIGHT file distributed with this work for additional
 * information regarding copyright ownership.
Bob Halley's avatar
Bob Halley committed
10 11
 */

12
/*! \file */
David Lawrence's avatar
David Lawrence committed
13

14 15
#include <config.h>

Mark Andrews's avatar
Mark Andrews committed
16
/* #define inline */
17

18
#include <inttypes.h>
19
#include <stdbool.h>
Mark Andrews's avatar
Mark Andrews committed
20

21
#include <isc/crc64.h>
22
#include <isc/event.h>
23
#include <isc/heap.h>
24
#include <isc/file.h>
Evan Hunt's avatar
Evan Hunt committed
25
#include <isc/hex.h>
26
#include <isc/mem.h>
27
#include <isc/mutex.h>
28
#include <isc/once.h>
29
#include <isc/platform.h>
30
#include <isc/print.h>
31
#include <isc/random.h>
32
#include <isc/refcount.h>
Bob Halley's avatar
Bob Halley committed
33
#include <isc/rwlock.h>
34
#include <isc/serial.h>
35 36
#include <isc/socket.h>
#include <isc/stdio.h>
37
#include <isc/string.h>
38
#include <isc/task.h>
39
#include <isc/time.h>
Michael Graff's avatar
Michael Graff committed
40
#include <isc/util.h>
41
#include <isc/hash.h>
Bob Halley's avatar
Bob Halley committed
42

43
#include <dns/callbacks.h>
44 45
#include <dns/db.h>
#include <dns/dbiterator.h>
46
#include <dns/events.h>
47
#include <dns/fixedname.h>
48
#include <dns/lib.h>
49
#include <dns/log.h>
50
#include <dns/masterdump.h>
51
#include <dns/nsec.h>
52
#include <dns/nsec3.h>
53
#include <dns/rbt.h>
54
#include <dns/rdata.h>
Bob Halley's avatar
Bob Halley committed
55 56
#include <dns/rdataset.h>
#include <dns/rdatasetiter.h>
57
#include <dns/rdataslab.h>
58
#include <dns/rdatastruct.h>
59
#include <dns/result.h>
60
#include <dns/stats.h>
61
#include <dns/time.h>
62
#include <dns/version.h>
63 64
#include <dns/view.h>
#include <dns/zone.h>
65
#include <dns/zonekey.h>
Bob Halley's avatar
Bob Halley committed
66

Evan Hunt's avatar
Evan Hunt committed
67
#ifndef WIN32
68
#include <sys/mman.h>
Evan Hunt's avatar
Evan Hunt committed
69 70 71 72 73 74
#else
#define PROT_READ	0x01
#define PROT_WRITE	0x02
#define MAP_PRIVATE	0x0002
#define MAP_FAILED	((void *)-1)
#endif
75

76 77 78
#ifdef DNS_RBTDB_VERSION64
#include "rbtdb64.h"
#else
Bob Halley's avatar
Bob Halley committed
79
#include "rbtdb.h"
80
#endif
Bob Halley's avatar
Bob Halley committed
81

82
#ifdef DNS_RBTDB_VERSION64
83
#define RBTDB_MAGIC                     ISC_MAGIC('R', 'B', 'D', '8')
84
#else
85
#define RBTDB_MAGIC                     ISC_MAGIC('R', 'B', 'D', '4')
86 87
#endif

88 89 90 91 92 93
#define CHECK(op) \
	do { result = (op); \
		if (result != ISC_R_SUCCESS) goto failure; \
	} while (0)

/*
Evan Hunt's avatar
Evan Hunt committed
94
 * This is the map file header for RBTDB images.  It is populated, and then
95 96 97 98 99 100 101
 * written, as the LAST thing done to the file.  Writing this last (with
 * zeros in the header area initially) will ensure that the header is only
 * valid when the RBTDB image is also valid.
 */
typedef struct rbtdb_file_header rbtdb_file_header_t;

/* Header length, always the same size regardless of structure size */
102
#define RBTDB_HEADER_LENGTH	1024
103 104 105

struct rbtdb_file_header {
	char version1[32];
106
	uint32_t ptrsize;
107
	unsigned int bigendian:1;
108 109 110
	uint64_t tree;
	uint64_t nsec;
	uint64_t nsec3;
111 112 113 114 115

	char version2[32];  		/* repeated; must match version1 */
};


116
/*%
117 118 119
 * Note that "impmagic" is not the first four bytes of the struct, so
 * ISC_MAGIC_VALID cannot be used.
 */
120
#define VALID_RBTDB(rbtdb)      ((rbtdb) != NULL && \
Automatic Updater's avatar
Automatic Updater committed
121
				 (rbtdb)->common.impmagic == RBTDB_MAGIC)
Bob Halley's avatar
Bob Halley committed
122

123
#ifdef DNS_RBTDB_VERSION64
124
typedef uint64_t                    rbtdb_serial_t;
125
/*%
126 127 128 129 130 131
 * Make casting easier in symbolic debuggers by using different names
 * for the 64 bit version.
 */
#define dns_rbtdb_t dns_rbtdb64_t
#define rdatasetheader_t rdatasetheader64_t
#define rbtdb_version_t rbtdb_version64_t
132 133

#define once once64
134
#define FILE_VERSION FILE_VERSION64
Mark Andrews's avatar
Mark Andrews committed
135
#define init_count init_count64
136 137 138 139 140

#define cache_methods cache_methods64
#define dbiterator_methods dbiterator_methods64
#define rdataset_methods rdataset_methods64
#define rdatasetiter_methods rdatasetiter_methods64
141
#define slab_methods slab_methods64
142 143 144
#define zone_methods zone_methods64

#define activeempty activeempty64
145
#define activeemtpynode activeemtpynode64
146
#define add32 add64
147 148 149
#define add_changed add_changed64
#define add_empty_wildcards add_empty_wildcards64
#define add_wildcard_magic add_wildcard_magic64
150 151
#define addclosest addclosest64
#define addnoqname addnoqname64
152
#define addrdataset addrdataset64
153
#define adjust_quantum adjust_quantum64
154
#define allocate_version allocate_tversion64
155 156 157 158 159 160 161 162 163 164
#define allrdatasets allrdatasets64
#define attach attach64
#define attachnode attachnode64
#define attachversion attachversion64
#define beginload beginload64
#define bind_rdataset bind_rdataset64
#define cache_find cache_find64
#define cache_findrdataset cache_findrdataset64
#define cache_findzonecut cache_findzonecut64
#define cache_zonecut_callback cache_zonecut_callback64
165
#define check_stale_header check_stale_header64
166 167 168
#define clean_cache_node clean_cache_node64
#define clean_stale_headers clean_stale_headers64
#define clean_zone_node clean_zone_node64
169 170
#define cleanup_dead_nodes cleanup_dead_nodes64
#define cleanup_dead_nodes_callback cleanup_dead_nodes_callback64
171
#define cleanup_nondirty cleanup_nondirty64
172
#define closeversion closeversion64
173
#define cname_and_other_data cname_and_other_data64
174 175 176 177 178 179 180 181 182 183 184 185
#define createiterator createiterator64
#define currentversion currentversion64
#define dbiterator_current dbiterator_current64
#define dbiterator_destroy dbiterator_destroy64
#define dbiterator_first dbiterator_first64
#define dbiterator_last dbiterator_last64
#define dbiterator_next dbiterator_next64
#define dbiterator_origin dbiterator_origin64
#define dbiterator_pause dbiterator_pause64
#define dbiterator_prev dbiterator_prev64
#define dbiterator_seek dbiterator_seek64
#define decrement_reference decrement_reference64
186
#define delegating_type delegating_type64
187 188 189
#define delete_callback delete_callback64
#define delete_node delete_node64
#define deleterdataset deleterdataset64
190
#define dereference_iter_node dereference_iter_node64
191
#define deserialize32 deserialize64
192 193 194 195 196 197 198
#define detach detach64
#define detachnode detachnode64
#define dump dump64
#define endload endload64
#define expire_header expire_header64
#define expirenode expirenode64
#define find_closest_nsec find_closest_nsec64
199
#define find_coveringnsec find_coveringnsec64
200
#define find_deepest_zonecut find_deepest_zonecut64
201
#define find_wildcard find_wildcard64
202 203 204 205
#define findnode findnode64
#define findnodeintree findnodeintree64
#define findnsec3node findnsec3node64
#define flush_deletions flush_deletions64
206 207
#define free_gluelist free_gluelist64
#define free_gluetable free_gluetable64
208 209 210 211 212 213 214
#define free_noqname free_noqname64
#define free_rbtdb free_rbtdb64
#define free_rbtdb_callback free_rbtdb_callback64
#define free_rdataset free_rdataset64
#define getnsec3parameters getnsec3parameters64
#define getoriginnode getoriginnode64
#define getrrsetstats getrrsetstats64
Mark Andrews's avatar
Mark Andrews committed
215
#define getservestalettl getservestalettl64
216
#define getsigningtime getsigningtime64
217
#define getsize getsize64
218
#define glue_nsdname_cb glue_nsdname_cb64
219 220
#define hashsize hashsize64
#define init_file_version init_file_version64
221
#define init_rdataset init_rdataset64
222 223 224 225 226 227
#define isdnssec isdnssec64
#define ispersistent ispersistent64
#define issecure issecure64
#define iszonesecure iszonesecure64
#define loading_addrdataset loading_addrdataset64
#define loadnode loadnode64
228
#define make_least_version make_least_version64
Mark Andrews's avatar
Mark Andrews committed
229
#define mark_header_ancient mark_header_ancient64
230
#define mark_stale_header mark_stale_header64
231
#define match_header_version match_header_version64
232 233
#define matchparams matchparams64
#define maybe_free_rbtdb maybe_free_rbtdb64
234 235
#define need_headerupdate need_headerupdate64
#define new_rdataset new_rdataset64
236
#define new_reference new_reference64
237 238
#define newversion newversion64
#define nodecount nodecount64
239
#define nodefullname nodefullname64
240
#define overmem overmem64
241
#define overmem_purge overmem_purge64
242 243 244 245 246
#define previous_closest_nsec previous_closest_nsec64
#define printnode printnode64
#define prune_tree prune_tree64
#define rbt_datafixer rbt_datafixer64
#define rbt_datawriter rbt_datawriter64
247
#define rbtdb_write_header rbtdb_write_header64
248
#define rbtdb_zero_header rbtdb_zero_header64
249
#define rdataset_addglue rdataset_addglue64
250
#define rdataset_clearprefetch rdataset_clearprefetch64
251 252 253 254 255 256 257 258
#define rdataset_clone rdataset_clone64
#define rdataset_count rdataset_count64
#define rdataset_current rdataset_current64
#define rdataset_disassociate rdataset_disassociate64
#define rdataset_expire rdataset_expire64
#define rdataset_first rdataset_first64
#define rdataset_getclosest rdataset_getclosest64
#define rdataset_getnoqname rdataset_getnoqname64
259
#define rdataset_getownercase rdataset_getownercase64
260
#define rdataset_next rdataset_next64
261
#define rdataset_setownercase rdataset_setownercase64
262 263 264 265 266 267
#define rdataset_settrust rdataset_settrust64
#define rdatasetiter_current rdatasetiter_current64
#define rdatasetiter_destroy rdatasetiter_destroy64
#define rdatasetiter_first rdatasetiter_first64
#define rdatasetiter_next rdatasetiter_next64
#define reactivate_node reactivate_node64
268 269
#define reference_iter_node reference_iter_node64
#define rehash_gluetable rehash_gluetable64
270 271 272 273
#define resign_delete resign_delete64
#define resign_insert resign_insert64
#define resign_sooner resign_sooner64
#define resigned resigned64
274 275
#define resume_iteration resume_iteration64
#define rollback_node rollback_node64
276 277
#define serialize serialize64
#define set_index set_index64
278
#define set_ttl set_ttl64
279
#define setcachestats setcachestats64
280
#define setgluecachestats setgluecachestats64
281
#define setnsec3parameters setnsec3parameters64
282
#define setownercase setownercase64
Mark Andrews's avatar
Mark Andrews committed
283
#define setservestalettl setservestalettl64
284 285 286 287 288
#define setsigningtime setsigningtime64
#define settask settask64
#define setup_delegation setup_delegation64
#define subtractrdataset subtractrdataset64
#define ttl_sooner ttl_sooner64
289
#define update_cachestats update_cachestats64
290
#define update_header update_header64
291
#define update_newheader update_newheader64
292
#define update_recordsandbytes  update_recordsandbytes64
293
#define update_rrsetstats update_rrsetstats64
294
#define valid_glue valid_glue64
295 296 297 298 299
#define zone_find zone_find64
#define zone_findrdataset zone_findrdataset64
#define zone_findzonecut zone_findzonecut64
#define zone_zonecut_callback zone_zonecut_callback64

300
#else
301
typedef uint32_t                    rbtdb_serial_t;
302 303
#endif

304
typedef uint32_t                    rbtdb_rdatatype_t;
Bob Halley's avatar
Bob Halley committed
305

306 307
#define RBTDB_RDATATYPE_BASE(type)      ((dns_rdatatype_t)((type) & 0xFFFF))
#define RBTDB_RDATATYPE_EXT(type)       ((dns_rdatatype_t)((type) >> 16))
308
#define RBTDB_RDATATYPE_VALUE(base, ext) ((rbtdb_rdatatype_t)(((uint32_t)ext) << 16) | (((uint32_t)base) & 0xffff))
Bob Halley's avatar
Bob Halley committed
309

310
#define RBTDB_RDATATYPE_SIGNSEC \
Automatic Updater's avatar
Automatic Updater committed
311
		RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_nsec)
312 313
#define RBTDB_RDATATYPE_SIGNSEC3 \
		RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_nsec3)
Bob Halley's avatar
Bob Halley committed
314
#define RBTDB_RDATATYPE_SIGNS \
Automatic Updater's avatar
Automatic Updater committed
315
		RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_ns)
Bob Halley's avatar
Bob Halley committed
316
#define RBTDB_RDATATYPE_SIGCNAME \
Automatic Updater's avatar
Automatic Updater committed
317
		RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_cname)
318
#define RBTDB_RDATATYPE_SIGDNAME \
Automatic Updater's avatar
Automatic Updater committed
319
		RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_dname)
320 321
#define RBTDB_RDATATYPE_SIGDDS \
		RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_ds)
322
#define RBTDB_RDATATYPE_NCACHEANY \
Automatic Updater's avatar
Automatic Updater committed
323
		RBTDB_RDATATYPE_VALUE(0, dns_rdatatype_any)
Bob Halley's avatar
Bob Halley committed
324

325
/*
326
 * We use rwlock for DB lock only when ISC_RWLOCK_USEATOMIC is non 0.
327
 * Using rwlock is effective with regard to lookup performance only when
328
 * it is implemented in an efficient way.
329 330 331 332
 * Otherwise, it is generally wise to stick to the simple locking since rwlock
 * would require more memory or can even make lookups slower due to its own
 * overhead (when it internally calls mutex locks).
 */
333
#ifdef ISC_RWLOCK_USEATOMIC
334 335 336 337 338 339
#define DNS_RBTDB_USERWLOCK 1
#else
#define DNS_RBTDB_USERWLOCK 0
#endif

#if DNS_RBTDB_USERWLOCK
340 341 342 343
#define RBTDB_INITLOCK(l)       isc_rwlock_init((l), 0, 0)
#define RBTDB_DESTROYLOCK(l)    isc_rwlock_destroy(l)
#define RBTDB_LOCK(l, t)        RWLOCK((l), (t))
#define RBTDB_UNLOCK(l, t)      RWUNLOCK((l), (t))
344
#else
345 346 347 348
#define RBTDB_INITLOCK(l)       isc_mutex_init(l)
#define RBTDB_DESTROYLOCK(l)    DESTROYLOCK(l)
#define RBTDB_LOCK(l, t)        LOCK(l)
#define RBTDB_UNLOCK(l, t)      UNLOCK(l)
349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372
#endif

/*
 * Since node locking is sensitive to both performance and memory footprint,
 * we need some trick here.  If we have both high-performance rwlock and
 * high performance and small-memory reference counters, we use rwlock for
 * node lock and isc_refcount for node references.  In this case, we don't have
 * to protect the access to the counters by locks.
 * Otherwise, we simply use ordinary mutex lock for node locking, and use
 * simple integers as reference counters which is protected by the lock.
 * In most cases, we can simply use wrapper macros such as NODE_LOCK and
 * NODE_UNLOCK.  In some other cases, however, we need to protect reference
 * counters first and then protect other parts of a node as read-only data.
 * Special additional macros, NODE_STRONGLOCK(), NODE_WEAKLOCK(), etc, are also
 * provided for these special cases.  When we can use the efficient backend
 * routines, we should only protect the "other members" by NODE_WEAKLOCK(read).
 * Otherwise, we should use NODE_STRONGLOCK() to protect the entire critical
 * section including the access to the reference counter.
 * Note that we cannot use NODE_LOCK()/NODE_UNLOCK() wherever the protected
 * section is also protected by NODE_STRONGLOCK().
 */
#if defined(ISC_RWLOCK_USEATOMIC) && defined(DNS_RBT_USEISCREFCOUNT)
typedef isc_rwlock_t nodelock_t;

373 374 375 376 377 378 379 380 381 382 383
#define NODE_INITLOCK(l)        isc_rwlock_init((l), 0, 0)
#define NODE_DESTROYLOCK(l)     isc_rwlock_destroy(l)
#define NODE_LOCK(l, t)         RWLOCK((l), (t))
#define NODE_UNLOCK(l, t)       RWUNLOCK((l), (t))
#define NODE_TRYUPGRADE(l)      isc_rwlock_tryupgrade(l)

#define NODE_STRONGLOCK(l)      ((void)0)
#define NODE_STRONGUNLOCK(l)    ((void)0)
#define NODE_WEAKLOCK(l, t)     NODE_LOCK(l, t)
#define NODE_WEAKUNLOCK(l, t)   NODE_UNLOCK(l, t)
#define NODE_WEAKDOWNGRADE(l)   isc_rwlock_downgrade(l)
384 385 386
#else
typedef isc_mutex_t nodelock_t;

387 388 389 390 391 392 393 394 395 396 397
#define NODE_INITLOCK(l)        isc_mutex_init(l)
#define NODE_DESTROYLOCK(l)     DESTROYLOCK(l)
#define NODE_LOCK(l, t)         LOCK(l)
#define NODE_UNLOCK(l, t)       UNLOCK(l)
#define NODE_TRYUPGRADE(l)      ISC_R_SUCCESS

#define NODE_STRONGLOCK(l)      LOCK(l)
#define NODE_STRONGUNLOCK(l)    UNLOCK(l)
#define NODE_WEAKLOCK(l, t)     ((void)0)
#define NODE_WEAKUNLOCK(l, t)   ((void)0)
#define NODE_WEAKDOWNGRADE(l)   ((void)0)
398 399
#endif

400 401 402 403 404 405 406 407 408
/*%
 * Whether to rate-limit updating the LRU to avoid possible thread contention.
 * Our performance measurement has shown the cost is marginal, so it's defined
 * to be 0 by default either with or without threads.
 */
#ifndef DNS_RBTDB_LIMITLRUUPDATE
#define DNS_RBTDB_LIMITLRUUPDATE 0
#endif

409
/*
410
 * Allow clients with a virtual time of up to 5 minutes in the past to see
411 412 413 414
 * records that would have otherwise have expired.
 */
#define RBTDB_VIRTUAL 300

415
struct noqname {
416 417 418 419
	dns_name_t 	name;
	void *     	neg;
	void *     	negsig;
	dns_rdatatype_t	type;
420 421
};

Bob Halley's avatar
Bob Halley committed
422
typedef struct rdatasetheader {
Automatic Updater's avatar
Automatic Updater committed
423 424 425 426 427 428
	/*%
	 * Locked by the owning node's lock.
	 */
	rbtdb_serial_t                  serial;
	dns_ttl_t                       rdh_ttl;
	rbtdb_rdatatype_t               type;
429
	uint16_t                    attributes;
Automatic Updater's avatar
Automatic Updater committed
430 431
	dns_trust_t                     trust;
	struct noqname                  *noqname;
432
	struct noqname                  *closest;
433 434 435
	unsigned int 			is_mmapped : 1;
	unsigned int 			next_is_relative : 1;
	unsigned int 			node_is_relative : 1;
436
	unsigned int 			resign_lsb : 1;
Automatic Updater's avatar
Automatic Updater committed
437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455
	/*%<
	 * We don't use the LIST macros, because the LIST structure has
	 * both head and tail pointers, and is doubly linked.
	 */

	struct rdatasetheader           *next;
	/*%<
	 * If this is the top header for an rdataset, 'next' points
	 * to the top header for the next rdataset (i.e., the next type).
	 * Otherwise, it points up to the header whose down pointer points
	 * at this header.
	 */

	struct rdatasetheader           *down;
	/*%<
	 * Points to the header for the next older version of
	 * this rdataset.
	 */

456
	uint32_t                    count;
Automatic Updater's avatar
Automatic Updater committed
457 458 459 460 461 462 463 464 465 466
	/*%<
	 * Monotonously increased every time this rdataset is bound so that
	 * it is used as the base of the starting point in DNS responses
	 * when the "cyclic" rrset-order is required.  Since the ordering
	 * should not be so crucial, no lock is set for the counter for
	 * performance reasons.
	 */

	dns_rbtnode_t                   *node;
	isc_stdtime_t                   last_used;
467
	ISC_LINK(struct rdatasetheader) link;
Automatic Updater's avatar
Automatic Updater committed
468 469 470 471 472

	unsigned int                    heap_index;
	/*%<
	 * Used for TTL-based cache cleaning.
	 */
Automatic Updater's avatar
Automatic Updater committed
473
	isc_stdtime_t                   resign;
474 475 476 477 478 479
	/*%<
	 * Case vector.  If the bit is set then the corresponding
	 * character in the owner name needs to be AND'd with 0x20,
	 * rendering that character upper case.
	 */
	unsigned char			upper[32];
Bob Halley's avatar
Bob Halley committed
480 481
} rdatasetheader_t;

482 483 484 485
typedef ISC_LIST(rdatasetheader_t)      rdatasetheaderlist_t;
typedef ISC_LIST(dns_rbtnode_t)         rbtnodelist_t;

#define RDATASET_ATTR_NONEXISTENT       0x0001
486
/*%< May be potentially served as stale data. */
487 488 489 490
#define RDATASET_ATTR_STALE             0x0002
#define RDATASET_ATTR_IGNORE            0x0004
#define RDATASET_ATTR_RETAIN            0x0008
#define RDATASET_ATTR_NXDOMAIN          0x0010
491
#define RDATASET_ATTR_RESIGN            0x0020
492
#define RDATASET_ATTR_STATCOUNT         0x0040
493
#define RDATASET_ATTR_OPTOUT            0x0080
494
#define RDATASET_ATTR_NEGATIVE          0x0100
495
#define RDATASET_ATTR_PREFETCH          0x0200
496
#define RDATASET_ATTR_CASESET           0x0400
497
#define RDATASET_ATTR_ZEROTTL           0x0800
498
#define RDATASET_ATTR_CASEFULLYLOWER    0x1000
499 500
/*%< Ancient - awaiting cleanup. */
#define RDATASET_ATTR_ANCIENT           0x2000
501

Michael Graff's avatar
Michael Graff committed
502 503 504 505 506 507 508
/*
 * XXX
 * When the cache will pre-expire data (due to memory low or other
 * situations) before the rdataset's TTL has expired, it MUST
 * respect the RETAIN bit and not expire the data until its TTL is
 * expired.
 */
509

510
#undef IGNORE                   /* WIN32 winbase.h defines this. */
511

512
#define EXISTS(header) \
Automatic Updater's avatar
Automatic Updater committed
513
	(((header)->attributes & RDATASET_ATTR_NONEXISTENT) == 0)
514
#define NONEXISTENT(header) \
Automatic Updater's avatar
Automatic Updater committed
515
	(((header)->attributes & RDATASET_ATTR_NONEXISTENT) != 0)
516
#define IGNORE(header) \
Automatic Updater's avatar
Automatic Updater committed
517
	(((header)->attributes & RDATASET_ATTR_IGNORE) != 0)
Michael Graff's avatar
Michael Graff committed
518
#define RETAIN(header) \
Automatic Updater's avatar
Automatic Updater committed
519
	(((header)->attributes & RDATASET_ATTR_RETAIN) != 0)
520
#define NXDOMAIN(header) \
Automatic Updater's avatar
Automatic Updater committed
521
	(((header)->attributes & RDATASET_ATTR_NXDOMAIN) != 0)
522 523
#define STALE(header) \
	(((header)->attributes & RDATASET_ATTR_STALE) != 0)
524
#define RESIGN(header) \
Automatic Updater's avatar
Automatic Updater committed
525
	(((header)->attributes & RDATASET_ATTR_RESIGN) != 0)
526 527
#define OPTOUT(header) \
	(((header)->attributes & RDATASET_ATTR_OPTOUT) != 0)
528 529
#define NEGATIVE(header) \
	(((header)->attributes & RDATASET_ATTR_NEGATIVE) != 0)
530 531
#define PREFETCH(header) \
	(((header)->attributes & RDATASET_ATTR_PREFETCH) != 0)
532 533
#define CASESET(header) \
	(((header)->attributes & RDATASET_ATTR_CASESET) != 0)
534 535
#define ZEROTTL(header) \
	(((header)->attributes & RDATASET_ATTR_ZEROTTL) != 0)
536 537
#define CASEFULLYLOWER(header) \
	(((header)->attributes & RDATASET_ATTR_CASEFULLYLOWER) != 0)
538 539
#define ANCIENT(header) \
	(((header)->attributes & RDATASET_ATTR_ANCIENT) != 0)
540

541 542 543 544
#define ACTIVE(header, now) \
	(((header)->rdh_ttl > (now)) || \
	 ((header)->rdh_ttl == (now) && ZEROTTL(header)))

545
#define DEFAULT_NODE_LOCK_COUNT         7       /*%< Should be prime. */
546
#define RBTDB_GLUE_TABLE_INIT_SIZE     2U
547 548 549 550 551 552 553 554 555 556 557 558 559

/*%
 * Number of buckets for cache DB entries (locks, LRU lists, TTL heaps).
 * There is a tradeoff issue about configuring this value: if this is too
 * small, it may cause heavier contention between threads; if this is too large,
 * LRU purge algorithm won't work well (entries tend to be purged prematurely).
 * The default value should work well for most environments, but this can
 * also be configurable at compilation time via the
 * DNS_RBTDB_CACHE_NODE_LOCK_COUNT variable.  This value must be larger than
 * 1 due to the assumption of overmem_purge().
 */
#ifdef DNS_RBTDB_CACHE_NODE_LOCK_COUNT
#if DNS_RBTDB_CACHE_NODE_LOCK_COUNT <= 1
560
#error "DNS_RBTDB_CACHE_NODE_LOCK_COUNT must be larger than 1"
561 562 563 564 565 566
#else
#define DEFAULT_CACHE_NODE_LOCK_COUNT DNS_RBTDB_CACHE_NODE_LOCK_COUNT
#endif
#else
#define DEFAULT_CACHE_NODE_LOCK_COUNT   16
#endif	/* DNS_RBTDB_CACHE_NODE_LOCK_COUNT */
Bob Halley's avatar
Bob Halley committed
567 568

typedef struct {
Automatic Updater's avatar
Automatic Updater committed
569 570 571 572
	nodelock_t                      lock;
	/* Protected in the refcount routines. */
	isc_refcount_t                  references;
	/* Locked by lock. */
573
	bool                   exiting;
574 575 576
} rbtdb_nodelock_t;

typedef struct rbtdb_changed {
Automatic Updater's avatar
Automatic Updater committed
577
	dns_rbtnode_t *                 node;
578
	bool                   dirty;
Automatic Updater's avatar
Automatic Updater committed
579
	ISC_LINK(struct rbtdb_changed)  link;
580 581
} rbtdb_changed_t;

582
typedef ISC_LIST(rbtdb_changed_t)       rbtdb_changedlist_t;
583

584 585 586 587 588 589
typedef enum {
	dns_db_insecure,
	dns_db_partial,
	dns_db_secure
} dns_db_secure_t;

590 591
typedef struct dns_rbtdb dns_rbtdb_t;

592 593 594 595 596 597 598
/* Reason for expiring a record from cache */
typedef enum {
	expire_lru,
	expire_ttl,
	expire_flush
} expire_t;

599 600 601 602 603 604 605 606
typedef struct rbtdb_glue rbtdb_glue_t;

typedef struct rbtdb_glue_table_node {
	struct rbtdb_glue_table_node *next;
	dns_rbtnode_t		     *node;
	rbtdb_glue_t		     *glue_list;
} rbtdb_glue_table_node_t;

607 608 609 610 611 612
typedef enum {
	rdataset_ttl_fresh,
	rdataset_ttl_stale,
	rdataset_ttl_ancient
} rdataset_ttl_t;

613
typedef struct rbtdb_version {
Automatic Updater's avatar
Automatic Updater committed
614 615
	/* Not locked */
	rbtdb_serial_t                  serial;
616
	dns_rbtdb_t *			rbtdb;
Automatic Updater's avatar
Automatic Updater committed
617 618 619 620 621 622 623
	/*
	 * Protected in the refcount routines.
	 * XXXJT: should we change the lock policy based on the refcount
	 * performance?
	 */
	isc_refcount_t                  references;
	/* Locked by database lock. */
624 625
	bool                   writer;
	bool                   commit_ok;
Automatic Updater's avatar
Automatic Updater committed
626
	rbtdb_changedlist_t             changed_list;
627
	rdatasetheaderlist_t		resigned_list;
Automatic Updater's avatar
Automatic Updater committed
628
	ISC_LINK(struct rbtdb_version)  link;
629
	dns_db_secure_t			secure;
630
	bool			havensec3;
631 632
	/* NSEC3 parameters */
	dns_hash_t			hash;
633 634 635
	uint8_t			flags;
	uint16_t			iterations;
	uint8_t			salt_length;
636
	unsigned char			salt[DNS_NSEC3_SALTSIZE];
637 638 639 640 641

	/*
	 * records and bytes are covered by rwlock.
	 */
	isc_rwlock_t                    rwlock;
642 643
	uint64_t			records;
	uint64_t			bytes;
644 645 646 647 648

	isc_rwlock_t                    glue_rwlock;
	size_t                          glue_table_size;
	size_t                          glue_table_nodecount;
	rbtdb_glue_table_node_t         **glue_table;
649 650
} rbtdb_version_t;

651 652
typedef ISC_LIST(rbtdb_version_t)       rbtdb_versionlist_t;

653
struct dns_rbtdb {
Automatic Updater's avatar
Automatic Updater committed
654 655
	/* Unlocked. */
	dns_db_t                        common;
656
	/* Locks the data in this struct */
657
#if DNS_RBTDB_USERWLOCK
Automatic Updater's avatar
Automatic Updater committed
658
	isc_rwlock_t                    lock;
659
#else
Automatic Updater's avatar
Automatic Updater committed
660
	isc_mutex_t                     lock;
661
#endif
662
	/* Locks the tree structure (prevents nodes appearing/disappearing) */
Automatic Updater's avatar
Automatic Updater committed
663
	isc_rwlock_t                    tree_lock;
664
	/* Locks for individual tree nodes */
Automatic Updater's avatar
Automatic Updater committed
665 666 667
	unsigned int                    node_lock_count;
	rbtdb_nodelock_t *              node_locks;
	dns_rbtnode_t *                 origin_node;
668
	dns_rbtnode_t *			nsec3_origin_node;
669
	dns_stats_t *			rrsetstats; /* cache DB only */
670
	isc_stats_t *			cachestats; /* cache DB only */
671
	isc_stats_t *			gluecachestats; /* zone DB only */
Automatic Updater's avatar
Automatic Updater committed
672 673 674 675 676 677 678 679 680 681 682 683 684 685
	/* Locked by lock. */
	unsigned int                    active;
	isc_refcount_t                  references;
	unsigned int                    attributes;
	rbtdb_serial_t                  current_serial;
	rbtdb_serial_t                  least_serial;
	rbtdb_serial_t                  next_serial;
	rbtdb_version_t *               current_version;
	rbtdb_version_t *               future_version;
	rbtdb_versionlist_t             open_versions;
	isc_task_t *                    task;
	dns_dbnode_t                    *soanode;
	dns_dbnode_t                    *nsnode;

686 687 688 689 690 691
	/*
	 * Maximum length of time to keep using a stale answer past its
	 * normal TTL expiry.
	*/
	dns_ttl_t			serve_stale_ttl;

Automatic Updater's avatar
Automatic Updater committed
692 693 694 695 696 697
	/*
	 * This is a linked list used to implement the LRU cache.  There will
	 * be node_lock_count linked lists here.  Nodes in bucket 1 will be
	 * placed on the linked list rdatasets[1].
	 */
	rdatasetheaderlist_t            *rdatasets;
698

Automatic Updater's avatar
Automatic Updater committed
699 700
	/*%
	 * Temporary storage for stale cache nodes and dynamically deleted
701
	 * nodes that await being cleaned up.
Automatic Updater's avatar
Automatic Updater committed
702
	 */
Automatic Updater's avatar
Automatic Updater committed
703 704 705
	rbtnodelist_t                   *deadnodes;

	/*
706 707 708 709
	 * Heaps.  These are used for TTL based expiry in a cache,
	 * or for zone resigning in a zone DB.  hmctx is the memory
	 * context to use for the heap (which differs from the main
	 * database memory context in the case of a cache).
Automatic Updater's avatar
Automatic Updater committed
710
	 */
711
	isc_mem_t			*hmctx;
Automatic Updater's avatar
Automatic Updater committed
712
	isc_heap_t                      **heaps;
Tinderbox User's avatar
Tinderbox User committed
713

714 715 716 717
	/*
	 * Base values for the mmap() code.
	 */
	void *				mmap_location;
718
	size_t				mmap_size;
Automatic Updater's avatar
Automatic Updater committed
719 720 721

	/* Locked by tree_lock. */
	dns_rbt_t *                     tree;
722
	dns_rbt_t *			nsec;
723
	dns_rbt_t *			nsec3;
Automatic Updater's avatar
Automatic Updater committed
724 725 726

	/* Unlocked */
	unsigned int                    quantum;
727
};
Bob Halley's avatar
Bob Halley committed
728

729 730
#define RBTDB_ATTR_LOADED               0x01
#define RBTDB_ATTR_LOADING              0x02
731

732 733
#define KEEPSTALE(rbtdb) ((rbtdb)->serve_stale_ttl > 0)

734
/*%
735 736 737
 * Search Context
 */
typedef struct {
Automatic Updater's avatar
Automatic Updater committed
738 739 740 741 742
	dns_rbtdb_t *           rbtdb;
	rbtdb_version_t *       rbtversion;
	rbtdb_serial_t          serial;
	unsigned int            options;
	dns_rbtnodechain_t      chain;
743 744 745
	bool           copy_name;
	bool           need_cleanup;
	bool           wild;
Automatic Updater's avatar
Automatic Updater committed
746 747 748 749 750
	dns_rbtnode_t *         zonecut;
	rdatasetheader_t *      zonecut_rdataset;
	rdatasetheader_t *      zonecut_sigrdataset;
	dns_fixedname_t         zonecut_name;
	isc_stdtime_t           now;
751 752
} rbtdb_search_t;

753
/*%
754 755 756
 * Load Context
 */
typedef struct {
Automatic Updater's avatar
Automatic Updater committed
757 758
	dns_rbtdb_t *           rbtdb;
	isc_stdtime_t           now;
759 760
} rbtdb_load_t;

761
static void delete_callback(void *data, void *arg);
Bob Halley's avatar
Bob Halley committed
762
static void rdataset_disassociate(dns_rdataset_t *rdataset);
763 764
static isc_result_t rdataset_first(dns_rdataset_t *rdataset);
static isc_result_t rdataset_next(dns_rdataset_t *rdataset);
Bob Halley's avatar
Bob Halley committed
765
static void rdataset_current(dns_rdataset_t *rdataset, dns_rdata_t *rdata);
Bob Halley's avatar
Bob Halley committed
766
static void rdataset_clone(dns_rdataset_t *source, dns_rdataset_t *target);
Bob Halley's avatar
Bob Halley committed
767
static unsigned int rdataset_count(dns_rdataset_t *rdataset);
768
static isc_result_t rdataset_getnoqname(dns_rdataset_t *rdataset,
Automatic Updater's avatar
Automatic Updater committed
769
					dns_name_t *name,
770 771 772 773 774 775
					dns_rdataset_t *neg,
					dns_rdataset_t *negsig);
static isc_result_t rdataset_getclosest(dns_rdataset_t *rdataset,
					dns_name_t *name,
					dns_rdataset_t *neg,
					dns_rdataset_t *negsig);
776
static inline bool need_headerupdate(rdatasetheader_t *header,
Automatic Updater's avatar
Automatic Updater committed
777
					      isc_stdtime_t now);
778
static void update_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header,
Automatic Updater's avatar
Automatic Updater committed
779
			  isc_stdtime_t now);
780
static void expire_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header,
781
			  bool tree_locked, expire_t reason);
782
static void overmem_purge(dns_rbtdb_t *rbtdb, unsigned int locknum_start,
783
			  isc_stdtime_t now, bool tree_locked);
784 785
static isc_result_t resign_insert(dns_rbtdb_t *rbtdb, int idx,
				  rdatasetheader_t *newheader);
786 787
static void resign_delete(dns_rbtdb_t *rbtdb, rbtdb_version_t *version,
			  rdatasetheader_t *header);
788
static void prune_tree(isc_task_t *task, isc_event_t *event);
789 790
static void rdataset_settrust(dns_rdataset_t *rdataset, dns_trust_t trust);
static void rdataset_expire(dns_rdataset_t *rdataset);
791
static void rdataset_clearprefetch(dns_rdataset_t *rdataset);
792 793 794 795
static void rdataset_setownercase(dns_rdataset_t *rdataset,
				  const dns_name_t *name);
static void rdataset_getownercase(const dns_rdataset_t *rdataset,
				  dns_name_t *name);
796 797 798 799 800
static isc_result_t rdataset_addglue(dns_rdataset_t *rdataset,
				     dns_dbversion_t *version,
				     unsigned int options,
				     dns_message_t *msg);
static void free_gluetable(rbtdb_version_t *version);
801 802

static dns_rdatasetmethods_t rdataset_methods = {
Automatic Updater's avatar
Automatic Updater committed
803 804 805 806 807 808
	rdataset_disassociate,
	rdataset_first,
	rdataset_next,
	rdataset_current,
	rdataset_clone,
	rdataset_count,
809
	NULL, /* addnoqname */
Automatic Updater's avatar
Automatic Updater committed
810
	rdataset_getnoqname,
811
	NULL, /* addclosest */
812
	rdataset_getclosest,
813
	rdataset_settrust,
814
	rdataset_expire,
815 816
	rdataset_clearprefetch,
	rdataset_setownercase,
817 818
	rdataset_getownercase,
	rdataset_addglue
819 820 821 822 823 824 825 826 827
};

static dns_rdatasetmethods_t slab_methods = {
	rdataset_disassociate,
	rdataset_first,
	rdataset_next,
	rdataset_current,
	rdataset_clone,
	rdataset_count,
828 829 830 831 832 833 834 835 836 837
	NULL, /* addnoqname */
	NULL, /* getnoqname */
	NULL, /* addclosest */
	NULL, /* getclosest */
	NULL, /* settrust */
	NULL, /* expire */
	NULL, /* clearprefetch */
	NULL, /* setownercase */
	NULL, /* getownercase */
	NULL  /* addglue */
Bob Halley's avatar
Bob Halley committed
838 839 840
};

static void rdatasetiter_destroy(dns_rdatasetiter_t **iteratorp);
841 842
static isc_result_t rdatasetiter_first(dns_rdatasetiter_t *iterator);
static isc_result_t rdatasetiter_next(dns_rdatasetiter_t *iterator);
Bob Halley's avatar
Bob Halley committed
843
static void rdatasetiter_current(dns_rdatasetiter_t *iterator,
Automatic Updater's avatar
Automatic Updater committed
844
				 dns_rdataset_t *rdataset);
Bob Halley's avatar
Bob Halley committed
845 846

static dns_rdatasetitermethods_t rdatasetiter_methods = {
Automatic Updater's avatar
Automatic Updater committed
847 848 849 850
	rdatasetiter_destroy,
	rdatasetiter_first,
	rdatasetiter_next,
	rdatasetiter_current
851 852
};

Bob Halley's avatar
Bob Halley committed
853
typedef struct rbtdb_rdatasetiter {
Automatic Updater's avatar
Automatic Updater committed
854 855
	dns_rdatasetiter_t              common;
	rdatasetheader_t *              current;
Bob Halley's avatar
Bob Halley committed
856 857
} rbtdb_rdatasetiter_t;

858 859 860 861 862 863 864 865 866
/*
 * Note that these iterators, unless created with either DNS_DB_NSEC3ONLY or
 * DNS_DB_NONSEC3, will transparently move between the last node of the
 * "regular" RBT ("chain" field) and the root node of the NSEC3 RBT
 * ("nsec3chain" field) of the database in question, as if the latter was a
 * successor to the former in lexical order.  The "current" field always holds
 * the address of either "chain" or "nsec3chain", depending on which RBT is
 * being traversed at given time.
 */
867 868 869 870
static void             dbiterator_destroy(dns_dbiterator_t **iteratorp);
static isc_result_t     dbiterator_first(dns_dbiterator_t *iterator);
static isc_result_t     dbiterator_last(dns_dbiterator_t *iterator);
static isc_result_t     dbiterator_seek(dns_dbiterator_t *iterator,
871
					const dns_name_t *name);
872 873 874
static isc_result_t     dbiterator_prev(dns_dbiterator_t *iterator);
static isc_result_t     dbiterator_next(dns_dbiterator_t *iterator);
static isc_result_t     dbiterator_current(dns_dbiterator_t *iterator,
Automatic Updater's avatar
Automatic Updater committed
875 876
					   dns_dbnode_t **nodep,
					   dns_name_t *name);
877 878
static isc_result_t     dbiterator_pause(dns_dbiterator_t *iterator);
static isc_result_t     dbiterator_origin(dns_dbiterator_t *iterator,
Automatic Updater's avatar
Automatic Updater committed
879
					  dns_name_t *name);
880 881

static dns_dbiteratormethods_t dbiterator_methods = {
Automatic Updater's avatar
Automatic Updater committed
882 883 884 885 886 887 888 889 890
	dbiterator_destroy,
	dbiterator_first,
	dbiterator_last,
	dbiterator_seek,
	dbiterator_prev,
	dbiterator_next,
	dbiterator_current,
	dbiterator_pause,
	dbiterator_origin
891 892
};

893 894
#define DELETION_BATCH_MAX 64

895
/*
896
 * If 'paused' is true, then the tree lock is not being held.
897
 */
898
typedef struct rbtdb_dbiterator {
Automatic Updater's avatar
Automatic Updater committed
899
	dns_dbiterator_t                common;
900 901
	bool                   paused;
	bool                   new_origin;
Automatic Updater's avatar
Automatic Updater committed
902 903 904 905 906
	isc_rwlocktype_t                tree_locked;
	isc_result_t                    result;
	dns_fixedname_t                 name;
	dns_fixedname_t                 origin;
	dns_rbtnodechain_t              chain;
907 908
	dns_rbtnodechain_t		nsec3chain;
	dns_rbtnodechain_t		*current;
Automatic Updater's avatar
Automatic Updater committed
909 910
	dns_rbtnode_t                   *node;
	dns_rbtnode_t                   *deletions[DELETION_BATCH_MAX];
911
	int                             delcnt;
912 913
	bool			nsec3only;
	bool			nonsec3;
914 915 916
} rbtdb_dbiterator_t;


917 918
#define IS_STUB(rbtdb)  (((rbtdb)->common.attributes & DNS_DBATTR_STUB)  != 0)
#define IS_CACHE(rbtdb) (((rbtdb)->common.attributes & DNS_DBATTR_CACHE) != 0)
919

920
static void free_rbtdb(dns_rbtdb_t *rbtdb, bool log,
Automatic Updater's avatar
Automatic Updater committed
921
		       isc_event_t *event);
922
static void overmem(dns_db_t *db, bool over);
923
static void setnsec3parameters(dns_db_t *db, rbtdb_version_t *version);
924
static void setownercase(rdatasetheader_t *header, const dns_name_t *name);
925

926
static bool match_header_version(rbtdb_file_header_t *header);
927

928 929 930
/* Pad to 32 bytes */
static char FILE_VERSION[32] = "\0";