rbtdb.c 269 KB
Newer Older
Bob Halley's avatar
Bob Halley committed
1
/*
Tinderbox User's avatar
Tinderbox User committed
2
 * Copyright (C) 2004-2014  Internet Systems Consortium, Inc. ("ISC")
Mark Andrews's avatar
Mark Andrews committed
3
 * Copyright (C) 1999-2003  Internet Software Consortium.
4
 *
Automatic Updater's avatar
Automatic Updater committed
5
 * Permission to use, copy, modify, and/or distribute this software for any
Bob Halley's avatar
Bob Halley committed
6
7
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
8
 *
Mark Andrews's avatar
Mark Andrews committed
9
10
11
12
13
14
15
 * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
 * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
 * PERFORMANCE OF THIS SOFTWARE.
Bob Halley's avatar
Bob Halley committed
16
17
 */

18
/*! \file */
David Lawrence's avatar
David Lawrence committed
19

20
21
22
23
/*
 * Principal Author: Bob Halley
 */

24
25
#include <config.h>

Mark Andrews's avatar
Mark Andrews committed
26
/* #define inline */
27

Mark Andrews's avatar
Mark Andrews committed
28
29
30
31
#ifdef HAVE_INTTYPES_H
#include <inttypes.h> /* uintptr_t */
#endif

32
#include <isc/crc64.h>
33
#include <isc/event.h>
34
#include <isc/heap.h>
35
#include <isc/file.h>
Evan Hunt's avatar
Evan Hunt committed
36
#include <isc/hex.h>
37
#include <isc/mem.h>
38
#include <isc/mutex.h>
39
#include <isc/once.h>
40
#include <isc/platform.h>
41
#include <isc/print.h>
42
#include <isc/random.h>
43
#include <isc/refcount.h>
Bob Halley's avatar
Bob Halley committed
44
#include <isc/rwlock.h>
45
#include <isc/serial.h>
46
47
#include <isc/socket.h>
#include <isc/stdio.h>
48
#include <isc/string.h>
49
#include <isc/task.h>
50
#include <isc/time.h>
Michael Graff's avatar
Michael Graff committed
51
#include <isc/util.h>
Bob Halley's avatar
Bob Halley committed
52

53
#include <dns/acache.h>
54
#include <dns/callbacks.h>
55
56
#include <dns/db.h>
#include <dns/dbiterator.h>
57
#include <dns/events.h>
58
#include <dns/fixedname.h>
59
#include <dns/lib.h>
60
#include <dns/log.h>
61
#include <dns/masterdump.h>
62
#include <dns/nsec.h>
63
#include <dns/nsec3.h>
64
#include <dns/rbt.h>
65
#include <dns/rpz.h>
66
#include <dns/rdata.h>
Bob Halley's avatar
Bob Halley committed
67
68
#include <dns/rdataset.h>
#include <dns/rdatasetiter.h>
69
#include <dns/rdataslab.h>
70
#include <dns/rdatastruct.h>
71
#include <dns/result.h>
72
#include <dns/stats.h>
73
#include <dns/version.h>
74
75
#include <dns/view.h>
#include <dns/zone.h>
76
#include <dns/zonekey.h>
Bob Halley's avatar
Bob Halley committed
77

Evan Hunt's avatar
Evan Hunt committed
78
#ifndef WIN32
79
#include <sys/mman.h>
Evan Hunt's avatar
Evan Hunt committed
80
81
82
83
84
85
#else
#define PROT_READ	0x01
#define PROT_WRITE	0x02
#define MAP_PRIVATE	0x0002
#define MAP_FAILED	((void *)-1)
#endif
86

87
88
89
#ifdef DNS_RBTDB_VERSION64
#include "rbtdb64.h"
#else
Bob Halley's avatar
Bob Halley committed
90
#include "rbtdb.h"
91
#endif
Bob Halley's avatar
Bob Halley committed
92

93
#ifdef DNS_RBTDB_VERSION64
94
#define RBTDB_MAGIC                     ISC_MAGIC('R', 'B', 'D', '8')
95
#else
96
#define RBTDB_MAGIC                     ISC_MAGIC('R', 'B', 'D', '4')
97
98
#endif

99
100
101
102
103
104
#define CHECK(op) \
	do { result = (op); \
		if (result != ISC_R_SUCCESS) goto failure; \
	} while (0)

/*
Evan Hunt's avatar
Evan Hunt committed
105
 * This is the map file header for RBTDB images.  It is populated, and then
106
107
108
109
110
111
112
 * written, as the LAST thing done to the file.  Writing this last (with
 * zeros in the header area initially) will ensure that the header is only
 * valid when the RBTDB image is also valid.
 */
typedef struct rbtdb_file_header rbtdb_file_header_t;

/* Header length, always the same size regardless of structure size */
113
#define RBTDB_HEADER_LENGTH	1024
114
115
116
117
118
119
120
121
122
123
124
125
126

struct rbtdb_file_header {
	char version1[32];
	isc_uint32_t ptrsize;
	unsigned int bigendian:1;
	isc_uint64_t tree;
	isc_uint64_t nsec;
	isc_uint64_t nsec3;

	char version2[32];  		/* repeated; must match version1 */
};


127
/*%
128
129
130
 * Note that "impmagic" is not the first four bytes of the struct, so
 * ISC_MAGIC_VALID cannot be used.
 */
131
#define VALID_RBTDB(rbtdb)      ((rbtdb) != NULL && \
Automatic Updater's avatar
Automatic Updater committed
132
				 (rbtdb)->common.impmagic == RBTDB_MAGIC)
Bob Halley's avatar
Bob Halley committed
133

134
#ifdef DNS_RBTDB_VERSION64
135
typedef isc_uint64_t                    rbtdb_serial_t;
136
/*%
137
138
139
140
141
142
 * Make casting easier in symbolic debuggers by using different names
 * for the 64 bit version.
 */
#define dns_rbtdb_t dns_rbtdb64_t
#define rdatasetheader_t rdatasetheader64_t
#define rbtdb_version_t rbtdb_version64_t
143
144

#define once once64
145
#define FILE_VERSION FILE_VERSION64
Mark Andrews's avatar
Mark Andrews committed
146
#define init_count init_count64
147
148
149
150
151
152
153
154
155
156

#define cache_methods cache_methods64
#define dbiterator_methods dbiterator_methods64
#define rdataset_methods rdataset_methods64
#define rdatasetiter_methods rdatasetiter_methods64
#define zone_methods zone_methods64

#define acache_callback acache_callback64
#define acache_cancelentry acache_cancelentry64
#define activeempty activeempty64
157
#define activeemtpynode activeemtpynode64
158
#define add32 add64
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
#define add_changed add_changed64
#define add_empty_wildcards add_empty_wildcards64
#define add_wildcard_magic add_wildcard_magic64
#define addrdataset addrdataset64
#define allrdatasets allrdatasets64
#define attach attach64
#define attachnode attachnode64
#define attachversion attachversion64
#define beginload beginload64
#define bind_rdataset bind_rdataset64
#define cache_find cache_find64
#define cache_findrdataset cache_findrdataset64
#define cache_findzonecut cache_findzonecut64
#define cache_zonecut_callback cache_zonecut_callback64
#define cleanup_dead_nodes cleanup_dead_nodes64
#define cleanup_dead_nodes_callback cleanup_dead_nodes_callback64
#define closeversion closeversion64
#define createiterator createiterator64
#define currentversion currentversion64
#define dbiterator_current dbiterator_current64
#define dbiterator_destroy dbiterator_destroy64
#define dbiterator_first dbiterator_first64
#define dbiterator_last dbiterator_last64
#define dbiterator_next dbiterator_next64
#define dbiterator_origin dbiterator_origin64
#define dbiterator_pause dbiterator_pause64
#define dbiterator_prev dbiterator_prev64
#define dbiterator_seek dbiterator_seek64
#define decrement_reference decrement_reference64
#define delete_callback delete_callback64
#define delete_node delete_node64
#define deleterdataset deleterdataset64
191
#define deserialize32 deserialize64
192
193
194
195
196
197
198
#define detach detach64
#define detachnode detachnode64
#define dump dump64
#define endload endload64
#define expire_header expire_header64
#define expirenode expirenode64
#define find_closest_nsec find_closest_nsec64
199
#define find_coveringnsec find_coveringnsec64
200
201
202
203
204
#define find_deepest_zonecut find_deepest_zonecut64
#define findnode findnode64
#define findnodeintree findnodeintree64
#define findnsec3node findnsec3node64
#define flush_deletions flush_deletions64
205
#define free_acachearray free_acachearray64
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
#define free_noqname free_noqname64
#define free_rbtdb free_rbtdb64
#define free_rbtdb_callback free_rbtdb_callback64
#define free_rdataset free_rdataset64
#define getnsec3parameters getnsec3parameters64
#define getoriginnode getoriginnode64
#define getrrsetstats getrrsetstats64
#define getsigningtime getsigningtime64
#define hashsize hashsize64
#define init_file_version init_file_version64
#define isdnssec isdnssec64
#define ispersistent ispersistent64
#define issecure issecure64
#define iszonesecure iszonesecure64
#define loading_addrdataset loading_addrdataset64
#define loadnode loadnode64
#define matchparams matchparams64
#define maybe_free_rbtdb maybe_free_rbtdb64
224
#define new_reference new_reference64
225
226
227
228
229
230
231
232
#define newversion newversion64
#define nodecount nodecount64
#define overmem overmem64
#define previous_closest_nsec previous_closest_nsec64
#define printnode printnode64
#define prune_tree prune_tree64
#define rbt_datafixer rbt_datafixer64
#define rbt_datawriter rbt_datawriter64
233
#define rdataset_clearprefetch rdataset_clearprefetch64
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
#define rdataset_clone rdataset_clone64
#define rdataset_count rdataset_count64
#define rdataset_current rdataset_current64
#define rdataset_disassociate rdataset_disassociate64
#define rdataset_expire rdataset_expire64
#define rdataset_first rdataset_first64
#define rdataset_getadditional rdataset_getadditional64
#define rdataset_getclosest rdataset_getclosest64
#define rdataset_getnoqname rdataset_getnoqname64
#define rdataset_next rdataset_next64
#define rdataset_putadditional rdataset_putadditional64
#define rdataset_setadditional rdataset_setadditional64
#define rdataset_settrust rdataset_settrust64
#define rdatasetiter_current rdatasetiter_current64
#define rdatasetiter_destroy rdatasetiter_destroy64
#define rdatasetiter_first rdatasetiter_first64
#define rdatasetiter_next rdatasetiter_next64
#define reactivate_node reactivate_node64
#define resign_delete resign_delete64
#define resign_insert resign_insert64
#define resign_sooner resign_sooner64
#define resigned resigned64
#define rpz_attach rpz_attach64
#define rpz_ready rpz_ready64
#define serialize serialize64
#define set_index set_index64
260
#define set_ttl set_ttl64
261
262
263
264
265
266
#define setcachestats setcachestats64
#define setsigningtime setsigningtime64
#define settask settask64
#define setup_delegation setup_delegation64
#define subtractrdataset subtractrdataset64
#define ttl_sooner ttl_sooner64
267
#define update_cachestats update_cachestats64
268
#define update_header update_header64
269
270
#define update_newheader update_newheader64
#define update_rrsetstats update_rrsetstats64
271
272
273
274
275
#define zone_find zone_find64
#define zone_findrdataset zone_findrdataset64
#define zone_findzonecut zone_findzonecut64
#define zone_zonecut_callback zone_zonecut_callback64

276
#else
277
typedef isc_uint32_t                    rbtdb_serial_t;
278
279
#endif

280
typedef isc_uint32_t                    rbtdb_rdatatype_t;
Bob Halley's avatar
Bob Halley committed
281

282
283
#define RBTDB_RDATATYPE_BASE(type)      ((dns_rdatatype_t)((type) & 0xFFFF))
#define RBTDB_RDATATYPE_EXT(type)       ((dns_rdatatype_t)((type) >> 16))
284
#define RBTDB_RDATATYPE_VALUE(b, e)     ((rbtdb_rdatatype_t)((e) << 16) | (b))
Bob Halley's avatar
Bob Halley committed
285

286
#define RBTDB_RDATATYPE_SIGNSEC \
Automatic Updater's avatar
Automatic Updater committed
287
		RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_nsec)
288
289
#define RBTDB_RDATATYPE_SIGNSEC3 \
		RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_nsec3)
Bob Halley's avatar
Bob Halley committed
290
#define RBTDB_RDATATYPE_SIGNS \
Automatic Updater's avatar
Automatic Updater committed
291
		RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_ns)
Bob Halley's avatar
Bob Halley committed
292
#define RBTDB_RDATATYPE_SIGCNAME \
Automatic Updater's avatar
Automatic Updater committed
293
		RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_cname)
294
#define RBTDB_RDATATYPE_SIGDNAME \
Automatic Updater's avatar
Automatic Updater committed
295
		RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_dname)
296
297
#define RBTDB_RDATATYPE_SIGDDS \
		RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_ds)
298
#define RBTDB_RDATATYPE_NCACHEANY \
Automatic Updater's avatar
Automatic Updater committed
299
		RBTDB_RDATATYPE_VALUE(0, dns_rdatatype_any)
Bob Halley's avatar
Bob Halley committed
300

301
/*
302
 * We use rwlock for DB lock only when ISC_RWLOCK_USEATOMIC is non 0.
303
 * Using rwlock is effective with regard to lookup performance only when
304
 * it is implemented in an efficient way.
305
306
307
308
 * Otherwise, it is generally wise to stick to the simple locking since rwlock
 * would require more memory or can even make lookups slower due to its own
 * overhead (when it internally calls mutex locks).
 */
309
#ifdef ISC_RWLOCK_USEATOMIC
310
311
312
313
314
315
#define DNS_RBTDB_USERWLOCK 1
#else
#define DNS_RBTDB_USERWLOCK 0
#endif

#if DNS_RBTDB_USERWLOCK
316
317
318
319
#define RBTDB_INITLOCK(l)       isc_rwlock_init((l), 0, 0)
#define RBTDB_DESTROYLOCK(l)    isc_rwlock_destroy(l)
#define RBTDB_LOCK(l, t)        RWLOCK((l), (t))
#define RBTDB_UNLOCK(l, t)      RWUNLOCK((l), (t))
320
#else
321
322
323
324
#define RBTDB_INITLOCK(l)       isc_mutex_init(l)
#define RBTDB_DESTROYLOCK(l)    DESTROYLOCK(l)
#define RBTDB_LOCK(l, t)        LOCK(l)
#define RBTDB_UNLOCK(l, t)      UNLOCK(l)
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
#endif

/*
 * Since node locking is sensitive to both performance and memory footprint,
 * we need some trick here.  If we have both high-performance rwlock and
 * high performance and small-memory reference counters, we use rwlock for
 * node lock and isc_refcount for node references.  In this case, we don't have
 * to protect the access to the counters by locks.
 * Otherwise, we simply use ordinary mutex lock for node locking, and use
 * simple integers as reference counters which is protected by the lock.
 * In most cases, we can simply use wrapper macros such as NODE_LOCK and
 * NODE_UNLOCK.  In some other cases, however, we need to protect reference
 * counters first and then protect other parts of a node as read-only data.
 * Special additional macros, NODE_STRONGLOCK(), NODE_WEAKLOCK(), etc, are also
 * provided for these special cases.  When we can use the efficient backend
 * routines, we should only protect the "other members" by NODE_WEAKLOCK(read).
 * Otherwise, we should use NODE_STRONGLOCK() to protect the entire critical
 * section including the access to the reference counter.
 * Note that we cannot use NODE_LOCK()/NODE_UNLOCK() wherever the protected
 * section is also protected by NODE_STRONGLOCK().
 */
#if defined(ISC_RWLOCK_USEATOMIC) && defined(DNS_RBT_USEISCREFCOUNT)
typedef isc_rwlock_t nodelock_t;

349
350
351
352
353
354
355
356
357
358
359
#define NODE_INITLOCK(l)        isc_rwlock_init((l), 0, 0)
#define NODE_DESTROYLOCK(l)     isc_rwlock_destroy(l)
#define NODE_LOCK(l, t)         RWLOCK((l), (t))
#define NODE_UNLOCK(l, t)       RWUNLOCK((l), (t))
#define NODE_TRYUPGRADE(l)      isc_rwlock_tryupgrade(l)

#define NODE_STRONGLOCK(l)      ((void)0)
#define NODE_STRONGUNLOCK(l)    ((void)0)
#define NODE_WEAKLOCK(l, t)     NODE_LOCK(l, t)
#define NODE_WEAKUNLOCK(l, t)   NODE_UNLOCK(l, t)
#define NODE_WEAKDOWNGRADE(l)   isc_rwlock_downgrade(l)
360
361
362
#else
typedef isc_mutex_t nodelock_t;

363
364
365
366
367
368
369
370
371
372
373
#define NODE_INITLOCK(l)        isc_mutex_init(l)
#define NODE_DESTROYLOCK(l)     DESTROYLOCK(l)
#define NODE_LOCK(l, t)         LOCK(l)
#define NODE_UNLOCK(l, t)       UNLOCK(l)
#define NODE_TRYUPGRADE(l)      ISC_R_SUCCESS

#define NODE_STRONGLOCK(l)      LOCK(l)
#define NODE_STRONGUNLOCK(l)    UNLOCK(l)
#define NODE_WEAKLOCK(l, t)     ((void)0)
#define NODE_WEAKUNLOCK(l, t)   ((void)0)
#define NODE_WEAKDOWNGRADE(l)   ((void)0)
374
375
#endif

376
377
378
379
380
381
382
383
384
/*%
 * Whether to rate-limit updating the LRU to avoid possible thread contention.
 * Our performance measurement has shown the cost is marginal, so it's defined
 * to be 0 by default either with or without threads.
 */
#ifndef DNS_RBTDB_LIMITLRUUPDATE
#define DNS_RBTDB_LIMITLRUUPDATE 0
#endif

385
/*
386
 * Allow clients with a virtual time of up to 5 minutes in the past to see
387
388
389
390
 * records that would have otherwise have expired.
 */
#define RBTDB_VIRTUAL 300

391
struct noqname {
392
393
394
395
	dns_name_t 	name;
	void *     	neg;
	void *     	negsig;
	dns_rdatatype_t	type;
396
397
};

398
typedef struct acachectl acachectl_t;
399

Bob Halley's avatar
Bob Halley committed
400
typedef struct rdatasetheader {
Automatic Updater's avatar
Automatic Updater committed
401
402
403
404
405
406
407
408
409
	/*%
	 * Locked by the owning node's lock.
	 */
	rbtdb_serial_t                  serial;
	dns_ttl_t                       rdh_ttl;
	rbtdb_rdatatype_t               type;
	isc_uint16_t                    attributes;
	dns_trust_t                     trust;
	struct noqname                  *noqname;
410
	struct noqname                  *closest;
411
412
413
	unsigned int 			is_mmapped : 1;
	unsigned int 			next_is_relative : 1;
	unsigned int 			node_is_relative : 1;
Automatic Updater's avatar
Automatic Updater committed
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
	/*%<
	 * We don't use the LIST macros, because the LIST structure has
	 * both head and tail pointers, and is doubly linked.
	 */

	struct rdatasetheader           *next;
	/*%<
	 * If this is the top header for an rdataset, 'next' points
	 * to the top header for the next rdataset (i.e., the next type).
	 * Otherwise, it points up to the header whose down pointer points
	 * at this header.
	 */

	struct rdatasetheader           *down;
	/*%<
	 * Points to the header for the next older version of
	 * this rdataset.
	 */

	isc_uint32_t                    count;
	/*%<
	 * Monotonously increased every time this rdataset is bound so that
	 * it is used as the base of the starting point in DNS responses
	 * when the "cyclic" rrset-order is required.  Since the ordering
	 * should not be so crucial, no lock is set for the counter for
	 * performance reasons.
	 */

	acachectl_t                     *additional_auth;
	acachectl_t                     *additional_glue;

	dns_rbtnode_t                   *node;
	isc_stdtime_t                   last_used;
447
	ISC_LINK(struct rdatasetheader) link;
Automatic Updater's avatar
Automatic Updater committed
448
449
450
451
452

	unsigned int                    heap_index;
	/*%<
	 * Used for TTL-based cache cleaning.
	 */
Automatic Updater's avatar
Automatic Updater committed
453
	isc_stdtime_t                   resign;
Bob Halley's avatar
Bob Halley committed
454
455
} rdatasetheader_t;

456
457
458
459
460
461
462
463
typedef ISC_LIST(rdatasetheader_t)      rdatasetheaderlist_t;
typedef ISC_LIST(dns_rbtnode_t)         rbtnodelist_t;

#define RDATASET_ATTR_NONEXISTENT       0x0001
#define RDATASET_ATTR_STALE             0x0002
#define RDATASET_ATTR_IGNORE            0x0004
#define RDATASET_ATTR_RETAIN            0x0008
#define RDATASET_ATTR_NXDOMAIN          0x0010
464
#define RDATASET_ATTR_RESIGN            0x0020
465
#define RDATASET_ATTR_STATCOUNT         0x0040
466
#define RDATASET_ATTR_OPTOUT            0x0080
467
#define RDATASET_ATTR_NEGATIVE          0x0100
468
#define RDATASET_ATTR_PREFETCH          0x0200
Michael Graff's avatar
Michael Graff committed
469

470
typedef struct acache_cbarg {
Automatic Updater's avatar
Automatic Updater committed
471
472
473
474
475
	dns_rdatasetadditional_t        type;
	unsigned int                    count;
	dns_db_t                        *db;
	dns_dbnode_t                    *node;
	rdatasetheader_t                *header;
476
477
478
} acache_cbarg_t;

struct acachectl {
Automatic Updater's avatar
Automatic Updater committed
479
480
	dns_acacheentry_t               *entry;
	acache_cbarg_t                  *cbarg;
481
482
};

Michael Graff's avatar
Michael Graff committed
483
484
485
486
487
488
489
/*
 * XXX
 * When the cache will pre-expire data (due to memory low or other
 * situations) before the rdataset's TTL has expired, it MUST
 * respect the RETAIN bit and not expire the data until its TTL is
 * expired.
 */
490

491
#undef IGNORE                   /* WIN32 winbase.h defines this. */
492

493
#define EXISTS(header) \
Automatic Updater's avatar
Automatic Updater committed
494
	(((header)->attributes & RDATASET_ATTR_NONEXISTENT) == 0)
495
#define NONEXISTENT(header) \
Automatic Updater's avatar
Automatic Updater committed
496
	(((header)->attributes & RDATASET_ATTR_NONEXISTENT) != 0)
497
#define IGNORE(header) \
Automatic Updater's avatar
Automatic Updater committed
498
	(((header)->attributes & RDATASET_ATTR_IGNORE) != 0)
Michael Graff's avatar
Michael Graff committed
499
#define RETAIN(header) \
Automatic Updater's avatar
Automatic Updater committed
500
	(((header)->attributes & RDATASET_ATTR_RETAIN) != 0)
501
#define NXDOMAIN(header) \
Automatic Updater's avatar
Automatic Updater committed
502
	(((header)->attributes & RDATASET_ATTR_NXDOMAIN) != 0)
503
#define RESIGN(header) \
Automatic Updater's avatar
Automatic Updater committed
504
	(((header)->attributes & RDATASET_ATTR_RESIGN) != 0)
505
506
#define OPTOUT(header) \
	(((header)->attributes & RDATASET_ATTR_OPTOUT) != 0)
507
508
#define NEGATIVE(header) \
	(((header)->attributes & RDATASET_ATTR_NEGATIVE) != 0)
509
510
#define PREFETCH(header) \
	(((header)->attributes & RDATASET_ATTR_PREFETCH) != 0)
511

512
#define DEFAULT_NODE_LOCK_COUNT         7       /*%< Should be prime. */
513
514
515
516
517
518
519
520
521
522
523
524
525

/*%
 * Number of buckets for cache DB entries (locks, LRU lists, TTL heaps).
 * There is a tradeoff issue about configuring this value: if this is too
 * small, it may cause heavier contention between threads; if this is too large,
 * LRU purge algorithm won't work well (entries tend to be purged prematurely).
 * The default value should work well for most environments, but this can
 * also be configurable at compilation time via the
 * DNS_RBTDB_CACHE_NODE_LOCK_COUNT variable.  This value must be larger than
 * 1 due to the assumption of overmem_purge().
 */
#ifdef DNS_RBTDB_CACHE_NODE_LOCK_COUNT
#if DNS_RBTDB_CACHE_NODE_LOCK_COUNT <= 1
526
#error "DNS_RBTDB_CACHE_NODE_LOCK_COUNT must be larger than 1"
527
528
529
530
531
532
#else
#define DEFAULT_CACHE_NODE_LOCK_COUNT DNS_RBTDB_CACHE_NODE_LOCK_COUNT
#endif
#else
#define DEFAULT_CACHE_NODE_LOCK_COUNT   16
#endif	/* DNS_RBTDB_CACHE_NODE_LOCK_COUNT */
Bob Halley's avatar
Bob Halley committed
533
534

typedef struct {
Automatic Updater's avatar
Automatic Updater committed
535
536
537
538
539
	nodelock_t                      lock;
	/* Protected in the refcount routines. */
	isc_refcount_t                  references;
	/* Locked by lock. */
	isc_boolean_t                   exiting;
540
541
542
} rbtdb_nodelock_t;

typedef struct rbtdb_changed {
Automatic Updater's avatar
Automatic Updater committed
543
544
545
	dns_rbtnode_t *                 node;
	isc_boolean_t                   dirty;
	ISC_LINK(struct rbtdb_changed)  link;
546
547
} rbtdb_changed_t;

548
typedef ISC_LIST(rbtdb_changed_t)       rbtdb_changedlist_t;
549

550
551
552
553
554
555
typedef enum {
	dns_db_insecure,
	dns_db_partial,
	dns_db_secure
} dns_db_secure_t;

556
557
typedef struct dns_rbtdb dns_rbtdb_t;

558
559
560
561
562
563
564
/* Reason for expiring a record from cache */
typedef enum {
	expire_lru,
	expire_ttl,
	expire_flush
} expire_t;

565
typedef struct rbtdb_version {
Automatic Updater's avatar
Automatic Updater committed
566
567
	/* Not locked */
	rbtdb_serial_t                  serial;
568
	dns_rbtdb_t *			rbtdb;
Automatic Updater's avatar
Automatic Updater committed
569
570
571
572
573
574
575
576
577
578
	/*
	 * Protected in the refcount routines.
	 * XXXJT: should we change the lock policy based on the refcount
	 * performance?
	 */
	isc_refcount_t                  references;
	/* Locked by database lock. */
	isc_boolean_t                   writer;
	isc_boolean_t                   commit_ok;
	rbtdb_changedlist_t             changed_list;
579
	rdatasetheaderlist_t		resigned_list;
Automatic Updater's avatar
Automatic Updater committed
580
	ISC_LINK(struct rbtdb_version)  link;
581
	dns_db_secure_t			secure;
582
583
584
585
586
587
	isc_boolean_t			havensec3;
	/* NSEC3 parameters */
	dns_hash_t			hash;
	isc_uint8_t			flags;
	isc_uint16_t			iterations;
	isc_uint8_t			salt_length;
588
	unsigned char			salt[DNS_NSEC3_SALTSIZE];
589
590
} rbtdb_version_t;

591
592
typedef ISC_LIST(rbtdb_version_t)       rbtdb_versionlist_t;

593
struct dns_rbtdb {
Automatic Updater's avatar
Automatic Updater committed
594
595
	/* Unlocked. */
	dns_db_t                        common;
596
	/* Locks the data in this struct */
597
#if DNS_RBTDB_USERWLOCK
Automatic Updater's avatar
Automatic Updater committed
598
	isc_rwlock_t                    lock;
599
#else
Automatic Updater's avatar
Automatic Updater committed
600
	isc_mutex_t                     lock;
601
#endif
602
	/* Locks the tree structure (prevents nodes appearing/disappearing) */
Automatic Updater's avatar
Automatic Updater committed
603
	isc_rwlock_t                    tree_lock;
604
	/* Locks for individual tree nodes */
Automatic Updater's avatar
Automatic Updater committed
605
606
607
	unsigned int                    node_lock_count;
	rbtdb_nodelock_t *              node_locks;
	dns_rbtnode_t *                 origin_node;
608
	dns_stats_t *			rrsetstats; /* cache DB only */
609
	isc_stats_t *			cachestats; /* cache DB only */
Automatic Updater's avatar
Automatic Updater committed
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
	/* Locked by lock. */
	unsigned int                    active;
	isc_refcount_t                  references;
	unsigned int                    attributes;
	rbtdb_serial_t                  current_serial;
	rbtdb_serial_t                  least_serial;
	rbtdb_serial_t                  next_serial;
	rbtdb_version_t *               current_version;
	rbtdb_version_t *               future_version;
	rbtdb_versionlist_t             open_versions;
	isc_task_t *                    task;
	dns_dbnode_t                    *soanode;
	dns_dbnode_t                    *nsnode;

	/*
	 * This is a linked list used to implement the LRU cache.  There will
	 * be node_lock_count linked lists here.  Nodes in bucket 1 will be
	 * placed on the linked list rdatasets[1].
	 */
	rdatasetheaderlist_t            *rdatasets;
630

Automatic Updater's avatar
Automatic Updater committed
631
632
	/*%
	 * Temporary storage for stale cache nodes and dynamically deleted
633
	 * nodes that await being cleaned up.
Automatic Updater's avatar
Automatic Updater committed
634
	 */
Automatic Updater's avatar
Automatic Updater committed
635
636
637
	rbtnodelist_t                   *deadnodes;

	/*
638
639
640
641
	 * Heaps.  These are used for TTL based expiry in a cache,
	 * or for zone resigning in a zone DB.  hmctx is the memory
	 * context to use for the heap (which differs from the main
	 * database memory context in the case of a cache).
Automatic Updater's avatar
Automatic Updater committed
642
	 */
643
	isc_mem_t *			hmctx;
Automatic Updater's avatar
Automatic Updater committed
644
	isc_heap_t                      **heaps;
Tinderbox User's avatar
Tinderbox User committed
645

646
647
648
649
	/*
	 * Base values for the mmap() code.
	 */
	void *				mmap_location;
650
	size_t				mmap_size;
Automatic Updater's avatar
Automatic Updater committed
651
652
653

	/* Locked by tree_lock. */
	dns_rbt_t *                     tree;
654
	dns_rbt_t *			nsec;
655
	dns_rbt_t *			nsec3;
656
657
658
	dns_rpz_zones_t			*rpzs;
	dns_rpz_num_t			rpz_num;
	dns_rpz_zones_t			*load_rpzs;
Automatic Updater's avatar
Automatic Updater committed
659
660
661

	/* Unlocked */
	unsigned int                    quantum;
662
};
Bob Halley's avatar
Bob Halley committed
663

664
665
#define RBTDB_ATTR_LOADED               0x01
#define RBTDB_ATTR_LOADING              0x02
666

667
/*%
668
669
670
 * Search Context
 */
typedef struct {
Automatic Updater's avatar
Automatic Updater committed
671
672
673
674
675
676
677
678
679
680
681
682
683
	dns_rbtdb_t *           rbtdb;
	rbtdb_version_t *       rbtversion;
	rbtdb_serial_t          serial;
	unsigned int            options;
	dns_rbtnodechain_t      chain;
	isc_boolean_t           copy_name;
	isc_boolean_t           need_cleanup;
	isc_boolean_t           wild;
	dns_rbtnode_t *         zonecut;
	rdatasetheader_t *      zonecut_rdataset;
	rdatasetheader_t *      zonecut_sigrdataset;
	dns_fixedname_t         zonecut_name;
	isc_stdtime_t           now;
684
685
} rbtdb_search_t;

686
/*%
687
688
689
 * Load Context
 */
typedef struct {
Automatic Updater's avatar
Automatic Updater committed
690
691
	dns_rbtdb_t *           rbtdb;
	isc_stdtime_t           now;
692
693
} rbtdb_load_t;

694
static void delete_callback(void *data, void *arg);
Bob Halley's avatar
Bob Halley committed
695
static void rdataset_disassociate(dns_rdataset_t *rdataset);
696
697
static isc_result_t rdataset_first(dns_rdataset_t *rdataset);
static isc_result_t rdataset_next(dns_rdataset_t *rdataset);
Bob Halley's avatar
Bob Halley committed
698
static void rdataset_current(dns_rdataset_t *rdataset, dns_rdata_t *rdata);
Bob Halley's avatar
Bob Halley committed
699
static void rdataset_clone(dns_rdataset_t *source, dns_rdataset_t *target);
Bob Halley's avatar
Bob Halley committed
700
static unsigned int rdataset_count(dns_rdataset_t *rdataset);
701
static isc_result_t rdataset_getnoqname(dns_rdataset_t *rdataset,
Automatic Updater's avatar
Automatic Updater committed
702
					dns_name_t *name,
703
704
705
706
707
708
					dns_rdataset_t *neg,
					dns_rdataset_t *negsig);
static isc_result_t rdataset_getclosest(dns_rdataset_t *rdataset,
					dns_name_t *name,
					dns_rdataset_t *neg,
					dns_rdataset_t *negsig);
709
static isc_result_t rdataset_getadditional(dns_rdataset_t *rdataset,
Automatic Updater's avatar
Automatic Updater committed
710
711
712
713
714
715
716
717
718
719
					   dns_rdatasetadditional_t type,
					   dns_rdatatype_t qtype,
					   dns_acache_t *acache,
					   dns_zone_t **zonep,
					   dns_db_t **dbp,
					   dns_dbversion_t **versionp,
					   dns_dbnode_t **nodep,
					   dns_name_t *fname,
					   dns_message_t *msg,
					   isc_stdtime_t now);
720
static isc_result_t rdataset_setadditional(dns_rdataset_t *rdataset,
Automatic Updater's avatar
Automatic Updater committed
721
722
723
724
725
726
727
728
					   dns_rdatasetadditional_t type,
					   dns_rdatatype_t qtype,
					   dns_acache_t *acache,
					   dns_zone_t *zone,
					   dns_db_t *db,
					   dns_dbversion_t *version,
					   dns_dbnode_t *node,
					   dns_name_t *fname);
729
static isc_result_t rdataset_putadditional(dns_acache_t *acache,
Automatic Updater's avatar
Automatic Updater committed
730
731
732
					   dns_rdataset_t *rdataset,
					   dns_rdatasetadditional_t type,
					   dns_rdatatype_t qtype);
733
static inline isc_boolean_t need_headerupdate(rdatasetheader_t *header,
Automatic Updater's avatar
Automatic Updater committed
734
					      isc_stdtime_t now);
735
static void update_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header,
Automatic Updater's avatar
Automatic Updater committed
736
			  isc_stdtime_t now);
737
static void expire_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header,
738
			  isc_boolean_t tree_locked, expire_t reason);
739
740
static void overmem_purge(dns_rbtdb_t *rbtdb, unsigned int locknum_start,
			  isc_stdtime_t now, isc_boolean_t tree_locked);
741
742
static isc_result_t resign_insert(dns_rbtdb_t *rbtdb, int idx,
				  rdatasetheader_t *newheader);
743
744
static void resign_delete(dns_rbtdb_t *rbtdb, rbtdb_version_t *version,
			  rdatasetheader_t *header);
745
static void prune_tree(isc_task_t *task, isc_event_t *event);
746
747
static void rdataset_settrust(dns_rdataset_t *rdataset, dns_trust_t trust);
static void rdataset_expire(dns_rdataset_t *rdataset);
748
static void rdataset_clearprefetch(dns_rdataset_t *rdataset);
749
750

static dns_rdatasetmethods_t rdataset_methods = {
Automatic Updater's avatar
Automatic Updater committed
751
752
753
754
755
756
757
758
	rdataset_disassociate,
	rdataset_first,
	rdataset_next,
	rdataset_current,
	rdataset_clone,
	rdataset_count,
	NULL,
	rdataset_getnoqname,
759
760
	NULL,
	rdataset_getclosest,
Automatic Updater's avatar
Automatic Updater committed
761
762
	rdataset_getadditional,
	rdataset_setadditional,
763
764
	rdataset_putadditional,
	rdataset_settrust,
765
766
	rdataset_expire,
	rdataset_clearprefetch
Bob Halley's avatar
Bob Halley committed
767
768
769
};

static void rdatasetiter_destroy(dns_rdatasetiter_t **iteratorp);
770
771
static isc_result_t rdatasetiter_first(dns_rdatasetiter_t *iterator);
static isc_result_t rdatasetiter_next(dns_rdatasetiter_t *iterator);
Bob Halley's avatar
Bob Halley committed
772
static void rdatasetiter_current(dns_rdatasetiter_t *iterator,
Automatic Updater's avatar
Automatic Updater committed
773
				 dns_rdataset_t *rdataset);
Bob Halley's avatar
Bob Halley committed
774
775

static dns_rdatasetitermethods_t rdatasetiter_methods = {
Automatic Updater's avatar
Automatic Updater committed
776
777
778
779
	rdatasetiter_destroy,
	rdatasetiter_first,
	rdatasetiter_next,
	rdatasetiter_current
780
781
};

Bob Halley's avatar
Bob Halley committed
782
typedef struct rbtdb_rdatasetiter {
Automatic Updater's avatar
Automatic Updater committed
783
784
	dns_rdatasetiter_t              common;
	rdatasetheader_t *              current;
Bob Halley's avatar
Bob Halley committed
785
786
} rbtdb_rdatasetiter_t;

787
788
789
790
static void             dbiterator_destroy(dns_dbiterator_t **iteratorp);
static isc_result_t     dbiterator_first(dns_dbiterator_t *iterator);
static isc_result_t     dbiterator_last(dns_dbiterator_t *iterator);
static isc_result_t     dbiterator_seek(dns_dbiterator_t *iterator,
Automatic Updater's avatar
Automatic Updater committed
791
					dns_name_t *name);
792
793
794
static isc_result_t     dbiterator_prev(dns_dbiterator_t *iterator);
static isc_result_t     dbiterator_next(dns_dbiterator_t *iterator);
static isc_result_t     dbiterator_current(dns_dbiterator_t *iterator,
Automatic Updater's avatar
Automatic Updater committed
795
796
					   dns_dbnode_t **nodep,
					   dns_name_t *name);
797
798
static isc_result_t     dbiterator_pause(dns_dbiterator_t *iterator);
static isc_result_t     dbiterator_origin(dns_dbiterator_t *iterator,
Automatic Updater's avatar
Automatic Updater committed
799
					  dns_name_t *name);
800
801

static dns_dbiteratormethods_t dbiterator_methods = {
Automatic Updater's avatar
Automatic Updater committed
802
803
804
805
806
807
808
809
810
	dbiterator_destroy,
	dbiterator_first,
	dbiterator_last,
	dbiterator_seek,
	dbiterator_prev,
	dbiterator_next,
	dbiterator_current,
	dbiterator_pause,
	dbiterator_origin
811
812
};

813
814
#define DELETION_BATCH_MAX 64

815
/*
816
 * If 'paused' is ISC_TRUE, then the tree lock is not being held.
817
 */
818
typedef struct rbtdb_dbiterator {
Automatic Updater's avatar
Automatic Updater committed
819
820
821
822
823
824
825
826
	dns_dbiterator_t                common;
	isc_boolean_t                   paused;
	isc_boolean_t                   new_origin;
	isc_rwlocktype_t                tree_locked;
	isc_result_t                    result;
	dns_fixedname_t                 name;
	dns_fixedname_t                 origin;
	dns_rbtnodechain_t              chain;
827
828
	dns_rbtnodechain_t		nsec3chain;
	dns_rbtnodechain_t		*current;
Automatic Updater's avatar
Automatic Updater committed
829
830
831
	dns_rbtnode_t                   *node;
	dns_rbtnode_t                   *deletions[DELETION_BATCH_MAX];
	int                             delete;
832
833
	isc_boolean_t			nsec3only;
	isc_boolean_t			nonsec3;
834
835
836
} rbtdb_dbiterator_t;


837
838
#define IS_STUB(rbtdb)  (((rbtdb)->common.attributes & DNS_DBATTR_STUB)  != 0)
#define IS_CACHE(rbtdb) (((rbtdb)->common.attributes & DNS_DBATTR_CACHE) != 0)
839

840
static void free_rbtdb(dns_rbtdb_t *rbtdb, isc_boolean_t log,
Automatic Updater's avatar
Automatic Updater committed
841
		       isc_event_t *event);
842
static void overmem(dns_db_t *db, isc_boolean_t overmem);
843
static void setnsec3parameters(dns_db_t *db, rbtdb_version_t *version);
844

845
846
847
/* Pad to 32 bytes */
static char FILE_VERSION[32] = "\0";

848
849
850
/*%
 * 'init_count' is used to initialize 'newheader->count' which inturn
 * is used to determine where in the cycle rrset-order cyclic starts.
Francis Dupont's avatar
Francis Dupont committed
851
 * We don't lock this as we don't care about simultaneous updates.
852
853
 *
 * Note:
854
 *      Both init_count and header->count can be ISC_UINT32_MAX.
855
 *      The count on the returned rdataset however can't be as
856
857
 *      that indicates that the database does not implement cyclic
 *      processing.
858
859
860
 */
static unsigned int init_count;

861
862
863
864
865
866
/*
 * Locking
 *
 * If a routine is going to lock more than one lock in this module, then
 * the locking must be done in the following order:
 *
867
 *      Tree Lock
868
 *
869
870
 *      Node Lock       (Only one from the set may be locked at one time by
 *                       any caller)
871
 *
872
 *      Database Lock
873
874
875
876
 *
 * Failure to follow this hierarchy can result in deadlock.
 */

877
878
879
/*
 * Deleting Nodes
 *
880
 * For zone databases the node for the origin of the zone MUST NOT be deleted.
881
882
 */

Evan Hunt's avatar
Evan Hunt committed
883
884
885
886
887
888
/*
 * Debugging routines
 */
#ifdef DEBUG
static void
hexdump(const char *desc, unsigned char *data, size_t size) {
889
	char hexdump[BUFSIZ * 2 + 1];
Evan Hunt's avatar
Evan Hunt committed
890
891
	isc_buffer_t b;
	isc_region_t r;
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
	isc_result_t result;
	size_t bytes;

	fprintf(stderr, "%s: ", desc);
	do {
		isc_buffer_init(&b, hexdump, sizeof(hexdump));
		r.base = data;
		r.length = bytes = (size > BUFSIZ) ? BUFSIZ : size;
		result = isc_hex_totext(&r, 0, "", &b);
		RUNTIME_CHECK(result == ISC_R_SUCCESS);
		isc_buffer_putuint8(&b, 0);
		fprintf(stderr, "%s", hexdump);
		data += bytes;
		size -= bytes;
	} while (size > 0);
	fprintf(stderr, "\n");
Evan Hunt's avatar
Evan Hunt committed
908
909
910
}
#endif

911

912
913
914
915
/*
 * DB Routines
 */

Bob Halley's avatar
Bob Halley committed
916
917
static void
attach(dns_db_t *source, dns_db_t **targetp) {
Automatic Updater's avatar
Automatic Updater committed
918
	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)source;
Bob Halley's avatar
Bob Halley committed
919

Automatic Updater's avatar
Automatic Updater committed
920
	REQUIRE(VALID_RBTDB(rbtdb));
Bob Halley's avatar
Bob Halley committed
921

Automatic Updater's avatar
Automatic Updater committed
922
	isc_refcount_increment(&rbtdb->references, NULL);
Bob Halley's avatar
Bob Halley committed
923

Automatic Updater's avatar
Automatic Updater committed
924
	*targetp = source;
Bob Halley's avatar
Bob Halley committed
925
926
927
}

static void
928
free_rbtdb_callback(isc_task_t *task, isc_event_t *event) {
Automatic Updater's avatar
Automatic Updater committed
929
	dns_rbtdb_t *rbtdb = event->ev_arg;
930

Automatic Updater's avatar
Automatic Updater committed
931
	UNUSED(task);
932

Automatic Updater's avatar
Automatic Updater committed
933
	free_rbtdb(rbtdb, ISC_TRUE, event);
934
935
}

936
937
938
static void
update_cachestats(dns_rbtdb_t *rbtdb, isc_result_t result) {
	INSIST(IS_CACHE(rbtdb));
939
940
941

	if (rbtdb->cachestats == NULL)
		return;
942
943
944
945
946
947
948
949
950
951
952
953

	switch (result) {
	case ISC_R_SUCCESS:
	case DNS_R_CNAME:
	case DNS_R_DNAME:
	case DNS_R_DELEGATION:
	case DNS_R_NCACHENXDOMAIN:
	case DNS_R_NCACHENXRRSET:
		isc_stats_increment(rbtdb->cachestats,
				    dns_cachestatscounter_hits);
		break;
	default:
Tinderbox User's avatar
Tinderbox User committed
954
		isc_stats_increment(rbtdb->cachestats,
955
956
957
958
				    dns_cachestatscounter_misses);
	}
}

959
960
961
962
963
964
965
966
967
968
969
static void
update_rrsetstats(dns_rbtdb_t *rbtdb, rdatasetheader_t *header,
		  isc_boolean_t increment)
{
	dns_rdatastatstype_t statattributes = 0;
	dns_rdatastatstype_t base = 0;
	dns_rdatastatstype_t type;

	/* At the moment we count statistics only for cache DB */
	INSIST(IS_CACHE(rbtdb));

970
971
972
973
974
975
976
	if (NEGATIVE(header)) {
		if (NXDOMAIN(header))
			statattributes = DNS_RDATASTATSTYPE_ATTR_NXDOMAIN;
		else {
			statattributes = DNS_RDATASTATSTYPE_ATTR_NXRRSET;
			base = RBTDB_RDATATYPE_EXT(header->type);
		}
977
978
979
980
981
982
983
984
985
986
	} else
		base = RBTDB_RDATATYPE_BASE(header->type);

	type = DNS_RDATASTATSTYPE_VALUE(base, statattributes);
	if (increment)
		dns_rdatasetstats_increment(rbtdb->rrsetstats, type);
	else
		dns_rdatasetstats_decrement(rbtdb->rrsetstats, type);
}

987
988
static void
set_ttl(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, dns_ttl_t newttl) {
Automatic Updater's avatar
Automatic Updater committed
989
990
991
992
993
994
995
	int idx;
	isc_heap_t *heap;
	dns_ttl_t oldttl;

	oldttl = header->rdh_ttl;
	header->rdh_ttl = newttl;

996
997
998
	if (!IS_CACHE(rbtdb))
		return;

Automatic Updater's avatar
Automatic Updater committed
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
	/*
	 * It's possible the rbtdb is not a cache.  If this is the case,
	 * we will not have a heap, and we move on.  If we do, though,
	 * we might need to adjust things.
	 */
	if (header->heap_index == 0 || newttl == oldttl)
		return;
	idx = header->node->locknum;
	if (rbtdb->heaps == NULL || rbtdb->heaps[idx] == NULL)
	    return;
	heap = rbtdb->heaps[idx];

	if (newttl < oldttl)
		isc_heap_increased(heap, header->heap_index);
	else
		isc_heap_decreased(heap, header->heap_index);
1015
1016
1017
}

/*%
1018
 * These functions allow the heap code to rank the priority of each
1019
1020
1021
1022
 * element.  It returns ISC_TRUE if v1 happens "sooner" than v2.
 */
static isc_boolean_t
ttl_sooner(void *v1, void *v2) {
Automatic Updater's avatar
Automatic Updater committed
1023
1024
	rdatasetheader_t *h1 = v1;
	rdatasetheader_t *h2 = v2;
1025

Automatic Updater's avatar
Automatic Updater committed
1026
1027
1028
	if (h1->rdh_ttl < h2->rdh_ttl)
		return (ISC_TRUE);
	return (ISC_FALSE);
1029
1030
}

1031
1032
static isc_boolean_t
resign_sooner(void *v1, void *v2) {
Automatic Updater's avatar
Automatic Updater committed
1033
1034
	rdatasetheader_t *h1 = v1;
	rdatasetheader_t *h2 = v2;
1035

1036
	if (isc_serial_lt(h1->resign, h2->resign))
Automatic Updater's avatar
Automatic Updater committed
1037
1038
		return (ISC_TRUE);
	return (ISC_FALSE);
1039
1040
}

1041
1042
1043
1044
/*%
 * This function sets the heap index into the header.
 */
static void
1045
set_index(void *what, unsigned int index) {
Automatic Updater's avatar
Automatic Updater committed
1046
	rdatasetheader_t *h = what;
1047

Automatic Updater's avatar
Automatic Updater committed
1048
	h->heap_index = index;
1049
1050
}

1051
1052
1053
1054
1055
1056
/*%
 * Work out how many nodes can be deleted in the time between two
 * requests to the nameserver.  Smooth the resulting number and use it
 * as a estimate for the number of nodes to be deleted in the next
 * iteration.
 */
1057
1058
static unsigned int
adjust_quantum(unsigned int old, isc_time_t *start) {
Automatic Updater's avatar
Automatic Updater committed
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
	unsigned int pps = dns_pps;     /* packets per second */
	unsigned int interval;
	isc_uint64_t usecs;
	isc_time_t end;
	unsigned int new;

	if (pps < 100)
		pps = 100;
	isc_time_now(&end);

	interval = 1000000 / pps;       /* interval in usec */
	if (interval == 0)
		interval = 1;
	usecs = isc_time_microdiff(&end, start);
	if (usecs == 0) {
		/*
		 * We were unable to measure the amount of time taken.
		 * Double the nodes deleted next time.
		 */
		old *= 2;
		if (old > 1000)
			old = 1000;
		return (old);
	}
	new = old * interval;
	new /= (unsigned int)usecs;
	if (new == 0)
		new = 1;
	else if (new > 1000)
		new = 1000;

	/* Smooth */
	new = (new + old * 3) / 4;

	isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE, DNS_LOGMODULE_CACHE,
		      ISC_LOG_DEBUG(1), "adjust_quantum -> %d", new);

	return (new);
1097
}
1098

1099
1100
static void
free_rbtdb(dns_rbtdb_t *rbtdb, isc_boolean_t log, isc_event_t *event) {
Automatic Updater's avatar
Automatic Updater committed
1101
1102
1103
1104
	unsigned int i;
	isc_ondestroy_t ondest;
	isc_result_t result;
	char buf[DNS_NAME_FORMATSIZE];
1105
	dns_rbt_t **treep;
Automatic Updater's avatar
Automatic Updater committed
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
	isc_time_t start;

	if (IS_CACHE(rbtdb) && rbtdb->common.rdclass == dns_rdataclass_in)
		overmem((dns_db_t *)rbtdb, (isc_boolean_t)-1);

	REQUIRE(rbtdb->current_version != NULL || EMPTY(rbtdb->open_versions));
	REQUIRE(rbtdb->future_version == NULL);

	if (rbtdb->current_version != NULL) {
		unsigned int refs;

		isc_refcount_decrement(&rbtdb->current_version->references,
				       &refs);
		INSIST(refs == 0);
		UNLINK(rbtdb->open_versions, rbtdb->current_version, link);
		isc_refcount_destroy(&rbtdb->current_version->references);
		isc_mem_put(rbtdb->common.mctx, rbtdb->current_version,
			    sizeof(rbtdb_version_t));
	}

1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
	/*
	 * We assume the number of remaining dead nodes is reasonably small;
	 * the overhead of unlinking all nodes here should be negligible.
	 */
	for (i = 0; i < rbtdb->node_lock_count; i++) {
		dns_rbtnode_t *node;

		node = ISC_LIST_HEAD(rbtdb->deadnodes[i]);
		while (node != NULL) {
			ISC_LIST_UNLINK(rbtdb->deadnodes[i], node, deadlink);
Automatic Updater's avatar
Automatic Updater committed
1136
1137
1138
			node = ISC_LIST_HEAD(rbtdb->deadnodes[i]);
		}
	}
1139

Automatic Updater's avatar
Automatic Updater committed
1140
1141
	if (event == NULL)
		rbtdb->quantum = (rbtdb->task != NULL) ? 100 : 0;
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157

	for (;;) {
		/*
		 * pick the next tree to (start to) destroy
		 */
		treep = &rbtdb->tree;
		if (*treep == NULL) {
			treep = &rbtdb->nsec;
			if (*treep == NULL) {
				treep = &rbtdb->nsec3;
				/*
				 * we're finished after clear cutting
				 */
				if (*treep == NULL)
					break;
			}
Automatic Updater's avatar
Automatic Updater committed
1158
		}
1159
1160

		isc_time_now(&start);
1161
		result = dns_rbt_destroy2(treep, rbtdb->quantum);
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
		if (result == ISC_R_QUOTA) {
			INSIST(rbtdb->task != NULL);
			if (rbtdb->quantum != 0)
				rbtdb->quantum = adjust_quantum(rbtdb->quantum,
								&start);
			if (event == NULL)
				event = isc_event_allocate(rbtdb->common.mctx,
							   NULL,
							 DNS_EVENT_FREESTORAGE,
							   free_rbtdb_callback,
							   rbtdb,
							   sizeof(isc_event_t));
			if (event == NULL)
1175
				continue;
1176
1177
1178
			isc_task_send(rbtdb->task, &event);
			return;
		}
1179
		INSIST(result == ISC_R_SUCCESS && *treep == NULL);
1180
1181
	}

Automatic Updater's avatar
Automatic Updater committed
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
	if (event != NULL)
		isc_event_free(&event);
	if (log) {
		if (dns_name_dynamic(&rbtdb->common.origin))
			dns_name_format(&rbtdb->common.origin, buf,
					sizeof(buf));
		else
			strcpy(buf, "<UNKNOWN>");
		isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
			      DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
			      "done free_rbtdb(%s)", buf);
	}
	if (dns_name_dynamic(&rbtdb->common.origin))
		dns_name_free(&rbtdb->common.origin, rbtdb->common.mctx);
	for (i = 0; i < rbtdb->node_lock_count; i++) {
		isc_refcount_destroy(&rbtdb->node_locks[i].references);
		NODE_DESTROYLOCK(&rbtdb->node_locks[i].lock);
	}

	/*
Automatic Updater's avatar
Automatic Updater committed
1202
	 * Clean up LRU / re-signing order lists.
Automatic Updater's avatar
Automatic Updater committed
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
	 */
	if (rbtdb->rdatasets != NULL) {
		for (i = 0; i < rbtdb->node_lock_count; i++)
			INSIST(ISC_LIST_EMPTY(rbtdb->rdatasets[i]));
		isc_mem_put(rbtdb->common.mctx, rbtdb->rdatasets,
			    rbtdb->node_lock_count *
			    sizeof(rdatasetheaderlist_t));
	}
	/*
	 * Clean up dead node buckets.
	 */
	if (rbtdb->deadnodes != NULL) {
		for (i = 0; i < rbtdb->node_lock_count; i++)
			INSIST(ISC_LIST_EMPTY(rbtdb->deadnodes[i]));
		isc_mem_put(rbtdb->common.mctx, rbtdb->deadnodes,
		    rbtdb->node_lock_count * sizeof(rbtnodelist_t));
	}
	/*
Automatic Updater's avatar
Automatic Updater committed
1221
	 * Clean up heap objects.
Automatic Updater's avatar
Automatic Updater committed
1222
1223
1224
1225
	 */
	if (rbtdb->heaps != NULL) {
		for (i = 0; i < rbtdb->node_lock_count; i++)
			isc_heap_destroy(&rbtdb->heaps[i]);
1226
1227
		isc_mem_put(rbtdb->hmctx, rbtdb->heaps,
			    rbtdb->node_lock_count * sizeof(isc_heap_t *));
Automatic Updater's avatar
Automatic Updater committed
1228
1229
	}