rbtdb.c 213 KB
Newer Older
Bob Halley's avatar
Bob Halley committed
1
/*
Automatic Updater's avatar
Automatic Updater committed
2
 * Copyright (C) 2004-2008  Internet Systems Consortium, Inc. ("ISC")
Mark Andrews's avatar
Mark Andrews committed
3
 * Copyright (C) 1999-2003  Internet Software Consortium.
4
 *
Automatic Updater's avatar
Automatic Updater committed
5
 * Permission to use, copy, modify, and/or distribute this software for any
Bob Halley's avatar
Bob Halley committed
6
7
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
8
 *
Mark Andrews's avatar
Mark Andrews committed
9
10
11
12
13
14
15
 * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
 * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
 * PERFORMANCE OF THIS SOFTWARE.
Bob Halley's avatar
Bob Halley committed
16
17
 */

18
/* $Id: rbtdb.c,v 1.256 2008/04/03 04:00:38 marka Exp $ */
19
20

/*! \file */
David Lawrence's avatar
David Lawrence committed
21

22
23
24
25
/*
 * Principal Author: Bob Halley
 */

26
27
#include <config.h>

28
29
#define inline

30
#include <isc/event.h>
31
#include <isc/heap.h>
32
#include <isc/mem.h>
33
#include <isc/mutex.h>
34
#include <isc/platform.h>
35
#include <isc/print.h>
36
#include <isc/random.h>
37
#include <isc/refcount.h>
Bob Halley's avatar
Bob Halley committed
38
#include <isc/rwlock.h>
39
#include <isc/serial.h>
40
#include <isc/string.h>
41
#include <isc/task.h>
42
#include <isc/time.h>
Michael Graff's avatar
Michael Graff committed
43
#include <isc/util.h>
Bob Halley's avatar
Bob Halley committed
44

45
#include <dns/acache.h>
46
47
#include <dns/db.h>
#include <dns/dbiterator.h>
48
#include <dns/events.h>
49
#include <dns/fixedname.h>
50
#include <dns/lib.h>
51
#include <dns/log.h>
52
#include <dns/masterdump.h>
53
#include <dns/nsec.h>
54
#include <dns/rbt.h>
55
#include <dns/rdata.h>
Bob Halley's avatar
Bob Halley committed
56
57
#include <dns/rdataset.h>
#include <dns/rdatasetiter.h>
58
59
#include <dns/rdataslab.h>
#include <dns/result.h>
60
61
#include <dns/view.h>
#include <dns/zone.h>
62
#include <dns/zonekey.h>
Bob Halley's avatar
Bob Halley committed
63

64
65
66
#ifdef DNS_RBTDB_VERSION64
#include "rbtdb64.h"
#else
Bob Halley's avatar
Bob Halley committed
67
#include "rbtdb.h"
68
#endif
Bob Halley's avatar
Bob Halley committed
69

70
#ifdef DNS_RBTDB_VERSION64
71
#define RBTDB_MAGIC                     ISC_MAGIC('R', 'B', 'D', '8')
72
#else
73
#define RBTDB_MAGIC                     ISC_MAGIC('R', 'B', 'D', '4')
74
75
#endif

76
/*%
77
78
79
 * Note that "impmagic" is not the first four bytes of the struct, so
 * ISC_MAGIC_VALID cannot be used.
 */
80
#define VALID_RBTDB(rbtdb)      ((rbtdb) != NULL && \
Automatic Updater's avatar
Automatic Updater committed
81
				 (rbtdb)->common.impmagic == RBTDB_MAGIC)
Bob Halley's avatar
Bob Halley committed
82

83
#ifdef DNS_RBTDB_VERSION64
84
typedef isc_uint64_t                    rbtdb_serial_t;
85
/*%
86
87
88
89
90
91
 * Make casting easier in symbolic debuggers by using different names
 * for the 64 bit version.
 */
#define dns_rbtdb_t dns_rbtdb64_t
#define rdatasetheader_t rdatasetheader64_t
#define rbtdb_version_t rbtdb_version64_t
92
#else
93
typedef isc_uint32_t                    rbtdb_serial_t;
94
95
#endif

96
typedef isc_uint32_t                    rbtdb_rdatatype_t;
Bob Halley's avatar
Bob Halley committed
97

98
99
100
#define RBTDB_RDATATYPE_BASE(type)      ((dns_rdatatype_t)((type) & 0xFFFF))
#define RBTDB_RDATATYPE_EXT(type)       ((dns_rdatatype_t)((type) >> 16))
#define RBTDB_RDATATYPE_VALUE(b, e)     (((e) << 16) | (b))
Bob Halley's avatar
Bob Halley committed
101

102
#define RBTDB_RDATATYPE_SIGNSEC \
Automatic Updater's avatar
Automatic Updater committed
103
		RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_nsec)
Bob Halley's avatar
Bob Halley committed
104
#define RBTDB_RDATATYPE_SIGNS \
Automatic Updater's avatar
Automatic Updater committed
105
		RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_ns)
Bob Halley's avatar
Bob Halley committed
106
#define RBTDB_RDATATYPE_SIGCNAME \
Automatic Updater's avatar
Automatic Updater committed
107
		RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_cname)
108
#define RBTDB_RDATATYPE_SIGDNAME \
Automatic Updater's avatar
Automatic Updater committed
109
		RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_dname)
110
#define RBTDB_RDATATYPE_NCACHEANY \
Automatic Updater's avatar
Automatic Updater committed
111
		RBTDB_RDATATYPE_VALUE(0, dns_rdatatype_any)
Bob Halley's avatar
Bob Halley committed
112

113
/*
114
 * We use rwlock for DB lock only when ISC_RWLOCK_USEATOMIC is non 0.
115
 * Using rwlock is effective with regard to lookup performance only when
116
 * it is implemented in an efficient way.
117
118
119
120
 * Otherwise, it is generally wise to stick to the simple locking since rwlock
 * would require more memory or can even make lookups slower due to its own
 * overhead (when it internally calls mutex locks).
 */
121
#ifdef ISC_RWLOCK_USEATOMIC
122
123
124
125
126
127
#define DNS_RBTDB_USERWLOCK 1
#else
#define DNS_RBTDB_USERWLOCK 0
#endif

#if DNS_RBTDB_USERWLOCK
128
129
130
131
#define RBTDB_INITLOCK(l)       isc_rwlock_init((l), 0, 0)
#define RBTDB_DESTROYLOCK(l)    isc_rwlock_destroy(l)
#define RBTDB_LOCK(l, t)        RWLOCK((l), (t))
#define RBTDB_UNLOCK(l, t)      RWUNLOCK((l), (t))
132
#else
133
134
135
136
#define RBTDB_INITLOCK(l)       isc_mutex_init(l)
#define RBTDB_DESTROYLOCK(l)    DESTROYLOCK(l)
#define RBTDB_LOCK(l, t)        LOCK(l)
#define RBTDB_UNLOCK(l, t)      UNLOCK(l)
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#endif

/*
 * Since node locking is sensitive to both performance and memory footprint,
 * we need some trick here.  If we have both high-performance rwlock and
 * high performance and small-memory reference counters, we use rwlock for
 * node lock and isc_refcount for node references.  In this case, we don't have
 * to protect the access to the counters by locks.
 * Otherwise, we simply use ordinary mutex lock for node locking, and use
 * simple integers as reference counters which is protected by the lock.
 * In most cases, we can simply use wrapper macros such as NODE_LOCK and
 * NODE_UNLOCK.  In some other cases, however, we need to protect reference
 * counters first and then protect other parts of a node as read-only data.
 * Special additional macros, NODE_STRONGLOCK(), NODE_WEAKLOCK(), etc, are also
 * provided for these special cases.  When we can use the efficient backend
 * routines, we should only protect the "other members" by NODE_WEAKLOCK(read).
 * Otherwise, we should use NODE_STRONGLOCK() to protect the entire critical
 * section including the access to the reference counter.
 * Note that we cannot use NODE_LOCK()/NODE_UNLOCK() wherever the protected
 * section is also protected by NODE_STRONGLOCK().
 */
#if defined(ISC_RWLOCK_USEATOMIC) && defined(DNS_RBT_USEISCREFCOUNT)
typedef isc_rwlock_t nodelock_t;

161
162
163
164
165
166
167
168
169
170
171
#define NODE_INITLOCK(l)        isc_rwlock_init((l), 0, 0)
#define NODE_DESTROYLOCK(l)     isc_rwlock_destroy(l)
#define NODE_LOCK(l, t)         RWLOCK((l), (t))
#define NODE_UNLOCK(l, t)       RWUNLOCK((l), (t))
#define NODE_TRYUPGRADE(l)      isc_rwlock_tryupgrade(l)

#define NODE_STRONGLOCK(l)      ((void)0)
#define NODE_STRONGUNLOCK(l)    ((void)0)
#define NODE_WEAKLOCK(l, t)     NODE_LOCK(l, t)
#define NODE_WEAKUNLOCK(l, t)   NODE_UNLOCK(l, t)
#define NODE_WEAKDOWNGRADE(l)   isc_rwlock_downgrade(l)
172
173
174
#else
typedef isc_mutex_t nodelock_t;

175
176
177
178
179
180
181
182
183
184
185
#define NODE_INITLOCK(l)        isc_mutex_init(l)
#define NODE_DESTROYLOCK(l)     DESTROYLOCK(l)
#define NODE_LOCK(l, t)         LOCK(l)
#define NODE_UNLOCK(l, t)       UNLOCK(l)
#define NODE_TRYUPGRADE(l)      ISC_R_SUCCESS

#define NODE_STRONGLOCK(l)      LOCK(l)
#define NODE_STRONGUNLOCK(l)    UNLOCK(l)
#define NODE_WEAKLOCK(l, t)     ((void)0)
#define NODE_WEAKUNLOCK(l, t)   ((void)0)
#define NODE_WEAKDOWNGRADE(l)   ((void)0)
186
187
#endif

188
189
190
191
#ifndef DNS_RDATASET_FIXED
#define DNS_RDATASET_FIXED 1
#endif

192
/*
193
 * Allow clients with a virtual time of up to 5 minutes in the past to see
194
195
196
197
 * records that would have otherwise have expired.
 */
#define RBTDB_VIRTUAL 300

198
struct noqname {
Automatic Updater's avatar
Automatic Updater committed
199
200
201
	dns_name_t name;
	void *     nsec;
	void *     nsecsig;
202
203
};

204
typedef struct acachectl acachectl_t;
205

Bob Halley's avatar
Bob Halley committed
206
typedef struct rdatasetheader {
Automatic Updater's avatar
Automatic Updater committed
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
	/*%
	 * Locked by the owning node's lock.
	 */
	rbtdb_serial_t                  serial;
	dns_ttl_t                       rdh_ttl;
	rbtdb_rdatatype_t               type;
	isc_uint16_t                    attributes;
	dns_trust_t                     trust;
	struct noqname                  *noqname;
	/*%<
	 * We don't use the LIST macros, because the LIST structure has
	 * both head and tail pointers, and is doubly linked.
	 */

	struct rdatasetheader           *next;
	/*%<
	 * If this is the top header for an rdataset, 'next' points
	 * to the top header for the next rdataset (i.e., the next type).
	 * Otherwise, it points up to the header whose down pointer points
	 * at this header.
	 */

	struct rdatasetheader           *down;
	/*%<
	 * Points to the header for the next older version of
	 * this rdataset.
	 */

	isc_uint32_t                    count;
	/*%<
	 * Monotonously increased every time this rdataset is bound so that
	 * it is used as the base of the starting point in DNS responses
	 * when the "cyclic" rrset-order is required.  Since the ordering
	 * should not be so crucial, no lock is set for the counter for
	 * performance reasons.
	 */

	acachectl_t                     *additional_auth;
	acachectl_t                     *additional_glue;

	dns_rbtnode_t                   *node;
	isc_stdtime_t                   last_used;
	ISC_LINK(struct rdatasetheader) lru_link;
	/*%<
	 * Used for LRU-based cache management.  We should probably make
	 * these cache-DB specific.  We might also make it a pointer and
	 * ensure only the top header has a valid link to save memory.
	 * The linked-list is locked by the rbtdb->lrulock.
	 */

	/*
	 * It's possible this should not be here anymore, but instead
	 * referenced from the bucket's heap directly.
	 */
261
#if 0
Automatic Updater's avatar
Automatic Updater committed
262
	isc_heap_t                      *heap;
263
#endif
Automatic Updater's avatar
Automatic Updater committed
264
265
266
267
	unsigned int                    heap_index;
	/*%<
	 * Used for TTL-based cache cleaning.
	 */
Automatic Updater's avatar
Automatic Updater committed
268
	isc_stdtime_t                   resign;
Bob Halley's avatar
Bob Halley committed
269
270
} rdatasetheader_t;

271
272
273
274
275
276
277
278
typedef ISC_LIST(rdatasetheader_t)      rdatasetheaderlist_t;
typedef ISC_LIST(dns_rbtnode_t)         rbtnodelist_t;

#define RDATASET_ATTR_NONEXISTENT       0x0001
#define RDATASET_ATTR_STALE             0x0002
#define RDATASET_ATTR_IGNORE            0x0004
#define RDATASET_ATTR_RETAIN            0x0008
#define RDATASET_ATTR_NXDOMAIN          0x0010
279
#define RDATASET_ATTR_RESIGN            0x0020
280
281
#define RDATASET_ATTR_CACHE             0x1000 /* for debug */
#define RDATASET_ATTR_CANCELED          0x2000 /* for debug */
Michael Graff's avatar
Michael Graff committed
282

283
typedef struct acache_cbarg {
Automatic Updater's avatar
Automatic Updater committed
284
285
286
287
288
	dns_rdatasetadditional_t        type;
	unsigned int                    count;
	dns_db_t                        *db;
	dns_dbnode_t                    *node;
	rdatasetheader_t                *header;
289
290
291
} acache_cbarg_t;

struct acachectl {
Automatic Updater's avatar
Automatic Updater committed
292
293
	dns_acacheentry_t               *entry;
	acache_cbarg_t                  *cbarg;
294
295
};

Michael Graff's avatar
Michael Graff committed
296
297
298
299
300
301
302
/*
 * XXX
 * When the cache will pre-expire data (due to memory low or other
 * situations) before the rdataset's TTL has expired, it MUST
 * respect the RETAIN bit and not expire the data until its TTL is
 * expired.
 */
303

304
#undef IGNORE                   /* WIN32 winbase.h defines this. */
305

306
#define EXISTS(header) \
Automatic Updater's avatar
Automatic Updater committed
307
	(((header)->attributes & RDATASET_ATTR_NONEXISTENT) == 0)
308
#define NONEXISTENT(header) \
Automatic Updater's avatar
Automatic Updater committed
309
	(((header)->attributes & RDATASET_ATTR_NONEXISTENT) != 0)
310
#define IGNORE(header) \
Automatic Updater's avatar
Automatic Updater committed
311
	(((header)->attributes & RDATASET_ATTR_IGNORE) != 0)
Michael Graff's avatar
Michael Graff committed
312
#define RETAIN(header) \
Automatic Updater's avatar
Automatic Updater committed
313
	(((header)->attributes & RDATASET_ATTR_RETAIN) != 0)
314
#define NXDOMAIN(header) \
Automatic Updater's avatar
Automatic Updater committed
315
	(((header)->attributes & RDATASET_ATTR_NXDOMAIN) != 0)
316
#define RESIGN(header) \
Automatic Updater's avatar
Automatic Updater committed
317
	(((header)->attributes & RDATASET_ATTR_RESIGN) != 0)
318

319
320
#define DEFAULT_NODE_LOCK_COUNT         7       /*%< Should be prime. */
#define DEFAULT_CACHE_NODE_LOCK_COUNT   1009    /*%< Should be prime. */
Bob Halley's avatar
Bob Halley committed
321
322

typedef struct {
Automatic Updater's avatar
Automatic Updater committed
323
324
325
326
327
	nodelock_t                      lock;
	/* Protected in the refcount routines. */
	isc_refcount_t                  references;
	/* Locked by lock. */
	isc_boolean_t                   exiting;
328
329
330
} rbtdb_nodelock_t;

typedef struct rbtdb_changed {
Automatic Updater's avatar
Automatic Updater committed
331
332
333
	dns_rbtnode_t *                 node;
	isc_boolean_t                   dirty;
	ISC_LINK(struct rbtdb_changed)  link;
334
335
} rbtdb_changed_t;

336
typedef ISC_LIST(rbtdb_changed_t)       rbtdb_changedlist_t;
337
338

typedef struct rbtdb_version {
Automatic Updater's avatar
Automatic Updater committed
339
340
341
342
343
344
345
346
347
348
349
350
	/* Not locked */
	rbtdb_serial_t                  serial;
	/*
	 * Protected in the refcount routines.
	 * XXXJT: should we change the lock policy based on the refcount
	 * performance?
	 */
	isc_refcount_t                  references;
	/* Locked by database lock. */
	isc_boolean_t                   writer;
	isc_boolean_t                   commit_ok;
	rbtdb_changedlist_t             changed_list;
351
	rdatasetheaderlist_t		resigned_list;
Automatic Updater's avatar
Automatic Updater committed
352
	ISC_LINK(struct rbtdb_version)  link;
353
354
} rbtdb_version_t;

355
356
357
358
359
typedef ISC_LIST(rbtdb_version_t)       rbtdb_versionlist_t;

#ifdef LRU_DEBUG
/* statistics info for testing */
struct cachestat {
Automatic Updater's avatar
Automatic Updater committed
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
	unsigned int    cache_total;
	int             cache_current;
	unsigned int    ncache_total;
	int             ncache_current;
	unsigned int    a_total;
	int             a_current;
	unsigned int    aaaa_total;
	int             aaaa_current;
	unsigned int    ns_total;
	int             ns_current;
	unsigned int    ptr_total;
	int             ptr_current;
	unsigned int    glue_total;
	int             glue_current;
	unsigned int    additional_total;
	int             additional_current;

	unsigned int    stale_purge;
	unsigned int    stale_scan;
	unsigned int    stale_expire;
	unsigned int    stale_lru;
381
382
};
#endif
Bob Halley's avatar
Bob Halley committed
383

384
385
386
387
388
389
typedef enum {
	dns_db_insecure,
	dns_db_partial,
	dns_db_secure
} dns_db_secure_t;

Bob Halley's avatar
Bob Halley committed
390
typedef struct {
Automatic Updater's avatar
Automatic Updater committed
391
392
	/* Unlocked. */
	dns_db_t                        common;
393
#if DNS_RBTDB_USERWLOCK
Automatic Updater's avatar
Automatic Updater committed
394
	isc_rwlock_t                    lock;
395
#else
Automatic Updater's avatar
Automatic Updater committed
396
	isc_mutex_t                     lock;
397
#endif
Automatic Updater's avatar
Automatic Updater committed
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
	isc_rwlock_t                    tree_lock;
	unsigned int                    node_lock_count;
	rbtdb_nodelock_t *              node_locks;
	dns_rbtnode_t *                 origin_node;
	/* Locked by lock. */
	unsigned int                    active;
	isc_refcount_t                  references;
	unsigned int                    attributes;
	rbtdb_serial_t                  current_serial;
	rbtdb_serial_t                  least_serial;
	rbtdb_serial_t                  next_serial;
	rbtdb_version_t *               current_version;
	rbtdb_version_t *               future_version;
	rbtdb_versionlist_t             open_versions;
	isc_boolean_t                   overmem;
	isc_task_t *                    task;
	dns_dbnode_t                    *soanode;
	dns_dbnode_t                    *nsnode;

	/*
	 * This is a linked list used to implement the LRU cache.  There will
	 * be node_lock_count linked lists here.  Nodes in bucket 1 will be
	 * placed on the linked list rdatasets[1].
	 */
	rdatasetheaderlist_t            *rdatasets;
423

Automatic Updater's avatar
Automatic Updater committed
424
425
	/*%
	 * Temporary storage for stale cache nodes and dynamically deleted
426
	 * nodes that await being cleaned up.
Automatic Updater's avatar
Automatic Updater committed
427
	 */
Automatic Updater's avatar
Automatic Updater committed
428
429
430
431
432
433
434
435
436
	rbtnodelist_t                   *deadnodes;

	/*
	 * Heaps.  Each of these is used for TTL based expiry.
	 */
	isc_heap_t                      **heaps;

	/* Locked by tree_lock. */
	dns_rbt_t *                     tree;
Automatic Updater's avatar
Automatic Updater committed
437
	dns_db_secure_t                 secure;
Automatic Updater's avatar
Automatic Updater committed
438
439
440

	/* Unlocked */
	unsigned int                    quantum;
441
#ifdef LRU_DEBUG
Automatic Updater's avatar
Automatic Updater committed
442
	struct cachestat                cachestat;
443
#endif
Bob Halley's avatar
Bob Halley committed
444
445
} dns_rbtdb_t;

446
447
#define RBTDB_ATTR_LOADED               0x01
#define RBTDB_ATTR_LOADING              0x02
448

449
/*%
450
451
452
 * Search Context
 */
typedef struct {
Automatic Updater's avatar
Automatic Updater committed
453
454
455
456
457
458
459
460
461
462
463
464
465
	dns_rbtdb_t *           rbtdb;
	rbtdb_version_t *       rbtversion;
	rbtdb_serial_t          serial;
	unsigned int            options;
	dns_rbtnodechain_t      chain;
	isc_boolean_t           copy_name;
	isc_boolean_t           need_cleanup;
	isc_boolean_t           wild;
	dns_rbtnode_t *         zonecut;
	rdatasetheader_t *      zonecut_rdataset;
	rdatasetheader_t *      zonecut_sigrdataset;
	dns_fixedname_t         zonecut_name;
	isc_stdtime_t           now;
466
467
} rbtdb_search_t;

468
/*%
469
470
471
 * Load Context
 */
typedef struct {
Automatic Updater's avatar
Automatic Updater committed
472
473
	dns_rbtdb_t *           rbtdb;
	isc_stdtime_t           now;
474
475
} rbtdb_load_t;

Bob Halley's avatar
Bob Halley committed
476
static void rdataset_disassociate(dns_rdataset_t *rdataset);
477
478
static isc_result_t rdataset_first(dns_rdataset_t *rdataset);
static isc_result_t rdataset_next(dns_rdataset_t *rdataset);
Bob Halley's avatar
Bob Halley committed
479
static void rdataset_current(dns_rdataset_t *rdataset, dns_rdata_t *rdata);
Bob Halley's avatar
Bob Halley committed
480
static void rdataset_clone(dns_rdataset_t *source, dns_rdataset_t *target);
Bob Halley's avatar
Bob Halley committed
481
static unsigned int rdataset_count(dns_rdataset_t *rdataset);
482
static isc_result_t rdataset_getnoqname(dns_rdataset_t *rdataset,
Automatic Updater's avatar
Automatic Updater committed
483
484
485
					dns_name_t *name,
					dns_rdataset_t *nsec,
					dns_rdataset_t *nsecsig);
486
static isc_result_t rdataset_getadditional(dns_rdataset_t *rdataset,
Automatic Updater's avatar
Automatic Updater committed
487
488
489
490
491
492
493
494
495
496
					   dns_rdatasetadditional_t type,
					   dns_rdatatype_t qtype,
					   dns_acache_t *acache,
					   dns_zone_t **zonep,
					   dns_db_t **dbp,
					   dns_dbversion_t **versionp,
					   dns_dbnode_t **nodep,
					   dns_name_t *fname,
					   dns_message_t *msg,
					   isc_stdtime_t now);
497
static isc_result_t rdataset_setadditional(dns_rdataset_t *rdataset,
Automatic Updater's avatar
Automatic Updater committed
498
499
500
501
502
503
504
505
					   dns_rdatasetadditional_t type,
					   dns_rdatatype_t qtype,
					   dns_acache_t *acache,
					   dns_zone_t *zone,
					   dns_db_t *db,
					   dns_dbversion_t *version,
					   dns_dbnode_t *node,
					   dns_name_t *fname);
506
static isc_result_t rdataset_putadditional(dns_acache_t *acache,
Automatic Updater's avatar
Automatic Updater committed
507
508
509
					   dns_rdataset_t *rdataset,
					   dns_rdatasetadditional_t type,
					   dns_rdatatype_t qtype);
510
static inline isc_boolean_t need_headerupdate(rdatasetheader_t *header,
Automatic Updater's avatar
Automatic Updater committed
511
					      isc_stdtime_t now);
512
static void update_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header,
Automatic Updater's avatar
Automatic Updater committed
513
			  isc_stdtime_t now);
514
static void check_stale_cache(dns_rbtdb_t *rbtdb, dns_rbtnode_t *rbtnode,
Automatic Updater's avatar
Automatic Updater committed
515
			      isc_stdtime_t now, isc_boolean_t tree_locked);
516
517
static isc_result_t resign_insert(dns_rbtdb_t *rbtdb, int idx,
				  rdatasetheader_t *newheader);
518
519

static dns_rdatasetmethods_t rdataset_methods = {
Automatic Updater's avatar
Automatic Updater committed
520
521
522
523
524
525
526
527
528
529
530
	rdataset_disassociate,
	rdataset_first,
	rdataset_next,
	rdataset_current,
	rdataset_clone,
	rdataset_count,
	NULL,
	rdataset_getnoqname,
	rdataset_getadditional,
	rdataset_setadditional,
	rdataset_putadditional
Bob Halley's avatar
Bob Halley committed
531
532
533
};

static void rdatasetiter_destroy(dns_rdatasetiter_t **iteratorp);
534
535
static isc_result_t rdatasetiter_first(dns_rdatasetiter_t *iterator);
static isc_result_t rdatasetiter_next(dns_rdatasetiter_t *iterator);
Bob Halley's avatar
Bob Halley committed
536
static void rdatasetiter_current(dns_rdatasetiter_t *iterator,
Automatic Updater's avatar
Automatic Updater committed
537
				 dns_rdataset_t *rdataset);
Bob Halley's avatar
Bob Halley committed
538
539

static dns_rdatasetitermethods_t rdatasetiter_methods = {
Automatic Updater's avatar
Automatic Updater committed
540
541
542
543
	rdatasetiter_destroy,
	rdatasetiter_first,
	rdatasetiter_next,
	rdatasetiter_current
544
545
};

Bob Halley's avatar
Bob Halley committed
546
typedef struct rbtdb_rdatasetiter {
Automatic Updater's avatar
Automatic Updater committed
547
548
	dns_rdatasetiter_t              common;
	rdatasetheader_t *              current;
Bob Halley's avatar
Bob Halley committed
549
550
} rbtdb_rdatasetiter_t;

551
552
553
554
static void             dbiterator_destroy(dns_dbiterator_t **iteratorp);
static isc_result_t     dbiterator_first(dns_dbiterator_t *iterator);
static isc_result_t     dbiterator_last(dns_dbiterator_t *iterator);
static isc_result_t     dbiterator_seek(dns_dbiterator_t *iterator,
Automatic Updater's avatar
Automatic Updater committed
555
					dns_name_t *name);
556
557
558
static isc_result_t     dbiterator_prev(dns_dbiterator_t *iterator);
static isc_result_t     dbiterator_next(dns_dbiterator_t *iterator);
static isc_result_t     dbiterator_current(dns_dbiterator_t *iterator,
Automatic Updater's avatar
Automatic Updater committed
559
560
					   dns_dbnode_t **nodep,
					   dns_name_t *name);
561
562
static isc_result_t     dbiterator_pause(dns_dbiterator_t *iterator);
static isc_result_t     dbiterator_origin(dns_dbiterator_t *iterator,
Automatic Updater's avatar
Automatic Updater committed
563
					  dns_name_t *name);
564
565

static dns_dbiteratormethods_t dbiterator_methods = {
Automatic Updater's avatar
Automatic Updater committed
566
567
568
569
570
571
572
573
574
	dbiterator_destroy,
	dbiterator_first,
	dbiterator_last,
	dbiterator_seek,
	dbiterator_prev,
	dbiterator_next,
	dbiterator_current,
	dbiterator_pause,
	dbiterator_origin
575
576
};

577
578
#define DELETION_BATCH_MAX 64

579
/*
580
 * If 'paused' is ISC_TRUE, then the tree lock is not being held.
581
 */
582
typedef struct rbtdb_dbiterator {
Automatic Updater's avatar
Automatic Updater committed
583
584
585
586
587
588
589
590
591
592
593
	dns_dbiterator_t                common;
	isc_boolean_t                   paused;
	isc_boolean_t                   new_origin;
	isc_rwlocktype_t                tree_locked;
	isc_result_t                    result;
	dns_fixedname_t                 name;
	dns_fixedname_t                 origin;
	dns_rbtnodechain_t              chain;
	dns_rbtnode_t                   *node;
	dns_rbtnode_t                   *deletions[DELETION_BATCH_MAX];
	int                             delete;
594
595
596
} rbtdb_dbiterator_t;


597
598
#define IS_STUB(rbtdb)  (((rbtdb)->common.attributes & DNS_DBATTR_STUB)  != 0)
#define IS_CACHE(rbtdb) (((rbtdb)->common.attributes & DNS_DBATTR_CACHE) != 0)
599

600
static void free_rbtdb(dns_rbtdb_t *rbtdb, isc_boolean_t log,
Automatic Updater's avatar
Automatic Updater committed
601
		       isc_event_t *event);
602
static void overmem(dns_db_t *db, isc_boolean_t overmem);
603

604
605
606
607
608
609
/*%
 * 'init_count' is used to initialize 'newheader->count' which inturn
 * is used to determine where in the cycle rrset-order cyclic starts.
 * We don't lock this as we don't care about simultanious updates.
 *
 * Note:
610
 *      Both init_count and header->count can be ISC_UINT32_MAX.
611
 *      The count on the returned rdataset however can't be as
612
613
 *      that indicates that the database does not implement cyclic
 *      processing.
614
615
616
 */
static unsigned int init_count;

617
618
619
620
621
622
/*
 * Locking
 *
 * If a routine is going to lock more than one lock in this module, then
 * the locking must be done in the following order:
 *
623
 *      Tree Lock
624
 *
625
626
 *      Node Lock       (Only one from the set may be locked at one time by
 *                       any caller)
627
 *
628
 *      Database Lock
629
630
631
632
 *
 * Failure to follow this hierarchy can result in deadlock.
 */

633
634
635
/*
 * Deleting Nodes
 *
636
 * For zone databases the node for the origin of the zone MUST NOT be deleted.
637
638
639
 */


640
641
642
643
/*
 * DB Routines
 */

Bob Halley's avatar
Bob Halley committed
644
645
static void
attach(dns_db_t *source, dns_db_t **targetp) {
Automatic Updater's avatar
Automatic Updater committed
646
	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)source;
Bob Halley's avatar
Bob Halley committed
647

Automatic Updater's avatar
Automatic Updater committed
648
	REQUIRE(VALID_RBTDB(rbtdb));
Bob Halley's avatar
Bob Halley committed
649

Automatic Updater's avatar
Automatic Updater committed
650
	isc_refcount_increment(&rbtdb->references, NULL);
Bob Halley's avatar
Bob Halley committed
651

Automatic Updater's avatar
Automatic Updater committed
652
	*targetp = source;
Bob Halley's avatar
Bob Halley committed
653
654
655
}

static void
656
free_rbtdb_callback(isc_task_t *task, isc_event_t *event) {
Automatic Updater's avatar
Automatic Updater committed
657
	dns_rbtdb_t *rbtdb = event->ev_arg;
658

Automatic Updater's avatar
Automatic Updater committed
659
	UNUSED(task);
660

Automatic Updater's avatar
Automatic Updater committed
661
	free_rbtdb(rbtdb, ISC_TRUE, event);
662
663
664
665
}

static void
set_ttl(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, dns_ttl_t newttl) {
Automatic Updater's avatar
Automatic Updater committed
666
667
668
669
670
671
672
	int idx;
	isc_heap_t *heap;
	dns_ttl_t oldttl;

	oldttl = header->rdh_ttl;
	header->rdh_ttl = newttl;

673
674
675
	if (!IS_CACHE(rbtdb))
		return;

Automatic Updater's avatar
Automatic Updater committed
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
	/*
	 * It's possible the rbtdb is not a cache.  If this is the case,
	 * we will not have a heap, and we move on.  If we do, though,
	 * we might need to adjust things.
	 */
	if (header->heap_index == 0 || newttl == oldttl)
		return;
	idx = header->node->locknum;
	if (rbtdb->heaps == NULL || rbtdb->heaps[idx] == NULL)
	    return;
	heap = rbtdb->heaps[idx];

	if (newttl < oldttl)
		isc_heap_increased(heap, header->heap_index);
	else
		isc_heap_decreased(heap, header->heap_index);
692
693
694
}

/*%
695
 * These functions allow the heap code to rank the priority of each
696
697
698
699
 * element.  It returns ISC_TRUE if v1 happens "sooner" than v2.
 */
static isc_boolean_t
ttl_sooner(void *v1, void *v2) {
Automatic Updater's avatar
Automatic Updater committed
700
701
	rdatasetheader_t *h1 = v1;
	rdatasetheader_t *h2 = v2;
702

Automatic Updater's avatar
Automatic Updater committed
703
704
705
	if (h1->rdh_ttl < h2->rdh_ttl)
		return (ISC_TRUE);
	return (ISC_FALSE);
706
707
}

708
709
static isc_boolean_t
resign_sooner(void *v1, void *v2) {
Automatic Updater's avatar
Automatic Updater committed
710
711
	rdatasetheader_t *h1 = v1;
	rdatasetheader_t *h2 = v2;
712

Automatic Updater's avatar
Automatic Updater committed
713
714
715
	if (h1->resign < h2->resign)
		return (ISC_TRUE);
	return (ISC_FALSE);
716
717
}

718
719
720
721
/*%
 * This function sets the heap index into the header.
 */
static void
722
set_index(void *what, unsigned int index) {
Automatic Updater's avatar
Automatic Updater committed
723
	rdatasetheader_t *h = what;
724

Automatic Updater's avatar
Automatic Updater committed
725
	h->heap_index = index;
726
727
}

728
729
730
731
732
733
/*%
 * Work out how many nodes can be deleted in the time between two
 * requests to the nameserver.  Smooth the resulting number and use it
 * as a estimate for the number of nodes to be deleted in the next
 * iteration.
 */
734
735
static unsigned int
adjust_quantum(unsigned int old, isc_time_t *start) {
Automatic Updater's avatar
Automatic Updater committed
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
	unsigned int pps = dns_pps;     /* packets per second */
	unsigned int interval;
	isc_uint64_t usecs;
	isc_time_t end;
	unsigned int new;

	if (pps < 100)
		pps = 100;
	isc_time_now(&end);

	interval = 1000000 / pps;       /* interval in usec */
	if (interval == 0)
		interval = 1;
	usecs = isc_time_microdiff(&end, start);
	if (usecs == 0) {
		/*
		 * We were unable to measure the amount of time taken.
		 * Double the nodes deleted next time.
		 */
		old *= 2;
		if (old > 1000)
			old = 1000;
		return (old);
	}
	new = old * interval;
	new /= (unsigned int)usecs;
	if (new == 0)
		new = 1;
	else if (new > 1000)
		new = 1000;

	/* Smooth */
	new = (new + old * 3) / 4;

	isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE, DNS_LOGMODULE_CACHE,
		      ISC_LOG_DEBUG(1), "adjust_quantum -> %d", new);

	return (new);
774
}
775

776
777
static void
free_rbtdb(dns_rbtdb_t *rbtdb, isc_boolean_t log, isc_event_t *event) {
Automatic Updater's avatar
Automatic Updater committed
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
	unsigned int i;
	isc_ondestroy_t ondest;
	isc_result_t result;
	char buf[DNS_NAME_FORMATSIZE];
	isc_time_t start;

	if (IS_CACHE(rbtdb) && rbtdb->common.rdclass == dns_rdataclass_in)
		overmem((dns_db_t *)rbtdb, (isc_boolean_t)-1);

	REQUIRE(rbtdb->current_version != NULL || EMPTY(rbtdb->open_versions));
	REQUIRE(rbtdb->future_version == NULL);

	if (rbtdb->current_version != NULL) {
		unsigned int refs;

		isc_refcount_decrement(&rbtdb->current_version->references,
				       &refs);
		INSIST(refs == 0);
		UNLINK(rbtdb->open_versions, rbtdb->current_version, link);
		isc_refcount_destroy(&rbtdb->current_version->references);
		isc_mem_put(rbtdb->common.mctx, rbtdb->current_version,
			    sizeof(rbtdb_version_t));
	}

802
803
804
805
806
807
808
809
810
811
	/*
	 * We assume the number of remaining dead nodes is reasonably small;
	 * the overhead of unlinking all nodes here should be negligible.
	 */
	for (i = 0; i < rbtdb->node_lock_count; i++) {
		dns_rbtnode_t *node;

		node = ISC_LIST_HEAD(rbtdb->deadnodes[i]);
		while (node != NULL) {
			ISC_LIST_UNLINK(rbtdb->deadnodes[i], node, deadlink);
Automatic Updater's avatar
Automatic Updater committed
812
813
814
			node = ISC_LIST_HEAD(rbtdb->deadnodes[i]);
		}
	}
815

Automatic Updater's avatar
Automatic Updater committed
816
817
	if (event == NULL)
		rbtdb->quantum = (rbtdb->task != NULL) ? 100 : 0;
818
 again:
Automatic Updater's avatar
Automatic Updater committed
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
	if (rbtdb->tree != NULL) {
		isc_time_now(&start);
		result = dns_rbt_destroy2(&rbtdb->tree, rbtdb->quantum);
		if (result == ISC_R_QUOTA) {
			INSIST(rbtdb->task != NULL);
			if (rbtdb->quantum != 0)
				rbtdb->quantum = adjust_quantum(rbtdb->quantum,
								&start);
			if (event == NULL)
				event = isc_event_allocate(rbtdb->common.mctx,
							   NULL,
							 DNS_EVENT_FREESTORAGE,
							   free_rbtdb_callback,
							   rbtdb,
							   sizeof(isc_event_t));
			if (event == NULL)
				goto again;
			isc_task_send(rbtdb->task, &event);
			return;
		}
		INSIST(result == ISC_R_SUCCESS && rbtdb->tree == NULL);
	}
	if (event != NULL)
		isc_event_free(&event);
	if (log) {
		if (dns_name_dynamic(&rbtdb->common.origin))
			dns_name_format(&rbtdb->common.origin, buf,
					sizeof(buf));
		else
			strcpy(buf, "<UNKNOWN>");
		isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
			      DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
			      "done free_rbtdb(%s)", buf);
	}
	if (dns_name_dynamic(&rbtdb->common.origin))
		dns_name_free(&rbtdb->common.origin, rbtdb->common.mctx);
	for (i = 0; i < rbtdb->node_lock_count; i++) {
		isc_refcount_destroy(&rbtdb->node_locks[i].references);
		NODE_DESTROYLOCK(&rbtdb->node_locks[i].lock);
	}

	/*
Automatic Updater's avatar
Automatic Updater committed
861
	 * Clean up LRU / re-signing order lists.
Automatic Updater's avatar
Automatic Updater committed
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
	 */
	if (rbtdb->rdatasets != NULL) {
		for (i = 0; i < rbtdb->node_lock_count; i++)
			INSIST(ISC_LIST_EMPTY(rbtdb->rdatasets[i]));
		isc_mem_put(rbtdb->common.mctx, rbtdb->rdatasets,
			    rbtdb->node_lock_count *
			    sizeof(rdatasetheaderlist_t));
	}
	/*
	 * Clean up dead node buckets.
	 */
	if (rbtdb->deadnodes != NULL) {
		for (i = 0; i < rbtdb->node_lock_count; i++)
			INSIST(ISC_LIST_EMPTY(rbtdb->deadnodes[i]));
		isc_mem_put(rbtdb->common.mctx, rbtdb->deadnodes,
		    rbtdb->node_lock_count * sizeof(rbtnodelist_t));
	}
	/*
Automatic Updater's avatar
Automatic Updater committed
880
	 * Clean up heap objects.
Automatic Updater's avatar
Automatic Updater committed
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
	 */
	if (rbtdb->heaps != NULL) {
		for (i = 0; i < rbtdb->node_lock_count; i++)
			isc_heap_destroy(&rbtdb->heaps[i]);
		isc_mem_put(rbtdb->common.mctx, rbtdb->heaps,
			    rbtdb->node_lock_count *
			    sizeof(isc_heap_t *));
	}

	isc_mem_put(rbtdb->common.mctx, rbtdb->node_locks,
		    rbtdb->node_lock_count * sizeof(rbtdb_nodelock_t));
	isc_rwlock_destroy(&rbtdb->tree_lock);
	isc_refcount_destroy(&rbtdb->references);
	if (rbtdb->task != NULL)
		isc_task_detach(&rbtdb->task);
896
897

#ifdef LRU_DEBUG
Automatic Updater's avatar
Automatic Updater committed
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
	/* Experimental logging about memory usage */
	if (IS_CACHE(rbtdb) && rbtdb->common.rdclass == dns_rdataclass_in) {
		isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
			      DNS_LOGMODULE_CACHE, ISC_LOG_INFO,
			      "cache DB %p: mem inuse %lu, XXX node, "
			      "%d/%u current/total cache, %d/%u neg, %d/%u A, %d/%u AAAA, "
			      "%d/%u NS, %d/%u PTR, %d/%u glue, "
			      "%d/%u  additional, purge/scan=%u(%u expiry, %u lru)/%u, "
			      "overmem=%d",
			      rbtdb,
			      (unsigned long)isc_mem_inuse(rbtdb->common.mctx),
			      rbtdb->cachestat.cache_current, rbtdb->cachestat.cache_total,
			      rbtdb->cachestat.ncache_current, rbtdb->cachestat.ncache_total,
			      rbtdb->cachestat.a_current, rbtdb->cachestat.a_total,
			      rbtdb->cachestat.aaaa_current, rbtdb->cachestat.aaaa_total,
			      rbtdb->cachestat.ns_current, rbtdb->cachestat.ns_total,
			      rbtdb->cachestat.ptr_current, rbtdb->cachestat.ptr_total,
			      rbtdb->cachestat.glue_current, rbtdb->cachestat.glue_total,
			      rbtdb->cachestat.additional_current,
			      rbtdb->cachestat.additional_total,
			      rbtdb->cachestat.stale_purge, rbtdb->cachestat.stale_expire,
			      rbtdb->cachestat.stale_lru, rbtdb->cachestat.stale_scan,
			      rbtdb->overmem);
		INSIST(rbtdb->cachestat.cache_current == 0);
		INSIST(rbtdb->cachestat.ncache_current == 0);
		INSIST(rbtdb->cachestat.a_current == 0);
		INSIST(rbtdb->cachestat.aaaa_current == 0);
		INSIST(rbtdb->cachestat.ns_current == 0);
		INSIST(rbtdb->cachestat.ptr_current == 0);
		INSIST(rbtdb->cachestat.glue_current == 0);
		INSIST(rbtdb->cachestat.additional_current == 0);
	}
930
931
#endif

Automatic Updater's avatar
Automatic Updater committed
932
933
934
935
936
937
	RBTDB_DESTROYLOCK(&rbtdb->lock);
	rbtdb->common.magic = 0;
	rbtdb->common.impmagic = 0;
	ondest = rbtdb->common.ondest;
	isc_mem_putanddetach(&rbtdb->common.mctx, rbtdb, sizeof(*rbtdb));
	isc_ondestroy_notify(&ondest, rbtdb);
Bob Halley's avatar
Bob Halley committed
938
939
}

940
static inline void
941
maybe_free_rbtdb(dns_rbtdb_t *rbtdb) {
Automatic Updater's avatar
Automatic Updater committed
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
	isc_boolean_t want_free = ISC_FALSE;
	unsigned int i;
	unsigned int inactive = 0;

	/* XXX check for open versions here */

	if (rbtdb->soanode != NULL)
		dns_db_detachnode((dns_db_t *)rbtdb, &rbtdb->soanode);
	if (rbtdb->nsnode != NULL)
		dns_db_detachnode((dns_db_t *)rbtdb, &rbtdb->nsnode);

	/*
	 * Even though there are no external direct references, there still
	 * may be nodes in use.
	 */
	for (i = 0; i < rbtdb->node_lock_count; i++) {
		NODE_LOCK(&rbtdb->node_locks[i].lock, isc_rwlocktype_write);
		rbtdb->node_locks[i].exiting = ISC_TRUE;
		NODE_UNLOCK(&rbtdb->node_locks[i].lock, isc_rwlocktype_write);
		if (isc_refcount_current(&rbtdb->node_locks[i].references)
		    == 0) {
			inactive++;
		}
	}

	if (inactive != 0) {
		RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
		rbtdb->active -= inactive;
		if (rbtdb->active == 0)
			want_free = ISC_TRUE;
		RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
		if (want_free) {
			char buf[DNS_NAME_FORMATSIZE];
			if (dns_name_dynamic(&rbtdb->common.origin))
				dns_name_format(&rbtdb->common.origin, buf,
						sizeof(buf));
			else
				strcpy(buf, "<UNKNOWN>");
			isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
				      DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
				      "calling free_rbtdb(%s)", buf);
			free_rbtdb(rbtdb, ISC_TRUE, NULL);
		}
	}
Bob Halley's avatar
Bob Halley committed
986
987
988
989
}

static void
detach(dns_db_t **dbp) {
Automatic Updater's avatar
Automatic Updater committed
990
991
	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)(*dbp);
	unsigned int refs;
Bob Halley's avatar
Bob Halley committed
992

Automatic Updater's avatar
Automatic Updater committed
993
	REQUIRE(VALID_RBTDB(rbtdb));
Bob Halley's avatar
Bob Halley committed
994

Automatic Updater's avatar
Automatic Updater committed
995
	isc_refcount_decrement(&rbtdb->references, &refs);
Bob Halley's avatar
Bob Halley committed
996

Automatic Updater's avatar
Automatic Updater committed
997
998
	if (refs == 0)
		maybe_free_rbtdb(rbtdb);
Bob Halley's avatar
Bob Halley committed
999

Automatic Updater's avatar
Automatic Updater committed
1000
	*dbp = NULL;
Bob Halley's avatar
Bob Halley committed
1001
1002
1003
1004
}

static void
currentversion(dns_db_t *db, dns_dbversion_t **versionp) {
Automatic Updater's avatar
Automatic Updater committed
1005
1006
1007
	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
	rbtdb_version_t *version;
	unsigned int refs;
Bob Halley's avatar
Bob Halley committed
1008

Automatic Updater's avatar
Automatic Updater committed
1009
	REQUIRE(VALID_RBTDB(rbtdb));
Bob Halley's avatar
Bob Halley committed
1010

Automatic Updater's avatar
Automatic Updater committed
1011
1012
1013
1014
	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
	version = rbtdb->current_version;
	isc_refcount_increment(&version->references, &refs);
	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read);
1015

Automatic Updater's avatar
Automatic Updater committed
1016
	*versionp = (dns_dbversion_t *)version;
1017
1018
1019
}

static inline rbtdb_version_t *
1020
allocate_version(isc_mem_t *mctx, rbtdb_serial_t serial,
Automatic Updater's avatar
Automatic Updater committed
1021
		 unsigned int references, isc_boolean_t writer)
1022
{
Automatic Updater's avatar
Automatic Updater committed
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
	isc_result_t result;
	rbtdb_version_t *version;

	version = isc_mem_get(mctx, sizeof(*version));
	if (version == NULL)
		return (NULL);
	version->serial = serial;
	result = isc_refcount_init(&version->references, references);
	if (result != ISC_R_SUCCESS) {
		isc_mem_put(mctx, version, sizeof(*version));
		return (NULL);
	}
	version->writer = writer;
	version->commit_ok = ISC_FALSE;
	ISC_LIST_INIT(version->changed_list);
Automatic Updater's avatar
Automatic Updater committed
1038
	ISC_LIST_INIT(version->resigned_list);
Automatic Updater's avatar
Automatic Updater committed
1039
1040
1041
	ISC_LINK_INIT(version, link);

	return (version);
Bob Halley's avatar
Bob Halley committed
1042
1043
}

1044
static isc_result_t
Bob Halley's avatar
Bob Halley committed
1045
newversion(dns_db_t *db, dns_dbversion_t **versionp) {
Automatic Updater's avatar
Automatic Updater committed
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
	rbtdb_version_t *version;

	REQUIRE(VALID_RBTDB(rbtdb));
	REQUIRE(versionp != NULL && *versionp == NULL);
	REQUIRE(rbtdb->future_version == NULL);

	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
	RUNTIME_CHECK(rbtdb->next_serial != 0);         /* XXX Error? */
	version = allocate_version(rbtdb->common.mctx, rbtdb->next_serial, 1,
				   ISC_TRUE);
	if (version != NULL) {
		version->commit_ok = ISC_TRUE;
		rbtdb->next_serial++;
		rbtdb->future_version = version;
	}
	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);

	if (version == NULL)
		return (ISC_R_NOMEMORY);

	*versionp = version;

	return (ISC_R_SUCCESS);
1070
1071
}

1072
1073
static void
attachversion(dns_db_t *db, dns_dbversion_t *source,
Automatic Updater's avatar
Automatic Updater committed
1074
	      dns_dbversion_t **targetp)
1075
{
Automatic Updater's avatar
Automatic Updater committed
1076
1077
1078
	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
	rbtdb_version_t *rbtversion = source;
	unsigned int refs;
1079

Automatic Updater's avatar
Automatic Updater committed
1080
	REQUIRE(VALID_RBTDB(rbtdb));
1081

Automatic Updater's avatar
Automatic Updater committed
1082
1083
	isc_refcount_increment(&rbtversion->references, &refs);
	INSIST(refs > 1);
1084

Automatic Updater's avatar
Automatic Updater committed
1085
	*targetp = rbtversion;
1086
1087
}

1088
1089
static rbtdb_changed_t *
add_changed(dns_rbtdb_t *rbtdb, rbtdb_version_t *version,
Automatic Updater's avatar
Automatic Updater committed
1090
	    dns_rbtnode_t *node)
1091
{
Automatic Updater's avatar
Automatic Updater committed
1092
1093
	rbtdb_changed_t *changed;
	unsigned int refs;
1094

Automatic Updater's avatar
Automatic Updater committed
1095
1096
1097
1098
	/*
	 * Caller must be holding the node lock if its reference must be
	 * protected by the lock.
	 */
1099

Automatic Updater's avatar
Automatic Updater committed
1100
	changed = isc_mem_get(rbtdb->common.mctx, sizeof(*changed));
Bob Halley's avatar
Bob Halley committed
1101

Automatic Updater's avatar
Automatic Updater committed
1102
	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
Bob Halley's avatar
Bob Halley committed
1103

Automatic Updater's avatar
Automatic Updater committed
1104
	REQUIRE(version->writer);
Bob Halley's avatar
Bob Halley committed
1105

Automatic Updater's avatar
Automatic Updater committed
1106
1107
1108
1109
1110
1111
1112
1113
	if (changed != NULL) {
		dns_rbtnode_refincrement(node, &refs);
		INSIST(refs != 0);
		changed->node = node;
		changed->dirty = ISC_FALSE;
		ISC_LIST_INITANDAPPEND(version->changed_list, changed, link);
	} else
		version->commit_ok = ISC_FALSE;
Bob Halley's avatar
Bob Halley committed
1114

Automatic Updater's avatar
Automatic Updater committed
1115
	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
1116

Automatic Updater's avatar
Automatic Updater committed
1117
	return (changed);
1118
1119
}

1120
1121
static void
free_acachearray(isc_mem_t *mctx, rdatasetheader_t *header,
Automatic Updater's avatar
Automatic Updater committed
1122
		 acachectl_t *array)
1123
{
Automatic Updater's avatar
Automatic Updater committed
1124
1125
1126
	unsigned int count;
	unsigned int i;
	unsigned char *raw;     /* RDATASLAB */
1127

Automatic Updater's avatar
Automatic Updater committed
1128
1129
1130
	/*
	 * The caller must be holding the corresponding node lock.
	 */
1131

Automatic Updater's avatar
Automatic Updater committed
1132
1133
	if (array == NULL)
		return;
1134

Automatic Updater's avatar
Automatic Updater committed
1135
1136
	raw = (unsigned char *)header + sizeof(*header);
	count = raw[0] * 256 + raw[1];
1137

Automatic Updater's avatar
Automatic Updater committed
1138
1139
1140
1141
1142
1143
1144
	/*
	 * Sanity check: since an additional cache entry has a reference to
	 * the original DB node (in the callback arg), there should be no
	 * acache entries when the node can be freed.
	 */
	for (i = 0; i < count; i++)
		INSIST(array[i].entry == NULL && array[i].cbarg == NULL);
1145

Automatic Updater's avatar
Automatic Updater committed
1146
	isc_mem_put(mctx, array, count * sizeof(acachectl_t));
1147
1148
}

1149
1150
1151
static inline void
free_noqname(isc_mem_t *mctx, struct noqname **noqname) {

Automatic Updater's avatar
Automatic Updater committed
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
	if (dns_name_dynamic(&(*noqname)->name))
		dns_name_free(&(*noqname)->name, mctx);
	if ((*noqname)->nsec != NULL)
		isc_mem_put(mctx, (*noqname)->nsec,
			    dns_rdataslab_size((*noqname)->nsec, 0));
	if ((*noqname)->nsecsig != NULL)
		isc_mem_put(mctx, (*noqname)->nsecsig,
			    dns_rdataslab_size((*noqname)->nsecsig, 0));
	isc_mem_put(mctx, *noqname, sizeof(**noqname));
	*noqname = NULL;
1162
1163
}

1164
static inline void
1165
1166
init_rdataset(dns_rbtdb_t *rbtdb, rdatasetheader_t *h)
{
Automatic Updater's avatar
Automatic Updater committed
1167
1168
	ISC_LINK_INIT(h, lru_link);
	h->heap_index = 0;
1169
1170

#if TRACE_HEADER
Automatic Updater's avatar
Automatic Updater committed
1171
1172
	if (IS_CACHE(rbtdb) && rbtdb->common.rdclass == dns_rdataclass_in)
		fprintf(stderr, "initialized header: %p\n", h);
1173
#else
Automatic Updater's avatar
Automatic Updater committed
1174
	UNUSED(rbtdb);
1175
1176
#endif
}
1177

1178
1179
1180
static inline rdatasetheader_t *
new_rdataset(dns_rbtdb_t *rbtdb, isc_mem_t *mctx)
{
Automatic Updater's avatar
Automatic Updater committed
1181
	rdatasetheader_t *h;
1182

Automatic Updater's avatar
Automatic Updater committed
1183
1184
1185
	h = isc_mem_get(mctx, sizeof(*h));
	if (h == NULL)
		return (NULL);
1186

1187
#if TRACE_HEADER
Automatic Updater's avatar
Automatic Updater committed
1188
1189
	if (IS_CACHE(rbtdb) && rbtdb->common.rdclass == dns_rdataclass_in)
		fprintf(stderr, "allocated header: %p\n", h);
1190
#endif
Automatic Updater's avatar
Automatic Updater committed
1191
1192
	init_rdataset(rbtdb, h);
	return (h);
1193
1194
1195
1196
1197
}

static inline void
free_rdataset(dns_rbtdb_t *rbtdb, isc_mem_t *mctx, rdatasetheader_t *rdataset)
{
Automatic Updater's avatar
Automatic Updater committed
1198
	unsigned int size;
1199
	int idx;
1200
1201

#ifdef LRU_DEBUG
Automatic Updater's avatar
Automatic Updater committed
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
	/*
	 * for debug: statistics update.
	 * Nothing in this block should have any side-effects.
	 */
	if (EXISTS(rdataset) &&
	    (rdataset->attributes & RDATASET_ATTR_CACHE) != 0) {
		rbtdb->cachestat.cache_current--;
		if ((rdataset->attributes & RDATASET_ATTR_CANCELED) != 0)
			rbtdb->cachestat.cache_total--;
		if (RBTDB_RDATATYPE_BASE(rdataset->type) == 0) {
			rbtdb->cachestat.ncache_current--;
			INSIST(rbtdb->cachestat.ncache_current >= 0);
			if ((rdataset->attributes & RDATASET_ATTR_CANCELED)
			    != 0)
				rbtdb->cachestat.ncache_total--;
		}
		if (rdataset->type == dns_rdatatype_a) {
			rbtdb->cachestat.a_current--;
			INSIST(rbtdb->cachestat.a_current >= 0);
			if ((rdataset->attributes & RDATASET_ATTR_CANCELED)
			    != 0)
				rbtdb->cachestat.a_total--;
		} else if (rdataset->type == dns_rdatatype_aaaa) {
			rbtdb->cachestat.aaaa_current--;
			INSIST(rbtdb->cachestat.aaaa_current >= 0);
			if ((rdataset->attributes & RDATASET_ATTR_CANCELED)
			    != 0)
				rbtdb->cachestat.aaaa_total--;
		} else if (rdataset->type == dns_rdatatype_ptr) {
			rbtdb->cachestat.ptr_current--;
			INSIST(rbtdb->cachestat.ptr_current >= 0);
			if ((rdataset->attributes & RDATASET_ATTR_CANCELED)
			    != 0)
				rbtdb->cachestat.ptr_total--;
		} else if (rdataset->type == dns_rdatatype_ns) {
			rbtdb->cachestat.ns_current--;
			INSIST(rbtdb->cachestat.ns_current >= 0);
			if ((rdataset->attributes & RDATASET_ATTR_CANCELED)
			    != 0)
				rbtdb->cachestat.ns_total--;
		}
		if (rdataset->trust == dns_trust_glue &&
		    (rdataset->type == dns_rdatatype_a ||
		     rdataset->type == dns_rdatatype_aaaa)) {
			rbtdb->cachestat.glue_current--;
			INSIST(rbtdb->cachestat.glue_current >= 0);
			if ((rdataset->attributes & RDATASET_ATTR_CANCELED)
			    != 0)
				rbtdb->cachestat.glue_total--;
		}
		if (rdataset->trust == dns_trust_additional &&
		    (rdataset->type == dns_rdatatype_a ||
		     rdataset->type == dns_rdatatype_aaaa)) {
			rbtdb->cachestat.additional_current--;
			INSIST(rbtdb->cachestat.additional_current >= 0);
			if ((rdataset->attributes & RDATASET_ATTR_CANCELED)
			    != 0)
				rbtdb->cachestat.additional_total--;
		}
	}
1262
1263
#endif

1264
1265
	idx = rdataset->node->locknum;
	if (ISC_LINK_LINKED(rdataset, lru_link))
Automatic Updater's avatar
Automatic Updater committed
1266
		ISC_LIST_UNLINK(rbtdb->rdatasets[idx], rdataset, lru_link);
1267
1268
1269
	if (rdataset->heap_index != 0)
		isc_heap_delete(rbtdb->heaps[idx], rdataset->heap_index);
	rdataset->heap_index = 0;
Automatic Updater's avatar
Automatic Updater committed
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282

	if (rdataset->noqname != NULL)
		free_noqname(mctx, &rdataset->noqname);

	free_acachearray(mctx, rdataset, rdataset->additional_auth);
	free_acachearray(mctx, rdataset, rdataset->additional_glue);

	if ((rdataset->attributes & RDATASET_ATTR_NONEXISTENT) != 0)
		size = sizeof(*rdataset);
	else
		size = dns_rdataslab_size((unsigned char *)rdataset,
					  sizeof(*rdataset));
	isc_mem_put(mctx, rdataset, size);
1283
1284
}

1285
static inline void
1286
rollback_node(dns_rbtnode_t *node, rbtdb_serial_t serial) {
Automatic Updater's avatar
Automatic Updater committed
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
	rdatasetheader_t *header, *dcurrent;
	isc_boolean_t make_dirty = ISC_FALSE;

	/*
	 * Caller must hold the node lock.
	 */

	/*
	 * We set the IGNORE attribute on rdatasets with serial number
	 * 'serial'.  When the reference count goes to zero, these rdatasets
	 * will be cleaned up; until that time, they will be ignored.
	 */
	for (header = node->data; header != NULL; header = header->next) {
		if (header->serial == serial) {
			header->attributes |= RDATASET_ATTR_IGNORE;
			make_dirty = ISC_TRUE;
		}
		for (dcurrent = header->down;
		     dcurrent != NULL;
		     dcurrent = dcurrent->down) {
			if (dcurrent->serial == serial) {
				dcurrent->attributes |= RDATASET_ATTR_IGNORE;
				make_dirty = ISC_TRUE;
			}
		}
	}
	if (make_dirty)
		node->dirty = 1;
1315
1316
}

1317
static inline void
1318
1319
clean_stale_headers(dns_rbtdb_t *rbtdb, isc_mem_t *mctx, rdatasetheader_t *top)
{
Automatic Updater's avatar
Automatic Updater committed
1320
	rdatasetheader_t *d, *down_next;
1321

Automatic Updater's avatar
Automatic Updater committed
1322
1323
1324
1325
1326
	for (d = top->down; d != NULL; d = down_next) {
		down_next = d->down;
		free_rdataset(rbtdb, mctx, d);
	}
	top->down = NULL;
1327
1328
}

1329
1330
static inline void
clean_cache_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node) {
Automatic Updater's avatar
Automatic Updater committed
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
	rdatasetheader_t *current, *top_prev, *top_next;
	isc_mem_t *mctx = rbtdb->common.mctx;

	/*
	 * Caller must be holding the node lock.
	 */

	top_prev = NULL;
	for (current = node->data; current != NULL; current = top_next) {
		top_next = current->next;
		clean_stale_headers(rbtdb, mctx, current);
		/*
		 * If current is nonexistent or stale, we can clean it up.
		 */
		if ((current->attributes &
		     (RDATASET_ATTR_NONEXISTENT|RDATASET_ATTR_STALE)) != 0) {
			if (top_prev != NULL)
				top_prev->next = current->next;
			else
				node->data = current->next;
			free_rdataset(rbtdb, mctx, current);
		} else
			top_prev = current;
	}
	node->dirty = 0;
1356
1357
1358
1359
}

static inline void
clean_zone_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
Automatic Updater's avatar
Automatic Updater committed
1360
		rbtdb_serial_t least_serial)
1361
{
Automatic Updater's avatar
Automatic Updater committed
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
	rdatasetheader_t *current, *dcurrent, *down_next, *dparent;
	rdatasetheader_t *top_prev, *top_next;
	isc_mem_t *mctx = rbtdb->common.mctx;
	isc_boolean_t still_dirty = ISC_FALSE;

	/*
	 * Caller must be holding the node lock.
	 */
	REQUIRE(least_serial != 0);

	top_prev = NULL;
	for (current = node->data; current != NULL; current = top_next) {
		top_next = current->next;

		/*
		 * First, we clean up any instances of multiple rdatasets
		 * with the same serial number, or that have the IGNORE
		 * attribute.
		 */
		dparent = current;
		for (dcurrent = current->down;
		     dcurrent != NULL;
		     dcurrent = down_next) {
			down_next = dcurrent->down;
			INSIST(dcurrent->serial <= dparent->serial);
			if (dcurrent->serial == dparent->serial ||
			    IGNORE(dcurrent)) {
				if (down_next != NULL)
					down_next->next = dparent;
				dparent->down = down_next;
				free_rdataset(rbtdb, mctx, dcurrent);
			} else
				dparent = dcurrent;
		}

		/*
		 * We've now eliminated all IGNORE datasets with the possible
		 * exception of current, which we now check.
		 */
		if (IGNORE(current)) {
			down_next = current->down;
			if (down_next == NULL) {
				if (top_prev != NULL)
					top_prev->next = current->next;
				else
					node->data = current->next;
				free_rdataset(rbtdb, mctx, current);
				/*
				 * current no longer exists, so we can
				 * just continue with the loop.
				 */
				continue;
			} else {
				/*
				 * Pull up current->down, making it the new
				 * current.
				 */
				if (top_prev != NULL)
					top_prev->next = down_next;
				else
					node->data = down_next;
				down_next->next = top_next;
				free_rdataset(rbtdb, mctx, current);
				current = down_next;
			}
		}

		/*
		 * We now try to find the first down node less than the
		 * least serial.
		 */
		dparent = current;
		for (dcurrent = current->down;
		     dcurrent != NULL;
		     dcurrent = down_next) {
			down_next = dcurrent->down;
			if (dcurrent->serial < least_serial)
				break;
			dparent = dcurrent;
		}

		/*
		 * If there is a such an rdataset, delete it and any older
		 * versions.
		 */
		if (dcurrent != NULL) {
			do {
				down_next = dcurrent->down;
				INSIST(dcurrent->serial <= least_serial);
				free_rdataset(rbtdb, mctx, dcurrent);
				dcurrent = down_next;
			} while (dcurrent != NULL);
			dparent->down = NULL;
		}

		/*
		 * Note.  The serial number of 'current' might be less than
		 * least_serial too, but we cannot delete it because it is
		 * the most recent version, unless it is a NONEXISTENT
		 * rdataset.
		 */
		if (current->down != NULL) {
			still_dirty = ISC_TRUE;
			top_prev = current;
		} else {
			/*
			 * If this is a NONEXISTENT rdataset, we can delete it.
			 */
			if (NONEXISTENT(current)) {
				if (top_prev != NULL)
					top_prev->next = current->next;
				else
					node->data = current->next;
				free_rdataset(rbtdb, mctx, current);
			} else
				top_prev = current;
		}
	}
	if (!still_dirty)
		node->dirty = 0;
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
}

/*%
 * Clean up dead nodes.  These are nodes which have no references, and
 * have no data.  They are dead but we could not or chose not to delete
 * them when we deleted all the data at that node because we did not want
 * to wait for the tree write lock.
 *
 * The caller must hold a tree write lock and bucketnum'th node (write) lock.
 */
static void
cleanup_dead_nodes(dns_rbtdb_t *rbtdb, int bucketnum) {
Automatic Updater's avatar
Automatic Updater committed
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506