rbtdb.c 175 KB
Newer Older
Bob Halley's avatar
Bob Halley committed
1
/*
Mark Andrews's avatar
Mark Andrews committed
2
 * Copyright (C) 2004-2006  Internet Systems Consortium, Inc. ("ISC")
Mark Andrews's avatar
Mark Andrews committed
3
 * Copyright (C) 1999-2003  Internet Software Consortium.
4
 *
Bob Halley's avatar
Bob Halley committed
5
6
7
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
8
 *
Mark Andrews's avatar
Mark Andrews committed
9
10
11
12
13
14
15
 * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
 * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
 * PERFORMANCE OF THIS SOFTWARE.
Bob Halley's avatar
Bob Halley committed
16
17
 */

Mark Andrews's avatar
Mark Andrews committed
18
/* $Id: rbtdb.c,v 1.238 2006/07/31 02:04:03 marka Exp $ */
19
20

/*! \file */
David Lawrence's avatar
David Lawrence committed
21

22
23
24
25
/*
 * Principal Author: Bob Halley
 */

26
27
#include <config.h>

28
#include <isc/event.h>
29
#include <isc/mem.h>
30
#include <isc/print.h>
31
#include <isc/mutex.h>
32
#include <isc/random.h>
33
#include <isc/refcount.h>
Bob Halley's avatar
Bob Halley committed
34
#include <isc/rwlock.h>
35
#include <isc/string.h>
36
#include <isc/task.h>
37
#include <isc/time.h>
Michael Graff's avatar
Michael Graff committed
38
#include <isc/util.h>
Bob Halley's avatar
Bob Halley committed
39

40
#include <dns/acache.h>
41
42
#include <dns/db.h>
#include <dns/dbiterator.h>
43
#include <dns/events.h>
44
#include <dns/fixedname.h>
45
#include <dns/lib.h>
46
#include <dns/log.h>
47
48
#include <dns/masterdump.h>
#include <dns/rbt.h>
49
#include <dns/rdata.h>
Bob Halley's avatar
Bob Halley committed
50
51
#include <dns/rdataset.h>
#include <dns/rdatasetiter.h>
52
53
#include <dns/rdataslab.h>
#include <dns/result.h>
54
55
#include <dns/view.h>
#include <dns/zone.h>
56
#include <dns/zonekey.h>
Bob Halley's avatar
Bob Halley committed
57

58
59
60
#ifdef DNS_RBTDB_VERSION64
#include "rbtdb64.h"
#else
Bob Halley's avatar
Bob Halley committed
61
#include "rbtdb.h"
62
#endif
Bob Halley's avatar
Bob Halley committed
63

64
#ifdef DNS_RBTDB_VERSION64
65
#define RBTDB_MAGIC			ISC_MAGIC('R', 'B', 'D', '8')
66
#else
67
#define RBTDB_MAGIC			ISC_MAGIC('R', 'B', 'D', '4')
68
69
#endif

70
/*%
71
72
73
74
75
 * Note that "impmagic" is not the first four bytes of the struct, so
 * ISC_MAGIC_VALID cannot be used.
 */
#define VALID_RBTDB(rbtdb)	((rbtdb) != NULL && \
				 (rbtdb)->common.impmagic == RBTDB_MAGIC)
Bob Halley's avatar
Bob Halley committed
76

77
78
#ifdef DNS_RBTDB_VERSION64
typedef isc_uint64_t			rbtdb_serial_t;
79
/*%
80
81
82
83
84
85
 * Make casting easier in symbolic debuggers by using different names
 * for the 64 bit version.
 */
#define dns_rbtdb_t dns_rbtdb64_t
#define rdatasetheader_t rdatasetheader64_t
#define rbtdb_version_t rbtdb_version64_t
86
87
88
89
#else
typedef isc_uint32_t			rbtdb_serial_t;
#endif

Bob Halley's avatar
Bob Halley committed
90
91
typedef isc_uint32_t			rbtdb_rdatatype_t;

92
93
#define RBTDB_RDATATYPE_BASE(type)	((dns_rdatatype_t)((type) & 0xFFFF))
#define RBTDB_RDATATYPE_EXT(type)	((dns_rdatatype_t)((type) >> 16))
Bob Halley's avatar
Bob Halley committed
94
95
#define RBTDB_RDATATYPE_VALUE(b, e)	(((e) << 16) | (b))

96
97
#define RBTDB_RDATATYPE_SIGNSEC \
		RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_nsec)
Bob Halley's avatar
Bob Halley committed
98
#define RBTDB_RDATATYPE_SIGNS \
99
		RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_ns)
Bob Halley's avatar
Bob Halley committed
100
#define RBTDB_RDATATYPE_SIGCNAME \
101
		RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_cname)
102
#define RBTDB_RDATATYPE_SIGDNAME \
103
		RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_dname)
104
#define RBTDB_RDATATYPE_NCACHEANY \
Bob Halley's avatar
Bob Halley committed
105
		RBTDB_RDATATYPE_VALUE(0, dns_rdatatype_any)
Bob Halley's avatar
Bob Halley committed
106

107
/*
108
 * We use rwlock for DB lock only when ISC_RWLOCK_USEATOMIC is non 0.
109
 * Using rwlock is effective with regard to lookup performance only when
110
 * it is implemented in an efficient way.
111
112
113
114
 * Otherwise, it is generally wise to stick to the simple locking since rwlock
 * would require more memory or can even make lookups slower due to its own
 * overhead (when it internally calls mutex locks).
 */
115
#ifdef ISC_RWLOCK_USEATOMIC
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
#define DNS_RBTDB_USERWLOCK 1
#else
#define DNS_RBTDB_USERWLOCK 0
#endif

#if DNS_RBTDB_USERWLOCK
#define RBTDB_INITLOCK(l)	isc_rwlock_init((l), 0, 0)
#define RBTDB_DESTROYLOCK(l)	isc_rwlock_destroy(l)
#define RBTDB_LOCK(l, t)	RWLOCK((l), (t))
#define RBTDB_UNLOCK(l, t)	RWUNLOCK((l), (t))
#else
#define RBTDB_INITLOCK(l)	isc_mutex_init(l)
#define RBTDB_DESTROYLOCK(l)	DESTROYLOCK(l)
#define RBTDB_LOCK(l, t)	LOCK(l)
#define RBTDB_UNLOCK(l, t)	UNLOCK(l)
#endif

/*
 * Since node locking is sensitive to both performance and memory footprint,
 * we need some trick here.  If we have both high-performance rwlock and
 * high performance and small-memory reference counters, we use rwlock for
 * node lock and isc_refcount for node references.  In this case, we don't have
 * to protect the access to the counters by locks.
 * Otherwise, we simply use ordinary mutex lock for node locking, and use
 * simple integers as reference counters which is protected by the lock.
 * In most cases, we can simply use wrapper macros such as NODE_LOCK and
 * NODE_UNLOCK.  In some other cases, however, we need to protect reference
 * counters first and then protect other parts of a node as read-only data.
 * Special additional macros, NODE_STRONGLOCK(), NODE_WEAKLOCK(), etc, are also
 * provided for these special cases.  When we can use the efficient backend
 * routines, we should only protect the "other members" by NODE_WEAKLOCK(read).
 * Otherwise, we should use NODE_STRONGLOCK() to protect the entire critical
 * section including the access to the reference counter.
 * Note that we cannot use NODE_LOCK()/NODE_UNLOCK() wherever the protected
 * section is also protected by NODE_STRONGLOCK().
 */
#if defined(ISC_RWLOCK_USEATOMIC) && defined(DNS_RBT_USEISCREFCOUNT)
typedef isc_rwlock_t nodelock_t;

#define NODE_INITLOCK(l)	isc_rwlock_init((l), 0, 0)
#define NODE_DESTROYLOCK(l)	isc_rwlock_destroy(l)
#define NODE_LOCK(l, t)		RWLOCK((l), (t))
#define NODE_UNLOCK(l, t)	RWUNLOCK((l), (t))
#define NODE_TRYUPGRADE(l)	isc_rwlock_tryupgrade(l)

#define NODE_STRONGLOCK(l)
#define NODE_STRONGUNLOCK(l)
#define NODE_WEAKLOCK(l, t)	NODE_LOCK(l, t)
#define NODE_WEAKUNLOCK(l, t)	NODE_UNLOCK(l, t)
#else
typedef isc_mutex_t nodelock_t;

#define NODE_INITLOCK(l)	isc_mutex_init(l)
#define NODE_DESTROYLOCK(l)	DESTROYLOCK(l)
#define NODE_LOCK(l, t)		LOCK(l)
#define NODE_UNLOCK(l, t)	UNLOCK(l)
#define NODE_TRYUPGRADE(l)	ISC_R_SUCCESS

#define NODE_STRONGLOCK(l)	LOCK(l)
#define NODE_STRONGUNLOCK(l)	UNLOCK(l)
#define NODE_WEAKLOCK(l, t)
#define NODE_WEAKUNLOCK(l, t)
#endif

180
181
182
183
184
185
/*
 * Allow clients with a virtual time of upto 5 minutes in the past to see
 * records that would have otherwise have expired.
 */
#define RBTDB_VIRTUAL 300

186
187
188
189
190
191
struct noqname {
	dns_name_t name;
	void *	   nsec;
	void *	   nsecsig;
};

192
193
typedef struct acachectl acachectl_t;  

Bob Halley's avatar
Bob Halley committed
194
typedef struct rdatasetheader {
195
	/*%
196
197
	 * Locked by the owning node's lock.
	 */
198
	rbtdb_serial_t			serial;
Bob Halley's avatar
Bob Halley committed
199
	dns_ttl_t			ttl;
Bob Halley's avatar
Bob Halley committed
200
	rbtdb_rdatatype_t		type;
201
	isc_uint16_t			attributes;
Bob Halley's avatar
Bob Halley committed
202
	dns_trust_t			trust;
203
	struct noqname			*noqname;
204
	/*%<
Bob Halley's avatar
Bob Halley committed
205
	 * We don't use the LIST macros, because the LIST structure has
206
	 * both head and tail pointers, and is doubly linked.
Bob Halley's avatar
Bob Halley committed
207
	 */
208

Bob Halley's avatar
Bob Halley committed
209
	struct rdatasetheader		*next;
210
	/*%<
211
212
213
214
215
216
	 * If this is the top header for an rdataset, 'next' points
	 * to the top header for the next rdataset (i.e., the next type).
	 * Otherwise, it points up to the header whose down pointer points
	 * at this header.
	 */
	  
217
	struct rdatasetheader		*down;
218
	/*%<
219
220
221
222
	 * Points to the header for the next older version of
	 * this rdataset.
	 */

223
	isc_uint32_t			count;
224
	/*%<
225
226
227
228
229
230
	 * Monotonously increased every time this rdataset is bound so that
	 * it is used as the base of the starting point in DNS responses
	 * when the "cyclic" rrset-order is required.  Since the ordering
	 * should not be so crucial, no lock is set for the counter for
	 * performance reasons.
	 */
231
232
233

	acachectl_t			*additional_auth;
	acachectl_t			*additional_glue;
Bob Halley's avatar
Bob Halley committed
234
235
} rdatasetheader_t;

Michael Graff's avatar
Michael Graff committed
236
237
238
239
#define RDATASET_ATTR_NONEXISTENT	0x0001
#define RDATASET_ATTR_STALE		0x0002
#define RDATASET_ATTR_IGNORE		0x0004
#define RDATASET_ATTR_RETAIN		0x0008
240
#define RDATASET_ATTR_NXDOMAIN		0x0010
Michael Graff's avatar
Michael Graff committed
241

242
243
244
245
246
247
248
249
250
251
252
253
254
typedef struct acache_cbarg {
	dns_rdatasetadditional_t	type;
	unsigned int			count;
	dns_db_t			*db;
	dns_dbnode_t			*node;
	rdatasetheader_t		*header;
} acache_cbarg_t;

struct acachectl {
	dns_acacheentry_t		*entry;
	acache_cbarg_t			*cbarg;
};

Michael Graff's avatar
Michael Graff committed
255
256
257
258
259
260
261
/*
 * XXX
 * When the cache will pre-expire data (due to memory low or other
 * situations) before the rdataset's TTL has expired, it MUST
 * respect the RETAIN bit and not expire the data until its TTL is
 * expired.
 */
262

263
264
#undef IGNORE			/* WIN32 winbase.h defines this. */

265
266
267
268
269
270
#define EXISTS(header) \
	(((header)->attributes & RDATASET_ATTR_NONEXISTENT) == 0)
#define NONEXISTENT(header) \
	(((header)->attributes & RDATASET_ATTR_NONEXISTENT) != 0)
#define IGNORE(header) \
	(((header)->attributes & RDATASET_ATTR_IGNORE) != 0)
Michael Graff's avatar
Michael Graff committed
271
272
#define RETAIN(header) \
	(((header)->attributes & RDATASET_ATTR_RETAIN) != 0)
273
274
#define NXDOMAIN(header) \
	(((header)->attributes & RDATASET_ATTR_NXDOMAIN) != 0)
275

276
#define DEFAULT_NODE_LOCK_COUNT		7	/*%< Should be prime. */
Mark Andrews's avatar
update    
Mark Andrews committed
277
#define DEFAULT_CACHE_NODE_LOCK_COUNT	1009	/*%< Should be prime. */
Bob Halley's avatar
Bob Halley committed
278
279

typedef struct {
280
281
282
	nodelock_t			lock;
	/* Protected in the refcount routines. */
	isc_refcount_t			references;
283
	/* Locked by lock. */
284
	isc_boolean_t			exiting;
285
286
287
288
289
290
291
292
293
294
295
} rbtdb_nodelock_t;

typedef struct rbtdb_changed {
	dns_rbtnode_t *			node;
	isc_boolean_t			dirty;
	ISC_LINK(struct rbtdb_changed)	link;
} rbtdb_changed_t;

typedef ISC_LIST(rbtdb_changed_t)	rbtdb_changedlist_t;

typedef struct rbtdb_version {
Bob Halley's avatar
Bob Halley committed
296
	/* Not locked */
297
	rbtdb_serial_t			serial;
298
299
300
301
302
303
	/*
	 * Protected in the refcount routines.
	 * XXXJT: should we change the lock policy based on the refcount
	 * performance?
	 */
	isc_refcount_t			references;
Bob Halley's avatar
Bob Halley committed
304
	/* Locked by database lock. */
305
306
307
308
309
310
311
	isc_boolean_t			writer;
	isc_boolean_t			commit_ok;
	rbtdb_changedlist_t		changed_list;
	ISC_LINK(struct rbtdb_version)	link;
} rbtdb_version_t;

typedef ISC_LIST(rbtdb_version_t)	rbtdb_versionlist_t;
Bob Halley's avatar
Bob Halley committed
312
313

typedef struct {
314
	/* Unlocked. */
Bob Halley's avatar
Bob Halley committed
315
	dns_db_t			common;
316
317
318
#if DNS_RBTDB_USERWLOCK
	isc_rwlock_t			lock;
#else
Bob Halley's avatar
Bob Halley committed
319
	isc_mutex_t			lock;
320
#endif
Bob Halley's avatar
Bob Halley committed
321
322
	isc_rwlock_t			tree_lock;
	unsigned int			node_lock_count;
323
	rbtdb_nodelock_t *	       	node_locks;
324
	dns_rbtnode_t *			origin_node;
325
	/* Locked by lock. */
326
	unsigned int			active;
327
	isc_refcount_t			references;
328
329
330
331
332
333
334
	unsigned int			attributes;
	rbtdb_serial_t			current_serial;
	rbtdb_serial_t			least_serial;
	rbtdb_serial_t			next_serial;
	rbtdb_version_t *		current_version;
	rbtdb_version_t *		future_version;
	rbtdb_versionlist_t		open_versions;
335
	isc_boolean_t			overmem;
336
	isc_task_t *			task;
337
338
	dns_dbnode_t			*soanode;
	dns_dbnode_t			*nsnode;
339
	/* Locked by tree_lock. */
Bob Halley's avatar
Bob Halley committed
340
	dns_rbt_t *			tree;
341
	isc_boolean_t			secure;
342
343

	/* Unlocked */
344
	unsigned int			quantum;
Bob Halley's avatar
Bob Halley committed
345
346
} dns_rbtdb_t;

347
#define RBTDB_ATTR_LOADED		0x01
Bob Halley's avatar
Bob Halley committed
348
#define RBTDB_ATTR_LOADING		0x02
349

350
/*%
351
352
353
354
355
356
357
358
359
360
 * Search Context
 */
typedef struct {
	dns_rbtdb_t *		rbtdb;
	rbtdb_version_t *	rbtversion;
	rbtdb_serial_t		serial;
	unsigned int		options;
	dns_rbtnodechain_t	chain;
	isc_boolean_t		copy_name;
	isc_boolean_t		need_cleanup;
Bob Halley's avatar
Bob Halley committed
361
	isc_boolean_t		wild;
362
363
	dns_rbtnode_t *	       	zonecut;
	rdatasetheader_t *	zonecut_rdataset;
364
	rdatasetheader_t *	zonecut_sigrdataset;
365
	dns_fixedname_t		zonecut_name;
366
	isc_stdtime_t		now;
367
368
} rbtdb_search_t;

369
/*%
370
371
372
373
374
375
376
 * Load Context
 */
typedef struct {
	dns_rbtdb_t *		rbtdb;
	isc_stdtime_t		now;
} rbtdb_load_t;

Bob Halley's avatar
Bob Halley committed
377
static void rdataset_disassociate(dns_rdataset_t *rdataset);
378
379
static isc_result_t rdataset_first(dns_rdataset_t *rdataset);
static isc_result_t rdataset_next(dns_rdataset_t *rdataset);
Bob Halley's avatar
Bob Halley committed
380
static void rdataset_current(dns_rdataset_t *rdataset, dns_rdata_t *rdata);
Bob Halley's avatar
Bob Halley committed
381
static void rdataset_clone(dns_rdataset_t *source, dns_rdataset_t *target);
Bob Halley's avatar
Bob Halley committed
382
static unsigned int rdataset_count(dns_rdataset_t *rdataset);
383
384
385
386
static isc_result_t rdataset_getnoqname(dns_rdataset_t *rdataset,
				        dns_name_t *name,
					dns_rdataset_t *nsec,
					dns_rdataset_t *nsecsig);
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
static isc_result_t rdataset_getadditional(dns_rdataset_t *rdataset,
					   dns_rdatasetadditional_t type,
					   dns_rdatatype_t qtype,
					   dns_acache_t *acache,
					   dns_zone_t **zonep,
					   dns_db_t **dbp,
					   dns_dbversion_t **versionp,
					   dns_dbnode_t **nodep,
					   dns_name_t *fname,
					   dns_message_t *msg,
					   isc_stdtime_t now);
static isc_result_t rdataset_setadditional(dns_rdataset_t *rdataset,
					   dns_rdatasetadditional_t type,
					   dns_rdatatype_t qtype,
					   dns_acache_t *acache,
					   dns_zone_t *zone,
					   dns_db_t *db,
					   dns_dbversion_t *version,
					   dns_dbnode_t *node,
					   dns_name_t *fname);
static isc_result_t rdataset_putadditional(dns_acache_t *acache,
					   dns_rdataset_t *rdataset,
					   dns_rdatasetadditional_t type,
					   dns_rdatatype_t qtype);
411
412

static dns_rdatasetmethods_t rdataset_methods = {
Bob Halley's avatar
Bob Halley committed
413
414
415
	rdataset_disassociate,
	rdataset_first,
	rdataset_next,
Bob Halley's avatar
Bob Halley committed
416
	rdataset_current,
Bob Halley's avatar
Bob Halley committed
417
	rdataset_clone,
418
419
	rdataset_count,
	NULL,
420
421
422
423
	rdataset_getnoqname,
	rdataset_getadditional,
	rdataset_setadditional,
	rdataset_putadditional
Bob Halley's avatar
Bob Halley committed
424
425
426
};

static void rdatasetiter_destroy(dns_rdatasetiter_t **iteratorp);
427
428
static isc_result_t rdatasetiter_first(dns_rdatasetiter_t *iterator);
static isc_result_t rdatasetiter_next(dns_rdatasetiter_t *iterator);
Bob Halley's avatar
Bob Halley committed
429
430
431
432
433
434
435
436
static void rdatasetiter_current(dns_rdatasetiter_t *iterator,
				 dns_rdataset_t *rdataset);

static dns_rdatasetitermethods_t rdatasetiter_methods = {
	rdatasetiter_destroy,
	rdatasetiter_first,
	rdatasetiter_next,
	rdatasetiter_current
437
438
};

Bob Halley's avatar
Bob Halley committed
439
440
441
442
443
typedef struct rbtdb_rdatasetiter {
	dns_rdatasetiter_t		common;
	rdatasetheader_t *		current;
} rbtdb_rdatasetiter_t;

444
static void		dbiterator_destroy(dns_dbiterator_t **iteratorp);
445
446
447
static isc_result_t	dbiterator_first(dns_dbiterator_t *iterator);
static isc_result_t	dbiterator_last(dns_dbiterator_t *iterator);
static isc_result_t	dbiterator_seek(dns_dbiterator_t *iterator,
448
					dns_name_t *name);
449
450
451
static isc_result_t	dbiterator_prev(dns_dbiterator_t *iterator);
static isc_result_t	dbiterator_next(dns_dbiterator_t *iterator);
static isc_result_t	dbiterator_current(dns_dbiterator_t *iterator,
452
453
					   dns_dbnode_t **nodep,
					   dns_name_t *name);
454
455
static isc_result_t	dbiterator_pause(dns_dbiterator_t *iterator);
static isc_result_t	dbiterator_origin(dns_dbiterator_t *iterator,
456
457
458
459
460
					  dns_name_t *name);

static dns_dbiteratormethods_t dbiterator_methods = {
	dbiterator_destroy,
	dbiterator_first,
461
462
463
	dbiterator_last,
	dbiterator_seek,
	dbiterator_prev,
464
465
466
467
468
469
	dbiterator_next,
	dbiterator_current,
	dbiterator_pause,
	dbiterator_origin
};

470
471
#define DELETION_BATCH_MAX 64

472
/*
473
 * If 'paused' is ISC_TRUE, then the tree lock is not being held.
474
 */
475
476
477
478
typedef struct rbtdb_dbiterator {
	dns_dbiterator_t		common;
	isc_boolean_t			paused;
	isc_boolean_t			new_origin;
479
	isc_rwlocktype_t		tree_locked;
480
	isc_result_t			result;
481
482
	dns_fixedname_t			name;
	dns_fixedname_t			origin;
Bob Halley's avatar
Bob Halley committed
483
	dns_rbtnodechain_t		chain;
484
485
486
	dns_rbtnode_t			*node;
	dns_rbtnode_t			*deletions[DELETION_BATCH_MAX];
	int				delete;
487
488
489
} rbtdb_dbiterator_t;


490
491
#define IS_STUB(rbtdb)  (((rbtdb)->common.attributes & DNS_DBATTR_STUB)  != 0)
#define IS_CACHE(rbtdb) (((rbtdb)->common.attributes & DNS_DBATTR_CACHE) != 0)
492

493
494
495
static void free_rbtdb(dns_rbtdb_t *rbtdb, isc_boolean_t log,
		       isc_event_t *event);

496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
/*
 * Locking
 *
 * If a routine is going to lock more than one lock in this module, then
 * the locking must be done in the following order:
 *
 *	Tree Lock
 *
 *	Node Lock	(Only one from the set may be locked at one time by
 *			 any caller)
 *
 *	Database Lock
 *
 * Failure to follow this hierarchy can result in deadlock.
 */

512
513
514
515
516
517
518
519
520
521
522
/*
 * Deleting Nodes
 *
 * Currently there is no deletion of nodes from the database, except when
 * the database is being destroyed.
 *
 * If node deletion is added in the future, then for zone databases the node
 * for the origin of the zone MUST NOT be deleted.
 */


523
524
525
526
/*
 * DB Routines
 */

Bob Halley's avatar
Bob Halley committed
527
528
529
530
531
532
static void
attach(dns_db_t *source, dns_db_t **targetp) {
	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)source;

	REQUIRE(VALID_RBTDB(rbtdb));

533
	isc_refcount_increment(&rbtdb->references, NULL);
Bob Halley's avatar
Bob Halley committed
534
535
536
537
538

	*targetp = source;
}

static void
539
540
541
542
543
544
545
546
free_rbtdb_callback(isc_task_t *task, isc_event_t *event) {
	dns_rbtdb_t *rbtdb = event->ev_arg;

	UNUSED(task);

	free_rbtdb(rbtdb, ISC_TRUE, event);
}

547
548
549
550
551
552
/*%
 * Work out how many nodes can be deleted in the time between two
 * requests to the nameserver.  Smooth the resulting number and use it
 * as a estimate for the number of nodes to be deleted in the next
 * iteration.
 */
553
554
static unsigned int
adjust_quantum(unsigned int old, isc_time_t *start) {
555
	unsigned int pps = dns_pps;	/* packets per second */
556
557
558
	unsigned int interval;
	isc_uint64_t usecs;
	isc_time_t end;
559
	unsigned int new;
560
561
562
563
564

	if (pps < 100)
		pps = 100;
	isc_time_now(&end);

565
	interval = 1000000 / pps;	/* interval in usec */
566
567
568
569
	if (interval == 0)
		interval = 1;
	usecs = isc_time_microdiff(&end, start);
	if (usecs == 0) {
570
571
572
573
		/*
		 * We were unable to measure the amount of time taken.
		 * Double the nodes deleted next time.
		 */
574
575
576
577
578
		old *= 2;
		if (old > 1000)
			old = 1000;
		return (old);
	}
579
580
581
582
583
584
585
586
587
588
589
	new = old * interval;
	new /= (unsigned int)usecs;
	if (new == 0)
		new = 1;
	else if (new > 1000)
		new = 1000;

	/* Smooth */
	new = (new + old * 3) / 4;
	
	isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE, DNS_LOGMODULE_CACHE,
590
		      ISC_LOG_DEBUG(1), "adjust_quantum -> %d", new);
591
592

	return (new);
593
594
}
		
595
596
static void
free_rbtdb(dns_rbtdb_t *rbtdb, isc_boolean_t log, isc_event_t *event) {
Bob Halley's avatar
Bob Halley committed
597
	unsigned int i;
598
	isc_ondestroy_t ondest;
599
600
	isc_result_t result;
	char buf[DNS_NAME_FORMATSIZE];
601
	isc_time_t start;
Bob Halley's avatar
Bob Halley committed
602

603
	REQUIRE(rbtdb->current_version != NULL || EMPTY(rbtdb->open_versions));
604
	REQUIRE(rbtdb->future_version == NULL);
605

606
607
608
609
610
611
612
613
	if (rbtdb->current_version != NULL) {
		unsigned int refs;

		isc_refcount_decrement(&rbtdb->current_version->references,
				       &refs);
		INSIST(refs == 0);
		UNLINK(rbtdb->open_versions, rbtdb->current_version, link);
		isc_refcount_destroy(&rbtdb->current_version->references);
614
		isc_mem_put(rbtdb->common.mctx, rbtdb->current_version,
Andreas Gustafsson's avatar
Andreas Gustafsson committed
615
			    sizeof(rbtdb_version_t));
616
	}
617
618
	if (event == NULL)
		rbtdb->quantum = (rbtdb->task != NULL) ? 100 : 0;
619
620
 again:
	if (rbtdb->tree != NULL) {
621
622
		isc_time_now(&start);
		result = dns_rbt_destroy2(&rbtdb->tree, rbtdb->quantum);
623
624
		if (result == ISC_R_QUOTA) {
			INSIST(rbtdb->task != NULL);
625
626
627
			if (rbtdb->quantum != 0)
				rbtdb->quantum = adjust_quantum(rbtdb->quantum,
								&start);
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
			if (event == NULL)
				event = isc_event_allocate(rbtdb->common.mctx,
							   NULL,
						         DNS_EVENT_FREESTORAGE,
							   free_rbtdb_callback,
							   rbtdb,
							   sizeof(isc_event_t));
			if (event == NULL)
				goto again;
			isc_task_send(rbtdb->task, &event);
			return;
		}
		INSIST(result == ISC_R_SUCCESS && rbtdb->tree == NULL);
	}
	if (event != NULL)
		isc_event_free(&event);
	if (log) {
		if (dns_name_dynamic(&rbtdb->common.origin))
			dns_name_format(&rbtdb->common.origin, buf,
					sizeof(buf));
		else
			strcpy(buf, "<UNKNOWN>");
		isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
			      DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
			      "done free_rbtdb(%s)", buf);
	}
654
655
	if (dns_name_dynamic(&rbtdb->common.origin))
		dns_name_free(&rbtdb->common.origin, rbtdb->common.mctx);
656
657
658
659
	for (i = 0; i < rbtdb->node_lock_count; i++) {
		isc_refcount_destroy(&rbtdb->node_locks[i].references);
		NODE_DESTROYLOCK(&rbtdb->node_locks[i].lock);
	}
Bob Halley's avatar
Bob Halley committed
660
	isc_mem_put(rbtdb->common.mctx, rbtdb->node_locks,
Andreas Gustafsson's avatar
Andreas Gustafsson committed
661
		    rbtdb->node_lock_count * sizeof(rbtdb_nodelock_t));
Bob Halley's avatar
Bob Halley committed
662
	isc_rwlock_destroy(&rbtdb->tree_lock);
663
	isc_refcount_destroy(&rbtdb->references);
664
665
	if (rbtdb->task != NULL)
		isc_task_detach(&rbtdb->task);
666
	RBTDB_DESTROYLOCK(&rbtdb->lock);
Bob Halley's avatar
Bob Halley committed
667
668
	rbtdb->common.magic = 0;
	rbtdb->common.impmagic = 0;
669
	ondest = rbtdb->common.ondest;
670
	isc_mem_putanddetach(&rbtdb->common.mctx, rbtdb, sizeof(*rbtdb));
671
	isc_ondestroy_notify(&ondest, rbtdb);
Bob Halley's avatar
Bob Halley committed
672
673
}

674
static inline void
675
maybe_free_rbtdb(dns_rbtdb_t *rbtdb) {
676
	isc_boolean_t want_free = ISC_FALSE;
Bob Halley's avatar
Bob Halley committed
677
	unsigned int i;
678
	unsigned int inactive = 0;
679

Bob Halley's avatar
Bob Halley committed
680
681
	/* XXX check for open versions here */

682
683
684
685
686
	if (rbtdb->soanode != NULL)
		dns_db_detachnode((dns_db_t *)rbtdb, &rbtdb->soanode);
	if (rbtdb->nsnode != NULL)
		dns_db_detachnode((dns_db_t *)rbtdb, &rbtdb->nsnode);

Bob Halley's avatar
Bob Halley committed
687
688
689
690
691
	/*
	 * Even though there are no external direct references, there still
	 * may be nodes in use.
	 */
	for (i = 0; i < rbtdb->node_lock_count; i++) {
692
		NODE_LOCK(&rbtdb->node_locks[i].lock, isc_rwlocktype_write);
693
		rbtdb->node_locks[i].exiting = ISC_TRUE;
694
695
696
		NODE_UNLOCK(&rbtdb->node_locks[i].lock, isc_rwlocktype_write);
		if (isc_refcount_current(&rbtdb->node_locks[i].references)
		    == 0) {
697
			inactive++;
698
		}
Bob Halley's avatar
Bob Halley committed
699
700
	}

701
	if (inactive != 0) {
702
		RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
703
704
705
		rbtdb->active -= inactive;
		if (rbtdb->active == 0)
			want_free = ISC_TRUE;
706
		RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
707
708
709
710
711
712
713
714
715
716
717
718
		if (want_free) {
			char buf[DNS_NAME_FORMATSIZE];
			if (dns_name_dynamic(&rbtdb->common.origin))
				dns_name_format(&rbtdb->common.origin, buf,
						sizeof(buf));
			else
				strcpy(buf, "<UNKNOWN>");
			isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
				      DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
				      "calling free_rbtdb(%s)", buf);
			free_rbtdb(rbtdb, ISC_TRUE, NULL);
		}
719
	}
Bob Halley's avatar
Bob Halley committed
720
721
722
723
724
}

static void
detach(dns_db_t **dbp) {
	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)(*dbp);
725
	unsigned int refs;
Bob Halley's avatar
Bob Halley committed
726
727
728

	REQUIRE(VALID_RBTDB(rbtdb));

729
	isc_refcount_decrement(&rbtdb->references, &refs);
Bob Halley's avatar
Bob Halley committed
730

731
	if (refs == 0)
732
		maybe_free_rbtdb(rbtdb);
Bob Halley's avatar
Bob Halley committed
733

Bob Halley's avatar
Bob Halley committed
734
	*dbp = NULL;
Bob Halley's avatar
Bob Halley committed
735
736
737
738
739
}

static void
currentversion(dns_db_t *db, dns_dbversion_t **versionp) {
	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
740
	rbtdb_version_t *version;
741
	unsigned int refs;
Bob Halley's avatar
Bob Halley committed
742
743
744

	REQUIRE(VALID_RBTDB(rbtdb));

745
	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
746
	version = rbtdb->current_version;
747
	isc_refcount_increment(&version->references, &refs);
748
	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read);
749
750
751
752
753

	*versionp = (dns_dbversion_t *)version;
}

static inline rbtdb_version_t *
754
allocate_version(isc_mem_t *mctx, rbtdb_serial_t serial,
755
756
		 unsigned int references, isc_boolean_t writer)
{
757
	isc_result_t result;
758
759
	rbtdb_version_t *version;

Andreas Gustafsson's avatar
Andreas Gustafsson committed
760
	version = isc_mem_get(mctx, sizeof(*version));
761
762
763
	if (version == NULL)
		return (NULL);
	version->serial = serial;
764
765
766
767
768
	result = isc_refcount_init(&version->references, references);
	if (result != ISC_R_SUCCESS) {
		isc_mem_put(mctx, version, sizeof(*version));
		return (NULL);
	}
769
770
771
772
	version->writer = writer;
	version->commit_ok = ISC_FALSE;
	ISC_LIST_INIT(version->changed_list);
	ISC_LINK_INIT(version, link);
773

774
	return (version);
Bob Halley's avatar
Bob Halley committed
775
776
}

777
static isc_result_t
Bob Halley's avatar
Bob Halley committed
778
779
newversion(dns_db_t *db, dns_dbversion_t **versionp) {
	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
780
	rbtdb_version_t *version;
Bob Halley's avatar
Bob Halley committed
781

Bob Halley's avatar
Bob Halley committed
782
	REQUIRE(VALID_RBTDB(rbtdb));
783
784
	REQUIRE(versionp != NULL && *versionp == NULL);
	REQUIRE(rbtdb->future_version == NULL);
Bob Halley's avatar
Bob Halley committed
785

786
	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
787
788
789
790
	RUNTIME_CHECK(rbtdb->next_serial != 0);		/* XXX Error? */
	version = allocate_version(rbtdb->common.mctx, rbtdb->next_serial, 1,
				   ISC_TRUE);
	if (version != NULL) {
791
		version->commit_ok = ISC_TRUE;
792
793
794
		rbtdb->next_serial++;
		rbtdb->future_version = version;
	}
795
	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
796
797

	if (version == NULL)
798
		return (ISC_R_NOMEMORY);
799

800
801
	*versionp = version;

802
	return (ISC_R_SUCCESS);
803
804
}

805
806
807
808
809
810
static void
attachversion(dns_db_t *db, dns_dbversion_t *source,
	      dns_dbversion_t **targetp)
{
	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
	rbtdb_version_t *rbtversion = source;
811
	unsigned int refs;
812
813
814

	REQUIRE(VALID_RBTDB(rbtdb));

815
816
	isc_refcount_increment(&rbtversion->references, &refs);
	INSIST(refs > 1);
817
818
819
820

	*targetp = rbtversion;
}

821
822
823
824
825
static rbtdb_changed_t *
add_changed(dns_rbtdb_t *rbtdb, rbtdb_version_t *version,
	    dns_rbtnode_t *node)
{
	rbtdb_changed_t *changed;
826
	unsigned int refs;
827
828

	/*
829
830
	 * Caller must be holding the node lock if its reference must be
	 * protected by the lock.
831
832
	 */

Andreas Gustafsson's avatar
Andreas Gustafsson committed
833
	changed = isc_mem_get(rbtdb->common.mctx, sizeof(*changed));
Bob Halley's avatar
Bob Halley committed
834

835
	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
Bob Halley's avatar
Bob Halley committed
836
837
838
839

	REQUIRE(version->writer);

	if (changed != NULL) {
840
841
		dns_rbtnode_refincrement0(node, &refs);
		INSIST(refs > 0);
Bob Halley's avatar
Bob Halley committed
842
843
		changed->node = node;
		changed->dirty = ISC_FALSE;
844
		ISC_LIST_INITANDAPPEND(version->changed_list, changed, link);
Bob Halley's avatar
Bob Halley committed
845
	} else
846
		version->commit_ok = ISC_FALSE;
Bob Halley's avatar
Bob Halley committed
847

848
	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
849
850
851
852

	return (changed);
}

853
854
855
856
857
858
static void
free_acachearray(isc_mem_t *mctx, rdatasetheader_t *header,
		 acachectl_t *array)
{
	unsigned int count;
	unsigned int i;
859
	unsigned char *raw;	/* RDATASLAB */
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881

	/*
	 * The caller must be holding the corresponding node lock.
	 */

	if (array == NULL)
		return;

	raw = (unsigned char *)header + sizeof(*header);
	count = raw[0] * 256 + raw[1];

	/*
	 * Sanity check: since an additional cache entry has a reference to
	 * the original DB node (in the callback arg), there should be no
	 * acache entries when the node can be freed. 
	 */
	for (i = 0; i < count; i++)
		INSIST(array[i].entry == NULL && array[i].cbarg == NULL);

	isc_mem_put(mctx, array, count * sizeof(acachectl_t));
}

882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
static inline void
free_noqname(isc_mem_t *mctx, struct noqname **noqname) {

	if (dns_name_dynamic(&(*noqname)->name))
		dns_name_free(&(*noqname)->name, mctx);
	if ((*noqname)->nsec != NULL)
		isc_mem_put(mctx, (*noqname)->nsec,
			    dns_rdataslab_size((*noqname)->nsec, 0));
	if ((*noqname)->nsec != NULL)
		isc_mem_put(mctx, (*noqname)->nsecsig,
			    dns_rdataslab_size((*noqname)->nsecsig, 0));
	isc_mem_put(mctx, *noqname, sizeof(**noqname));
	*noqname = NULL;
}

897
898
899
900
static inline void
free_rdataset(isc_mem_t *mctx, rdatasetheader_t *rdataset) {
	unsigned int size;

901
902
	if (rdataset->noqname != NULL)
		free_noqname(mctx, &rdataset->noqname);
903
904
905
906

	free_acachearray(mctx, rdataset, rdataset->additional_auth);
	free_acachearray(mctx, rdataset, rdataset->additional_glue);

Bob Halley's avatar
Bob Halley committed
907
	if ((rdataset->attributes & RDATASET_ATTR_NONEXISTENT) != 0)
Andreas Gustafsson's avatar
Andreas Gustafsson committed
908
		size = sizeof(*rdataset);
Bob Halley's avatar
Bob Halley committed
909
910
	else
		size = dns_rdataslab_size((unsigned char *)rdataset,
Andreas Gustafsson's avatar
Andreas Gustafsson committed
911
					  sizeof(*rdataset));
912
913
914
	isc_mem_put(mctx, rdataset, size);
}

915
static inline void
916
917
rollback_node(dns_rbtnode_t *node, rbtdb_serial_t serial) {
	rdatasetheader_t *header, *dcurrent;
918
	isc_boolean_t make_dirty = ISC_FALSE;
919
920
921
922
923

	/*
	 * Caller must hold the node lock.
	 */

924
925
926
927
928
929
	/*
	 * We set the IGNORE attribute on rdatasets with serial number
	 * 'serial'.  When the reference count goes to zero, these rdatasets
	 * will be cleaned up; until that time, they will be ignored.
	 */
	for (header = node->data; header != NULL; header = header->next) {
930
		if (header->serial == serial) {
931
			header->attributes |= RDATASET_ATTR_IGNORE;
932
933
			make_dirty = ISC_TRUE;
		}
934
935
936
		for (dcurrent = header->down;
		     dcurrent != NULL;
		     dcurrent = dcurrent->down) {
937
			if (dcurrent->serial == serial) {
938
				dcurrent->attributes |= RDATASET_ATTR_IGNORE;
939
940
				make_dirty = ISC_TRUE;
			}
941
942
		}
	}
943
944
	if (make_dirty)
		node->dirty = 1;
945
946
}

947
948
949
950
951
952
953
954
955
956
957
static inline void
clean_stale_headers(isc_mem_t *mctx, rdatasetheader_t *top) {
	rdatasetheader_t *d, *down_next;

	for (d = top->down; d != NULL; d = down_next) {
		down_next = d->down;
		free_rdataset(mctx, d);
	}
	top->down = NULL;
}

958
959
static inline void
clean_cache_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node) {
960
	rdatasetheader_t *current, *top_prev, *top_next;
Mark Andrews's avatar
update    
Mark Andrews committed
961
	isc_mem_t *mctx = rbtdb->common.mctx;
962
963
964
965
966

	/*
	 * Caller must be holding the node lock.
	 */

967
	top_prev = NULL;
968
969
	for (current = node->data; current != NULL; current = top_next) {
		top_next = current->next;
970
		clean_stale_headers(mctx, current);
971
972
973
974
975
		/*
		 * If current is nonexistent or stale, we can clean it up.
		 */
		if ((current->attributes &
		     (RDATASET_ATTR_NONEXISTENT|RDATASET_ATTR_STALE)) != 0) {
976
977
			if (top_prev != NULL)
				top_prev->next = current->next;
978
979
980
			else
				node->data = current->next;
			free_rdataset(mctx, current);
981
982
		} else
			top_prev = current;
983
	}
984
	node->dirty = 0;
985
986
987
988
989
990
}

static inline void
clean_zone_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
		rbtdb_serial_t least_serial)
{
991
992
	rdatasetheader_t *current, *dcurrent, *down_next, *dparent;
	rdatasetheader_t *top_prev, *top_next;
993
994
995
996
997
998
999
1000
	isc_mem_t *mctx = rbtdb->common.mctx;
	isc_boolean_t still_dirty = ISC_FALSE;

	/*
	 * Caller must be holding the node lock.
	 */
	REQUIRE(least_serial != 0);

1001
	top_prev = NULL;
1002
1003
	for (current = node->data; current != NULL; current = top_next) {
		top_next = current->next;
1004

1005
		/*
1006
1007
1008
		 * First, we clean up any instances of multiple rdatasets
		 * with the same serial number, or that have the IGNORE
		 * attribute.
1009
1010
1011
1012
		 */
		dparent = current;
		for (dcurrent = current->down;
		     dcurrent != NULL;
1013
1014
1015
		     dcurrent = down_next) {
			down_next = dcurrent->down;
			INSIST(dcurrent->serial <= dparent->serial);
1016
1017
			if (dcurrent->serial == dparent->serial ||
			    IGNORE(dcurrent)) {
1018
1019
1020
				if (down_next != NULL)
					down_next->next = dparent;
				dparent->down = down_next;
1021
1022
1023
				free_rdataset(mctx, dcurrent);
			} else
				dparent = dcurrent;
1024
		}
1025

1026
		/*
1027
		 * We've now eliminated all IGNORE datasets with the possible
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
		 * exception of current, which we now check.
		 */
		if (IGNORE(current)) {
			down_next = current->down;
			if (down_next == NULL) {
				if (top_prev != NULL)
					top_prev->next = current->next;
				else
					node->data = current->next;
				free_rdataset(mctx, current);
				/*
				 * current no longer exists, so we can
				 * just continue with the loop.
				 */
				continue;
			} else {
				/*
				 * Pull up current->down, making it the new
				 * current.
				 */
				if (top_prev != NULL)
					top_prev->next = down_next;
				else
					node->data = down_next;
				down_next->next = top_next;
				free_rdataset(mctx, current);
				current = down_next;
			}
		}

1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
		/*
		 * We now try to find the first down node less than the
		 * least serial.
		 */
		dparent = current;
		for (dcurrent = current->down;
		     dcurrent != NULL;
		     dcurrent = down_next) {
			down_next = dcurrent->down;
			if (dcurrent->serial < least_serial)
				break;
			dparent = dcurrent;
		}

		/*
		 * If there is a such an rdataset, delete it and any older
		 * versions.
		 */
		if (dcurrent != NULL) {
			do {
				down_next = dcurrent->down;
				INSIST(dcurrent->serial <= least_serial);
				free_rdataset(mctx, dcurrent);
				dcurrent = down_next;
			} while (dcurrent != NULL);
			dparent->down = NULL;
		}

1086
1087
1088
		/*
		 * Note.  The serial number of 'current' might be less than
		 * least_serial too, but we cannot delete it because it is
Bob Halley's avatar
Bob Halley committed
1089
		 * the most recent version, unless it is a NONEXISTENT
1090
		 * rdataset.
1091
		 */
1092
		if (current->down != NULL) {
1093
			still_dirty = ISC_TRUE;
1094
1095
			top_prev = current;
		} else {
Bob Halley's avatar
Bob Halley committed
1096
1097
1098
			/*
			 * If this is a NONEXISTENT rdataset, we can delete it.
			 */
1099
			if (NONEXISTENT(current)) {
1100
1101
				if (top_prev != NULL)
					top_prev->next = current->next;
Bob Halley's avatar
Bob Halley committed
1102
1103
1104
				else
					node->data = current->next;
				free_rdataset(mctx, current);
1105
1106
			} else
				top_prev = current;
Bob Halley's avatar
Bob Halley committed
1107
		}
1108
1109
1110
1111
1112
	}
	if (!still_dirty)
		node->dirty = 0;
}

1113
1114
1115
1116
/*
 * Caller must be holding the node lock if its reference must be protected
 * by the lock.
 */
1117
1118
static inline void
new_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node) {
1119
1120
1121
1122
1123
1124
1125
1126
	unsigned int lockrefs, noderefs;
	isc_refcount_t *lockref;

	dns_rbtnode_refincrement0(node, &noderefs);
	if (noderefs == 1) {	/* this is the first reference to the node */
		lockref = &rbtdb->node_locks[node->locknum].references;
		isc_refcount_increment0(lockref, &lockrefs);
		INSIST(lockrefs != 0);
1127
	}
1128
	INSIST(noderefs != 0);
1129
1130
}

1131
1132
1133
1134
1135

/*
 * Caller must be holding the node lock if its reference must be protected
 * by the lock.
 */
1136
static void
1137
no_references(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
1138
	      rbtdb_serial_t least_serial, isc_rwlocktype_t lock)
1139
{
1140
1141
	isc_result_t result;
	isc_boolean_t write_locked;
1142
	unsigned int locknum;
1143
	unsigned int refs;
1144

1145
	/*
1146
1147
1148
	 * We cannot request the node reference be 0 at the moment, since
	 * the reference counter can atomically be modified without a lock.
	 * It should still be safe unless we actually try to delete the node,
1149
	 * at which point the condition is explicitly checked.
1150
	 */
1151

1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
	locknum = node->locknum;

	NODE_WEAKLOCK(&rbtdb->node_locks[locknum].lock, isc_rwlocktype_read);
	if (!node->dirty && (node->data != NULL || node->down != NULL)) {
		/* easy and typical case first, in an efficient way. */
		isc_refcount_decrement(&rbtdb->node_locks[locknum].references,
				       &refs);
		INSIST((int)refs >= 0);
		NODE_WEAKUNLOCK(&rbtdb->node_locks[locknum].lock,
				isc_rwlocktype_read);
		return;
	}
	NODE_WEAKUNLOCK(&rbtdb->node_locks[locknum].lock, isc_rwlocktype_read);
1165

1166
	NODE_WEAKLOCK(&rbtdb->node_locks[locknum].lock, isc_rwlocktype_write);
1167
	if (node->dirty && dns_rbtnode_refcurrent(node) == 0) {
1168
		if (IS_CACHE(rbtdb))
1169
1170
1171
1172
1173
1174
1175
			clean_cache_node(rbtdb, node);
		else {
			if (least_serial == 0) {
				/*
				 * Caller doesn't know the least serial.
				 * Get it.
				 */
1176
				RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
1177
				least_serial = rbtdb->least_serial;
1178
1179
				RBTDB_UNLOCK(&rbtdb->lock,
					     isc_rwlocktype_read);
1180
1181
			}
			clean_zone_node(rbtdb, node, least_serial);
1182
1183
1184
		}
	}

1185
1186
	isc_refcount_decrement(&rbtdb->node_locks[locknum].references, &refs);
	INSIST((int)refs >= 0);
1187
1188
1189
1190

	/*
	 * XXXDCL should this only be done for cache zones?
	 */
1191
1192
1193
	if (node->data != NULL || node->down != NULL) {
		NODE_WEAKUNLOCK(&rbtdb->node_locks[locknum].lock,
				isc_rwlocktype_write);
1194
		return;
1195
	}
1196
1197
1198
1199
1200

	/*
	 * XXXDCL need to add a deferred delete method for ISC_R_LOCKBUSY.
	 */
	if (lock != isc_rwlocktype_write) {
1201
		/*
1202
1203
1204
		 * Locking hierarchy notwithstanding, we don't need to free
		 * the node lock before acquiring the tree write lock because
		 * we only do a trylock.
1205
		 */
1206
		if (lock == isc_rwlocktype_read)
1207
1208
1209
1210
			result = isc_rwlock_tryupgrade(&rbtdb->tree_lock);
		else
			result = isc_rwlock_trylock(&rbtdb->tree_lock,
						    isc_rwlocktype_write);
1211
1212
1213
1214
1215
1216
1217
		RUNTIME_CHECK(result == ISC_R_SUCCESS ||
			      result == ISC_R_LOCKBUSY);
 
		write_locked = ISC_TF(result == ISC_R_SUCCESS);
	} else
		write_locked = ISC_TRUE;

1218
	if (write_locked && dns_rbtnode_refcurrent(node) == 0) {
1219
		/*
1220
1221
1222
1223
		 * We can now delete the node if the reference counter must be
		 * zero.  This should be typically the case, but a different
		 * thread may still gain a (new) reference just before the
		 * current thread locks the tree (e.g., in findnode()).
1224
1225
		 */

1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
		if (isc_log_wouldlog(dns_lctx, ISC_LOG_DEBUG(1))) {
			char printname[DNS_NAME_FORMATSIZE];

			isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
				      DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
				      "no_references: delete from rbt: %p %s",
				      node,
				      dns_rbt_formatnodename(node, printname,
							   sizeof(printname)));
		}

		result = dns_rbt_deletenode(rbtdb->tree, node, ISC_FALSE);
		if (result != ISC_R_SUCCESS)
			isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
				      DNS_LOGMODULE_CACHE, ISC_LOG_WARNING,
				      "no_references: dns_rbt_deletenode: %s",
				      isc_result_totext(result));
	}

1245
1246
1247
	NODE_WEAKUNLOCK(&rbtdb->node_locks[locknum].lock,
			isc_rwlocktype_write);

1248
1249
1250
	/*
	 * Relock a read lock, or unlock the write lock if no lock was held.
	 */
1251
	if (lock == isc_rwlocktype_none)
1252
1253
1254
1255
		if (write_locked)
			RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);

	if (lock == isc_rwlocktype_read)
1256
1257
		if (write_locked)
			isc_rwlock_downgrade(&rbtdb->tree_lock);
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
}

static inline void
make_least_version(dns_rbtdb_t *rbtdb, rbtdb_version_t *version,
		   rbtdb_changedlist_t *cleanup_list)
{
	/*
	 * Caller must be holding the database lock.
	 */

	rbtdb->least_serial = version->serial;
	*cleanup_list = version->changed_list;
1270
	ISC_LIST_INIT(version->changed_list);
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
}

static inline void
cleanup_nondirty(rbtdb_version_t *version, rbtdb_changedlist_t *cleanup_list) {
	rbtdb_changed_t *changed, *next_changed;

	/*
	 * If the changed record is dirty, then
	 * an update created multiple versions of
	 * a given rdataset.  We keep this list
	 * until we're the least open version, at
	 * which point it's safe to get rid of any
	 * older versions.
	 *
	 * If the changed record isn't dirty, then
	 * we don't need it anymore since we're
	 * committing and not rolling back.
Bob Halley's avatar
Bob Halley committed
1288
1289
	 *
	 * The caller must be holding the database lock.