Commit e24bc324 authored by Ondřej Surý's avatar Ondřej Surý

Fix the rbt hashtable and grow it when setting max-cache-size

There were several problems with rbt hashtable implementation:

1. Our internal hashing function returns uint64_t value, but it was
   silently truncated to unsigned int in dns_name_hash() and
   dns_name_fullhash() functions.  As the SipHash 2-4 higher bits are
   more random, we need to use the upper half of the return value.

2. The hashtable implementation in rbt.c was using modulo to pick the
   slot number for the hash table.  This has several problems because
   modulo is: a) slow, b) oblivious to patterns in the input data.  This
   could lead to very uneven distribution of the hashed data in the
   hashtable.  Combined with the single-linked lists we use, it could
   really hog-down the lookup and removal of the nodes from the rbt
   tree[a].  The Fibonacci Hashing is much better fit for the hashtable
   function here.  For longer description, read "Fibonacci Hashing: The
   Optimization that the World Forgot"[b] or just look at the Linux
   kernel.  Also this will make Diego very happy :).

3. The hashtable would rehash every time the number of nodes in the rbt
   tree would exceed 3 * (hashtable size).  The overcommit will make the
   uneven distribution in the hashtable even worse, but the main problem
   lies in the rehashing - every time the database grows beyond the
   limit, each subsequent rehashing will be much slower.  The mitigation
   here is letting the rbt know how big the cache can grown and
   pre-allocate the hashtable to be big enough to actually never need to
   rehash.  This will consume more memory at the start, but since the
   size of the hashtable is capped to `1 << 32` (e.g. 4 mio entries), it
   will only consume maximum of 32GB of memory for hashtable in the
   worst case (and max-cache-size would need to be set to more than
   4TB).  Calling the dns_db_adjusthashsize() will also cap the maximum
   size of the hashtable to the pre-computed number of bits, so it won't
   try to consume more gigabytes of memory than available for the
   database.

   FIXME: What is the average size of the rbt node that gets hashed?  I
   chose the pagesize (4k) as initial value to precompute the size of
   the hashtable, but the value is based on feeling and not any real
   data.

For future work, there are more places where we use result of the hash
value modulo some small number and that would benefit from Fibonacci
Hashing to get better distribution.

Notes:
a. A doubly linked list should be used here to speedup the removal of
   the entries from the hashtable.
b. https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
parent 9dcf2296
......@@ -589,7 +589,8 @@ static dns_dbmethods_t sampledb_methods = {
NULL, /* getsize */
NULL, /* setservestalettl */
NULL, /* getservestalettl */
NULL /* setgluecachestats */
NULL, /* setgluecachestats */
NULL /* adjusthashsize */
};
/* Auxiliary driver functions. */
......
......@@ -756,7 +756,8 @@ grow_names(isc_task_t *task, isc_event_t *ev) {
isc_mutex_t *newnamelocks = NULL;
isc_result_t result;
unsigned int *newname_refcnt = NULL;
unsigned int i, n, bucket;
unsigned int i, n;
unsigned int bucket;
adb = ev->ev_arg;
INSIST(DNS_ADB_VALID(adb));
......@@ -4838,7 +4839,7 @@ void
dns_adb_flushname(dns_adb_t *adb, const dns_name_t *name) {
dns_adbname_t *adbname;
dns_adbname_t *nextname;
int bucket;
unsigned int bucket;
REQUIRE(DNS_ADB_VALID(adb));
REQUIRE(name != NULL);
......
......@@ -285,7 +285,8 @@ dns_badcache_find(dns_badcache_t *bc, const dns_name_t *name,
dns_rdatatype_t type, uint32_t *flagp, isc_time_t *now) {
dns_bcentry_t *bad, *prev, *next;
bool answer = false;
unsigned int i, hash;
unsigned int i;
unsigned int hash;
REQUIRE(VALID_BADCACHE(bc));
REQUIRE(name != NULL);
......@@ -385,7 +386,7 @@ dns_badcache_flushname(dns_badcache_t *bc, const dns_name_t *name) {
dns_bcentry_t *bad, *prev, *next;
isc_result_t result;
isc_time_t now;
unsigned int i;
unsigned int hash;
REQUIRE(VALID_BADCACHE(bc));
REQUIRE(name != NULL);
......@@ -396,16 +397,16 @@ dns_badcache_flushname(dns_badcache_t *bc, const dns_name_t *name) {
if (result != ISC_R_SUCCESS) {
isc_time_settoepoch(&now);
}
i = dns_name_hash(name, false) % bc->size;
LOCK(&bc->tlocks[i]);
hash = dns_name_hash(name, false) % bc->size;
LOCK(&bc->tlocks[hash]);
prev = NULL;
for (bad = bc->table[i]; bad != NULL; bad = next) {
for (bad = bc->table[hash]; bad != NULL; bad = next) {
int n;
next = bad->next;
n = isc_time_compare(&bad->expire, &now);
if (n < 0 || dns_name_equal(name, &bad->name)) {
if (prev == NULL) {
bc->table[i] = bad->next;
bc->table[hash] = bad->next;
} else {
prev->next = bad->next;
}
......@@ -417,7 +418,7 @@ dns_badcache_flushname(dns_badcache_t *bc, const dns_name_t *name) {
prev = bad;
}
}
UNLOCK(&bc->tlocks[i]);
UNLOCK(&bc->tlocks[hash]);
RWUNLOCK(&bc->lock, isc_rwlocktype_read);
}
......
......@@ -956,6 +956,7 @@ dns_cache_setcachesize(dns_cache_t *cache, size_t size) {
* time, or replacing other limits).
*/
isc_mem_setwater(cache->mctx, water, cache, hiwater, lowater);
dns_db_adjusthashsize(cache->db, size);
}
}
......
......@@ -831,6 +831,17 @@ dns_db_hashsize(dns_db_t *db) {
return ((db->methods->hashsize)(db));
}
isc_result_t
dns_db_adjusthashsize(dns_db_t *db, size_t size) {
REQUIRE(DNS_DB_VALID(db));
if (db->methods->adjusthashsize != NULL) {
return ((db->methods->adjusthashsize)(db, size));
}
return (ISC_R_NOTIMPLEMENTED);
}
void
dns_db_settask(dns_db_t *db, isc_task_t *task) {
REQUIRE(DNS_DB_VALID(db));
......
......@@ -967,7 +967,8 @@ static dns_dbmethods_t rpsdb_db_methods = {
NULL, /* getsize */
NULL, /* setservestalettl */
NULL, /* getservestalettl */
NULL /* setgluecachestats */
NULL, /* setgluecachestats */
NULL /* adjusthashsize */
};
static dns_rdatasetmethods_t rpsdb_rdataset_methods = {
......
......@@ -179,6 +179,7 @@ typedef struct dns_dbmethods {
isc_result_t (*setservestalettl)(dns_db_t *db, dns_ttl_t ttl);
isc_result_t (*getservestalettl)(dns_db_t *db, dns_ttl_t *ttl);
isc_result_t (*setgluecachestats)(dns_db_t *db, isc_stats_t *stats);
isc_result_t (*adjusthashsize)(dns_db_t *db, size_t size);
} dns_dbmethods_t;
typedef isc_result_t (*dns_dbcreatefunc_t)(isc_mem_t * mctx,
......@@ -1363,6 +1364,23 @@ dns_db_hashsize(dns_db_t *db);
* 0 if not implemented.
*/
isc_result_t
dns_db_adjusthashsize(dns_db_t *db, size_t size);
/*%<
* For database implementations using a hash table, adjust
* the size of the hash table to store objects with size
* memory footprint.
*
* Requires:
*
* \li 'db' is a valid database.
* \li 'size' is maximum memory footprint of the database
*
* Returns:
* \li #ISC_R_SUCCESS The registration succeeded
* \li #ISC_R_NOMEMORY Out of memory
*/
void
dns_db_settask(dns_db_t *db, isc_task_t *task);
/*%<
......
......@@ -684,6 +684,17 @@ dns_rbt_hashsize(dns_rbt_t *rbt);
* \li rbt is a valid rbt manager.
*/
isc_result_t
dns_rbt_adjusthashsize(dns_rbt_t *rbt, size_t size);
/*%<
* Adjust the number of buckets in the 'rbt' hash table, according to the
* expected maximum size of the rbt database.
*
* Requires:
* \li rbt is a valid rbt manager.
* \li size is expected maximum memory footprint of rbt.
*/
void
dns_rbt_destroy(dns_rbt_t **rbtp);
isc_result_t
......
......@@ -464,7 +464,8 @@ dns_name_hash(const dns_name_t *name, bool case_sensitive) {
length = 16;
}
return (isc_hash_function(name->ndata, length, case_sensitive));
/* High bits are more random. */
return (isc_hash_function(name->ndata, length, case_sensitive) >> 32);
}
unsigned int
......@@ -478,7 +479,9 @@ dns_name_fullhash(const dns_name_t *name, bool case_sensitive) {
return (0);
}
return (isc_hash_function(name->ndata, name->length, case_sensitive));
/* High bits are more random. */
return (isc_hash_function(name->ndata, name->length, case_sensitive) >>
32);
}
dns_namereln_t
......
......@@ -59,13 +59,27 @@
#define CHAIN_MAGIC ISC_MAGIC('0', '-', '0', '-')
#define VALID_CHAIN(chain) ISC_MAGIC_VALID(chain, CHAIN_MAGIC)
#define RBT_HASH_SIZE 64
#define RBT_HASH_MIN_BITS 16
#define RBT_HASH_MAX_BITS 32
#define RBT_HASH_OVERCOMMIT 3
#define RBT_HASH_BUCKETSIZE 4096 /* FIXME: What would be a good value here? */
#ifdef RBT_MEM_TEST
#undef RBT_HASH_SIZE
#define RBT_HASH_SIZE 2 /*%< To give the reallocation code a workout. */
#endif /* ifdef RBT_MEM_TEST */
#define GOLDEN_RATIO_32 0x61C88647
#define HASHSIZE(bits) (UINT64_C(1) << (bits))
static inline uint32_t
hash_32(uint32_t val, unsigned int bits) {
REQUIRE(bits <= RBT_HASH_MAX_BITS);
/* High bits are more random. */
return (val * GOLDEN_RATIO_32 >> (32 - bits));
}
struct dns_rbt {
unsigned int magic;
isc_mem_t *mctx;
......@@ -73,7 +87,8 @@ struct dns_rbt {
void (*data_deleter)(void *, void *);
void *deleter_arg;
unsigned int nodecount;
size_t hashsize;
uint16_t hashbits;
uint16_t maxhashbits;
dns_rbtnode_t **hashtable;
void *mmap_location;
};
......@@ -384,8 +399,12 @@ hash_node(dns_rbt_t *rbt, dns_rbtnode_t *node, const dns_name_t *name);
static inline void
unhash_node(dns_rbt_t *rbt, dns_rbtnode_t *node);
static uint32_t
rehash_bits(dns_rbt_t *rbt, size_t newcount);
static void
rehash(dns_rbt_t *rbt, unsigned int newcount);
rehash(dns_rbt_t *rbt, uint32_t newbits);
static void
maybe_rehash(dns_rbt_t *rbt, size_t size);
static inline void
rotate_left(dns_rbtnode_t *node, dns_rbtnode_t **rootp);
......@@ -920,7 +939,7 @@ dns_rbt_deserialize_tree(void *base_address, size_t filesize,
result = ISC_R_INVALIDFILE;
goto cleanup;
}
rehash(rbt, header->nodecount);
maybe_rehash(rbt, header->nodecount);
CHECK(treefix(rbt, base_address, filesize, rbt->root, dns_rootname,
datafixer, fixer_arg, &crc));
......@@ -980,7 +999,8 @@ dns_rbt_create(isc_mem_t *mctx, dns_rbtdeleter_t deleter, void *deleter_arg,
rbt->root = NULL;
rbt->nodecount = 0;
rbt->hashtable = NULL;
rbt->hashsize = 0;
rbt->hashbits = 0;
rbt->maxhashbits = RBT_HASH_MAX_BITS;
rbt->mmap_location = NULL;
result = inithash(rbt);
......@@ -1024,8 +1044,8 @@ dns_rbt_destroy2(dns_rbt_t **rbtp, unsigned int quantum) {
rbt->mmap_location = NULL;
if (rbt->hashtable != NULL) {
isc_mem_put(rbt->mctx, rbt->hashtable,
rbt->hashsize * sizeof(dns_rbtnode_t *));
size_t size = HASHSIZE(rbt->hashbits) * sizeof(dns_rbtnode_t *);
isc_mem_put(rbt->mctx, rbt->hashtable, size);
}
rbt->magic = 0;
......@@ -1045,7 +1065,20 @@ size_t
dns_rbt_hashsize(dns_rbt_t *rbt) {
REQUIRE(VALID_RBT(rbt));
return (rbt->hashsize);
return (1 << rbt->hashbits);
}
isc_result_t
dns_rbt_adjusthashsize(dns_rbt_t *rbt, size_t size) {
REQUIRE(VALID_RBT(rbt));
size_t newsize = size / RBT_HASH_BUCKETSIZE;
rbt->maxhashbits = rehash_bits(rbt, newsize);
maybe_rehash(rbt, newsize);
return (ISC_R_SUCCESS);
}
static inline isc_result_t
......@@ -1544,7 +1577,7 @@ dns_rbt_findnode(dns_rbt_t *rbt, const dns_name_t *name, dns_name_t *foundname,
dns_rbtnode_t *up_current;
unsigned int nlabels;
unsigned int tlabels = 1;
unsigned int hash;
uint32_t hash;
/*
* The case of current not being a subtree root,
......@@ -1587,7 +1620,8 @@ dns_rbt_findnode(dns_rbt_t *rbt, const dns_name_t *name, dns_name_t *foundname,
* Walk all the nodes in the hash bucket pointed
* by the computed hash value.
*/
for (hnode = rbt->hashtable[hash % rbt->hashsize];
for (hnode = rbt->hashtable[hash_32(hash,
rbt->hashbits)];
hnode != NULL; hnode = hnode->hashnext)
{
dns_name_t hnode_name;
......@@ -2267,13 +2301,13 @@ create_node(isc_mem_t *mctx, const dns_name_t *name, dns_rbtnode_t **nodep) {
*/
static inline void
hash_add_node(dns_rbt_t *rbt, dns_rbtnode_t *node, const dns_name_t *name) {
unsigned int hash;
uint32_t hash;
REQUIRE(name != NULL);
HASHVAL(node) = dns_name_fullhash(name, false);
hash = HASHVAL(node) % rbt->hashsize;
hash = hash_32(HASHVAL(node), rbt->hashbits);
HASHNEXT(node) = rbt->hashtable[hash];
rbt->hashtable[hash] = node;
......@@ -2284,45 +2318,56 @@ hash_add_node(dns_rbt_t *rbt, dns_rbtnode_t *node, const dns_name_t *name) {
*/
static isc_result_t
inithash(dns_rbt_t *rbt) {
unsigned int bytes;
rbt->hashsize = RBT_HASH_SIZE;
bytes = (unsigned int)rbt->hashsize * sizeof(dns_rbtnode_t *);
rbt->hashtable = isc_mem_get(rbt->mctx, bytes);
size_t size;
memset(rbt->hashtable, 0, bytes);
rbt->hashbits = RBT_HASH_MIN_BITS;
size = HASHSIZE(rbt->hashbits) * sizeof(dns_rbtnode_t *);
rbt->hashtable = isc_mem_get(rbt->mctx, size);
memset(rbt->hashtable, 0, size);
return (ISC_R_SUCCESS);
}
static uint32_t
rehash_bits(dns_rbt_t *rbt, size_t newcount) {
uint32_t oldbits = rbt->hashbits;
uint32_t newbits = oldbits;
while (newcount >= HASHSIZE(newbits) && newbits <= rbt->maxhashbits) {
newbits += 1;
}
return (newbits);
}
/*
* Rebuild the hashtable to reduce the load factor
*/
static void
rehash(dns_rbt_t *rbt, unsigned int newcount) {
unsigned int oldsize;
rehash(dns_rbt_t *rbt, uint32_t newbits) {
uint32_t oldbits;
size_t oldsize;
dns_rbtnode_t **oldtable;
dns_rbtnode_t *node;
dns_rbtnode_t *nextnode;
unsigned int hash;
unsigned int i;
size_t newsize;
oldsize = (unsigned int)rbt->hashsize;
REQUIRE(rbt->hashbits <= rbt->maxhashbits);
REQUIRE(newbits <= rbt->maxhashbits);
oldbits = rbt->hashbits;
oldsize = HASHSIZE(oldbits);
oldtable = rbt->hashtable;
do {
INSIST((rbt->hashsize * 2 + 1) > rbt->hashsize);
rbt->hashsize = rbt->hashsize * 2 + 1;
} while (newcount >= (rbt->hashsize * 3));
rbt->hashtable = isc_mem_get(rbt->mctx,
rbt->hashsize * sizeof(dns_rbtnode_t *));
for (i = 0; i < rbt->hashsize; i++) {
rbt->hashtable[i] = NULL;
}
rbt->hashbits = newbits;
newsize = HASHSIZE(rbt->hashbits);
rbt->hashtable = isc_mem_get(rbt->mctx,
newsize * sizeof(dns_rbtnode_t *));
memset(rbt->hashtable, 0, newsize * sizeof(dns_rbtnode_t *));
for (i = 0; i < oldsize; i++) {
for (size_t i = 0; i < oldsize; i++) {
dns_rbtnode_t *node;
dns_rbtnode_t *nextnode;
for (node = oldtable[i]; node != NULL; node = nextnode) {
hash = HASHVAL(node) % rbt->hashsize;
uint32_t hash = hash_32(HASHVAL(node), rbt->hashbits);
nextnode = HASHNEXT(node);
HASHNEXT(node) = rbt->hashtable[hash];
rbt->hashtable[hash] = node;
......@@ -2332,6 +2377,14 @@ rehash(dns_rbt_t *rbt, unsigned int newcount) {
isc_mem_put(rbt->mctx, oldtable, oldsize * sizeof(dns_rbtnode_t *));
}
static void
maybe_rehash(dns_rbt_t *rbt, size_t newcount) {
uint32_t newbits = rehash_bits(rbt, newcount);
if (rbt->hashbits < newbits && newbits <= RBT_HASH_MAX_BITS) {
rehash(rbt, newbits);
}
}
/*
* Add a node to the hash table. Rehash the hashtable if the node count
* rises above a critical level.
......@@ -2340,8 +2393,8 @@ static inline void
hash_node(dns_rbt_t *rbt, dns_rbtnode_t *node, const dns_name_t *name) {
REQUIRE(DNS_RBTNODE_VALID(node));
if (rbt->nodecount >= (rbt->hashsize * 3)) {
rehash(rbt, rbt->nodecount);
if (rbt->nodecount >= (HASHSIZE(rbt->hashbits) * RBT_HASH_OVERCOMMIT)) {
maybe_rehash(rbt, rbt->nodecount);
}
hash_add_node(rbt, node, name);
......@@ -2352,12 +2405,12 @@ hash_node(dns_rbt_t *rbt, dns_rbtnode_t *node, const dns_name_t *name) {
*/
static inline void
unhash_node(dns_rbt_t *rbt, dns_rbtnode_t *node) {
unsigned int bucket;
uint32_t bucket;
dns_rbtnode_t *bucket_node;
REQUIRE(DNS_RBTNODE_VALID(node));
bucket = HASHVAL(node) % rbt->hashsize;
bucket = hash_32(HASHVAL(node), rbt->hashbits);
bucket_node = rbt->hashtable[bucket];
if (bucket_node == node) {
......
......@@ -7987,6 +7987,22 @@ hashsize(dns_db_t *db) {
return (size);
}
static isc_result_t
adjusthashsize(dns_db_t *db, size_t size) {
isc_result_t result;
dns_rbtdb_t *rbtdb;
rbtdb = (dns_rbtdb_t *)db;
REQUIRE(VALID_RBTDB(rbtdb));
RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
result = dns_rbt_adjusthashsize(rbtdb->tree, size);
RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
return (result);
}
static void
settask(dns_db_t *db, isc_task_t *task) {
dns_rbtdb_t *rbtdb;
......@@ -8392,7 +8408,8 @@ static dns_dbmethods_t zone_methods = { attach,
getsize,
NULL, /* setservestalettl */
NULL, /* getservestalettl */
setgluecachestats };
setgluecachestats,
adjusthashsize };
static dns_dbmethods_t cache_methods = { attach,
detach,
......@@ -8441,7 +8458,8 @@ static dns_dbmethods_t cache_methods = { attach,
NULL, /* getsize */
setservestalettl,
getservestalettl,
NULL };
NULL,
adjusthashsize };
isc_result_t
dns_rbtdb_create(isc_mem_t *mctx, const dns_name_t *origin, dns_dbtype_t type,
......
......@@ -954,7 +954,7 @@ make_log_buf(dns_rrl_t *rrl, dns_rrl_entry_t *e, const char *str1,
(void)dns_rdatatype_totext(e->key.s.qtype, &lb);
}
}
snprintf(strbuf, sizeof(strbuf), " (%08x)",
snprintf(strbuf, sizeof(strbuf), " (%08" PRIx32 ")",
e->key.s.qname_hash);
add_log_str(&lb, strbuf, strlen(strbuf));
}
......
......@@ -1309,7 +1309,8 @@ static dns_dbmethods_t sdb_methods = {
NULL, /* getsize */
NULL, /* setservestalettl */
NULL, /* getservestalettl */
NULL /* setgluecachestats */
NULL, /* setgluecachestats */
NULL /* adjusthashsize */
};
static isc_result_t
......
......@@ -1281,7 +1281,8 @@ static dns_dbmethods_t sdlzdb_methods = {
NULL, /* getsize */
NULL, /* setservestalettl */
NULL, /* getservestalettl */
NULL /* setgluecachestats */
NULL, /* setgluecachestats */
NULL /* adjusthashsize */
};
/*
......
......@@ -1706,7 +1706,7 @@ dns_view_flushnode(dns_view_t *view, const dns_name_t *name, bool tree) {
isc_result_t
dns_view_adddelegationonly(dns_view_t *view, const dns_name_t *name) {
dns_name_t *item;
uint32_t hash;
unsigned int hash;
REQUIRE(DNS_VIEW_VALID(view));
......@@ -1736,7 +1736,7 @@ dns_view_adddelegationonly(dns_view_t *view, const dns_name_t *name) {
isc_result_t
dns_view_excludedelegationonly(dns_view_t *view, const dns_name_t *name) {
dns_name_t *item;
uint32_t hash;
unsigned int hash;
REQUIRE(DNS_VIEW_VALID(view));
......@@ -1766,7 +1766,7 @@ dns_view_excludedelegationonly(dns_view_t *view, const dns_name_t *name) {
bool
dns_view_isdelegationonly(dns_view_t *view, const dns_name_t *name) {
dns_name_t *item;
uint32_t hash;
unsigned int hash;
REQUIRE(DNS_VIEW_VALID(view));
......
......@@ -169,6 +169,7 @@ dns_compress_setmethods
dns_compress_setsensitive
dns_counter_fromtext
dns_db_addrdataset
dns_db_adjusthashsize
dns_db_allrdatasets
dns_db_attach
dns_db_attachnode
......@@ -749,6 +750,7 @@ dns_private_chains
dns_private_totext
dns_rbt_addname
dns_rbt_addnode
dns_rbt_adjusthashsize
dns_rbt_create
dns_rbt_deletename
dns_rbt_deletenode
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment