serve-stale and RPZ put an unexpected record in the SERVFAIL cache
Summary
When I enable serve-stale and block access to the external upstream servers (so recursion fails), I see unexpected records in the SERVFAIL cache: instead of an entry for the name that was actually requested, there is a SERVFAIL entry for an intermediate name that should have been rewritten by an RPZ trigger.
BIND version used
BIND 9.18.1-1ubuntu1.2-Ubuntu (Stable Release) <id:>
running on Linux x86_64 5.15.0-1022-aws #26-Ubuntu SMP Thu Oct 13 12:59:25 UTC 2022
built by make with '--build=x86_64-linux-gnu' '--prefix=/usr' '--includedir=${prefix}/include' '--mandir=${prefix}/share/man' '--infodir=${prefix}/share/info' '--sysconfdir=/etc' '--localstatedir=/var' '--disable-option-checking' '--disable-silent-rules' '--libdir=${prefix}/lib/x86_64-linux-gnu' '--runstatedir=/run' '--disable-maintainer-mode' '--disable-dependency-tracking' '--libdir=/usr/lib/x86_64-linux-gnu' '--sysconfdir=/etc/bind' '--with-python=python3' '--localstatedir=/' '--enable-threads' '--enable-largefile' '--with-libtool' '--enable-shared' '--disable-static' '--with-gost=no' '--with-openssl=/usr' '--with-gssapi=yes' '--with-libidn2' '--with-json-c' '--with-lmdb=/usr' '--with-gnu-ld' '--with-maxminddb' '--with-atf=no' '--enable-ipv6' '--enable-rrl' '--enable-filter-aaaa' '--disable-native-pkcs11' 'build_alias=x86_64-linux-gnu' 'CFLAGS=-g -O2 -ffile-prefix-map=/build/bind9-2lYtkE/bind9-9.18.1=. -flto=auto -ffat-lto-objects -flto=auto -ffat-lto-objects -fstack-protector-strong -Wformat -Werror=format-security -fno-strict-aliasing -fno-delete-null-pointer-checks -DNO_VERSION_DATE -DDIG_SIGCHASE' 'LDFLAGS=-Wl,-Bsymbolic-functions -flto=auto -ffat-lto-objects -flto=auto -Wl,-z,relro -Wl,-z,now' 'CPPFLAGS=-Wdate-time -D_FORTIFY_SOURCE=2'
compiled by GCC 11.2.0
compiled with OpenSSL version: OpenSSL 3.0.2 15 Mar 2022
linked to OpenSSL version: OpenSSL 3.0.2 15 Mar 2022
compiled with libuv version: 1.43.0
linked to libuv version: 1.43.0
compiled with libnghttp2 version: 1.43.0
linked to libnghttp2 version: 1.43.0
compiled with libxml2 version: 2.9.13
linked to libxml2 version: 20913
compiled with json-c version: 0.15
linked to json-c version: 0.15
compiled with zlib version: 1.2.11
linked to zlib version: 1.2.11
linked to maxminddb version: 1.5.2
threads support is enabled
default paths:
named configuration: /etc/bind/named.conf
rndc configuration: /etc/bind/rndc.conf
DNSSEC root key: /etc/bind/bind.keys
nsupdate session key: //run/named/session.key
named PID file: //run/named/named.pid
named lock file: //run/named/named.lock
geoip-directory: /usr/share/GeoIP
Steps to reproduce
Configure a minimal BIND 9 recursive resolver with a response policy zone (see the configuration files below), and then attempt to resolve 321.test.myctl.com:
dig 321.test.myctl.com A @127.0.0.1
Block access to the upstream servers (for example, with iptables) and attempt to resolve the name again; you will receive SERVFAIL:
; <<>> DiG 9.18.1-1ubuntu1.2-Ubuntu <<>> 321.test.myctl.com A @127.0.0.1
;; global options: +cmd
;; Got answer:
;; ->>HEADER<<- opcode: QUERY, status: SERVFAIL, id: 20981
;; flags: qr rd ra; QUERY: 1, ANSWER: 0, AUTHORITY: 0, ADDITIONAL: 1
;; OPT PSEUDOSECTION:
; EDNS: version: 0, flags:; udp: 1232
; COOKIE: 8f82c8b75e5cd86b0100000063725d504d726ced8e1e2034 (good)
;; QUESTION SECTION:
;321.test.myctl.com. IN A
;; Query time: 4999 msec
;; SERVER: 127.0.0.1#53(127.0.0.1) (UDP)
;; WHEN: Mon Nov 14 15:22:56 UTC 2022
;; MSG SIZE rcvd: 75
Dump the named cache database with rndc dumpdb and look at the SERVFAIL cache section:
; SERVFAIL cache
;
; test.myctl.com/A [ttl 968]
In named.log we can see:
resolver: debug 1: fetch: 321.test.myctl.com/A
resolver: debug 1: fetch: 321.test.myctl.com/A
serve-stale: info: 321.test.myctl.com resolver failure, stale answer used
serve-stale: info: test.myctl.com resolver failure, stale answer unavailable
query-errors: info: client @0x7feb441e9f48 127.0.0.1#44401 (321.test.myctl.com): query failed (SERVFAIL) for 321.test.myctl.com/IN/A at query.c:5925
serve-stale: info: 321.test.myctl.com resolver failure, stale answer used
serve-stale: info: test.myctl.com resolver failure, stale answer unavailable
query-errors: info: client @0x7feb44209ef8 127.0.0.1#48831 (321.test.myctl.com): query failed (SERVFAIL) for 321.test.myctl.com/IN/A at query.c:5925
general: info: received control channel command 'dumpdb'
general: info: dumpdb started
Send the same query a second time, and we can see that it is now resolved successfully:
; <<>> DiG 9.18.1-1ubuntu1.2-Ubuntu <<>> 321.test.myctl.com A @127.0.0.1
;; global options: +cmd
;; Got answer:
;; ->>HEADER<<- opcode: QUERY, status: NOERROR, id: 31429
;; flags: qr rd ra; QUERY: 1, ANSWER: 3, AUTHORITY: 0, ADDITIONAL: 2
;; OPT PSEUDOSECTION:
; EDNS: version: 0, flags:; udp: 1232
; COOKIE: bdf9d11efa69889d0100000063725d5303f554388f84744c (good)
;; QUESTION SECTION:
;321.test.myctl.com. IN A
;; ANSWER SECTION:
321.test.myctl.com. 30 IN CNAME test.myctl.com.
test.myctl.com. 293 IN CNAME test-cname-a.myctl.com.
test-cname-a.myctl.com. 30 IN A 127.0.0.1
;; ADDITIONAL SECTION:
test.rpz.local. 1 IN SOA localhost. root.localhost. 1 604800 86400 2419200 86400
;; Query time: 0 msec
;; SERVER: 127.0.0.1#53(127.0.0.1) (UDP)
;; WHEN: Mon Nov 14 15:22:59 UTC 2022
;; MSG SIZE rcvd: 205
In named.log file we can see that:
serve-stale: info: 321.test.myctl.com query within stale refresh time, stale answer used
rpz: info: client @0x7feb441e9f48 127.0.0.1#43954 (321.test.myctl.com): rpz QNAME Local-Data rewrite test.myctl.com/A/IN via test.myctl.com.test.rpz.local
resolver: debug 1: fetch: test-cname-a.myctl.com/A
serve-stale: info: 321.test.myctl.com query within stale refresh time, stale answer used
rpz: info: client @0x7feb44209ef8 127.0.0.1#59298 (321.test.myctl.com): rpz QNAME Local-Data rewrite test.myctl.com/A/IN via test.myctl.com.test.rpz.local
resolver: debug 1: fetch: test-cname-a.myctl.com/A
serve-stale: info: test-cname-a.myctl.com resolver failure, stale answer used
serve-stale: info: test-cname-a.myctl.com resolver failure, stale answer used
If we disable serve-stale in the configuration, then only the name that was actually queried appears in the SERVFAIL cache:
; SERVFAIL cache
;
; 321.test.myctl.com/A [ttl 976]
What is the current bug behavior?
The presence of test.myctl.com in the SERVFAIL cache is unexpected. If the query load for subdomains that involve RPZ rewrites and CNAMEs is fairly high, this record will be present in the SERVFAIL cache almost all the time, and any queries for test.myctl.com will fail.
What is the expected correct behavior?
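I'd expect SERVFAIL cache entries only for the exact names that were queried, not for an intermediate name in the CNAME chain; better still, I'd expect the query to be answered with stale data (after the RPZ rewrite is applied).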
Relevant configuration files
logging {
channel "standard_var_log" {
file "/var/log/named/named.log" versions 3 size 104857600;
severity debug 1;
print-time yes;
print-severity yes;
print-category yes;
};
channel "query_var_log" {
file "/var/log/named/querylog" versions 200 size 262144000;
print-time yes;
};
category "default" {
"standard_var_log";
};
category "lame-servers" {
"null";
};
category "queries" {
"query_var_log";
};
};
options {
directory "/var/cache/bind";
listen-on-v6 {
"any";
};
dnssec-validation no;
response-policy {
zone "test.rpz.local" max-policy-ttl 86400;
} break-dnssec yes qname-wait-recurse no;
stale-answer-enable yes;
stale-answer-client-timeout off;
stale-cache-enable yes;
};
zone "." {
type hint;
file "/usr/share/dns/root.hints";
};
zone "localhost" {
type master;
file "/etc/bind/db.local";
};
zone "127.in-addr.arpa" {
type master;
file "/etc/bind/db.127";
};
zone "0.in-addr.arpa" {
type master;
file "/etc/bind/db.0";
};
zone "255.in-addr.arpa" {
type master;
file "/etc/bind/db.255";
};
zone "test.rpz.local" in {
type master;
file "/etc/bind/db.rpz.local";
allow-query {
"localhost";
};
allow-transfer {
"localhost";
};
forwarders {
};
};
zone "myctl.com" in {
type master;
file "/etc/bind/myctl.com.local";
allow-query {
"localhost";
};
allow-transfer {
"localhost";
};
forwarders {
};
};
# cat myctl.com.local
; BIND reverse data file for empty rfc1918 zone
;
; DO NOT EDIT THIS FILE - it is used for multiple zones.
; Instead, copy it, edit named.conf, and use that copy.
;
$ORIGIN .
$TTL 86400
myctl.com IN SOA localhost. root.localhost. (
1 ; Serial
604800 ; Refresh
86400 ; Retry
2419200 ; Expire
86400 ) ; Negative Cache TTL
IN NS ns-canada.topdns.com.
IN NS ns-usa.topdns.com.
IN NS ns-uk.topdns.com.
$ORIGIN myctl.com
test-cname-a NS ns-canada.topdns.com.
NS ns-usa.topdns.com.
NS ns-uk.topdns.com.
test NS ns-canada.topdns.com.
NS ns-usa.topdns.com.
NS ns-uk.topdns.com.
$ORIGIN test.myctl.com
$TTL 300
* CNAME test.myctl.com.
# cat db.rpz.local
; BIND reverse data file for empty rfc1918 zone
;
; DO NOT EDIT THIS FILE - it is used for multiple zones.
; Instead, copy it, edit named.conf, and use that copy.
;
$TTL 900
@ IN SOA localhost. root.localhost. (
1 ; Serial
604800 ; Refresh
86400 ; Retry
2419200 ; Expire
86400 ) ; Negative Cache TTL
;
IN NS localhost.
$TTL 293
test.myctl.com CNAME test-cname-a.myctl.com.