2525. [func] New logging category "query-errors" to provide detailed

			internal information about query failures, especially
			about server failures. [RT #19027]
parent 0cfbb928
2525. [func] New logging category "query-errors" to provide detailed
internal information about query failures, especially
about server failures. [RT #19027]
2524. [port] sunos: dnssec-signzone needs strtoul(). [RT #19129]
2523. [bug] Random type rdata freed by dns_nsec_typepresent().
......
......@@ -15,7 +15,7 @@
* PERFORMANCE OF THIS SOFTWARE.
*/
/* $Id: check-tool.c,v 1.35 2008/10/24 00:28:00 marka Exp $ */
/* $Id: check-tool.c,v 1.36 2009/01/07 01:46:40 jinmei Exp $ */
/*! \file */
......@@ -115,6 +115,7 @@ static isc_logcategory_t categories[] = {
{ "queries", 0 },
{ "unmatched", 0 },
{ "update-security", 0 },
{ "query-errors", 0 },
{ NULL, 0 }
};
......
......@@ -15,7 +15,7 @@
* PERFORMANCE OF THIS SOFTWARE.
*/
/* $Id: log.h,v 1.25 2007/06/19 23:46:59 tbox Exp $ */
/* $Id: log.h,v 1.26 2009/01/07 01:46:40 jinmei Exp $ */
#ifndef NAMED_LOG_H
#define NAMED_LOG_H 1
......@@ -36,6 +36,7 @@
#define NS_LOGCATEGORY_QUERIES (&ns_g_categories[4])
#define NS_LOGCATEGORY_UNMATCHED (&ns_g_categories[5])
#define NS_LOGCATEGORY_UPDATE_SECURITY (&ns_g_categories[6])
#define NS_LOGCATEGORY_QUERY_EERRORS (&ns_g_categories[7])
/*
* Backwards compatibility.
......
......@@ -15,7 +15,7 @@
* PERFORMANCE OF THIS SOFTWARE.
*/
/* $Id: log.c,v 1.48 2009/01/05 23:47:53 tbox Exp $ */
/* $Id: log.c,v 1.49 2009/01/07 01:46:40 jinmei Exp $ */
/*! \file */
......@@ -44,6 +44,7 @@ static isc_logcategory_t categories[] = {
{ "queries", 0 },
{ "unmatched", 0 },
{ "update-security", 0 },
{ "query-errors", 0 },
{ NULL, 0 }
};
......
......@@ -15,7 +15,7 @@
* PERFORMANCE OF THIS SOFTWARE.
*/
/* $Id: query.c,v 1.315 2008/12/16 02:19:46 jinmei Exp $ */
/* $Id: query.c,v 1.316 2009/01/07 01:46:40 jinmei Exp $ */
/*! \file */
......@@ -120,7 +120,7 @@ typedef struct client_additionalctx {
dns_rdataset_t *rdataset;
} client_additionalctx_t;
static void
static isc_result_t
query_find(ns_client_t *client, dns_fetchevent_t *event, dns_rdatatype_t qtype);
static isc_boolean_t
......@@ -134,6 +134,9 @@ query_findclosestnsec3(dns_name_t *qname, dns_db_t *db,
dns_name_t *fname, isc_boolean_t exact,
dns_name_t *found);
static inline void
log_queryerror(ns_client_t *client, isc_result_t result, int line, int level);
/*%
* Increment query statistics counters.
*/
......@@ -178,9 +181,12 @@ query_send(ns_client_t *client) {
}
static void
query_error(ns_client_t *client, isc_result_t result) {
query_error(ns_client_t *client, isc_result_t result, int line) {
int loglevel = ISC_LOG_DEBUG(3);
switch (result) {
case DNS_R_SERVFAIL:
loglevel = ISC_LOG_DEBUG(1);
inc_stats(client, dns_nsstatscounter_servfail);
break;
case DNS_R_FORMERR:
......@@ -190,6 +196,9 @@ query_error(ns_client_t *client, isc_result_t result) {
inc_stats(client, dns_nsstatscounter_failure);
break;
}
log_queryerror(client, result, line, loglevel);
ns_client_error(client, result);
}
......@@ -3086,8 +3095,12 @@ query_addnxrrsetnsec(ns_client_t *client, dns_db_t *db,
static void
query_resume(isc_task_t *task, isc_event_t *event) {
dns_fetchevent_t *devent = (dns_fetchevent_t *)event;
dns_fetch_t *fetch;
ns_client_t *client;
isc_boolean_t fetch_cancelled, client_shuttingdown;
isc_result_t result;
isc_logcategory_t *logcategory = NS_LOGCATEGORY_QUERY_EERRORS;
int errorloglevel;
/*
* Resume a query after recursion.
......@@ -3124,7 +3137,8 @@ query_resume(isc_task_t *task, isc_event_t *event) {
INSIST(client->query.fetch == NULL);
client->query.attributes &= ~NS_QUERYATTR_RECURSING;
dns_resolver_destroyfetch(&devent->fetch);
fetch = devent->fetch;
devent->fetch = NULL;
/*
* If this client is shutting down, or this transaction
......@@ -3141,7 +3155,7 @@ query_resume(isc_task_t *task, isc_event_t *event) {
query_putrdataset(client, &devent->sigrdataset);
isc_event_free(&event);
if (fetch_cancelled)
query_error(client, DNS_R_SERVFAIL);
query_error(client, DNS_R_SERVFAIL, __LINE__);
else
query_next(client, ISC_R_CANCELED);
/*
......@@ -3149,8 +3163,22 @@ query_resume(isc_task_t *task, isc_event_t *event) {
*/
ns_client_detach(&client);
} else {
query_find(client, devent, 0);
result = query_find(client, devent, 0);
if (result != ISC_R_SUCCESS) {
if (result == DNS_R_SERVFAIL)
errorloglevel = ISC_LOG_DEBUG(2);
else
errorloglevel = ISC_LOG_DEBUG(4);
if (isc_log_wouldlog(ns_g_lctx, errorloglevel)) {
dns_resolver_logfetch(fetch, ns_g_lctx,
logcategory,
NS_LOGMODULE_QUERY,
errorloglevel, ISC_FALSE);
}
}
}
dns_resolver_destroyfetch(&fetch);
}
static isc_result_t
......@@ -3285,6 +3313,7 @@ query_recurse(ns_client_t *client, dns_rdatatype_t qtype, dns_name_t *qdomain,
do { \
eresult = r; \
want_restart = ISC_FALSE; \
line = __LINE__; \
} while (0)
/*
......@@ -3661,7 +3690,7 @@ query_findclosestnsec3(dns_name_t *qname, dns_db_t *db,
* If 'event' is non-NULL, we are returning from recursion and 'qtype'
* is ignored. Otherwise, 'qtype' is the query type.
*/
static void
static isc_result_t
query_find(ns_client_t *client, dns_fetchevent_t *event, dns_rdatatype_t qtype)
{
dns_db_t *db, *zdb;
......@@ -3690,6 +3719,7 @@ query_find(ns_client_t *client, dns_fetchevent_t *event, dns_rdatatype_t qtype)
isc_boolean_t empty_wild;
dns_rdataset_t *noqname;
isc_boolean_t resuming;
int line = -1;
CTRACE("query_find");
......@@ -4847,7 +4877,8 @@ query_find(ns_client_t *client, dns_fetchevent_t *event, dns_rdatatype_t qtype)
* or if the client requested recursion and thus wanted
* the complete answer, send an error response.
*/
query_error(client, eresult);
INSIST(line >= 0);
query_error(client, eresult, line);
}
ns_client_detach(&client);
} else if (!RECURSING(client)) {
......@@ -4873,10 +4904,22 @@ query_find(ns_client_t *client, dns_fetchevent_t *event, dns_rdatatype_t qtype)
client->view->auth_nxdomain == ISC_TRUE)
client->message->flags |= DNS_MESSAGEFLAG_AA;
/*
* If the response is somehow unexpected for the client and this
* is a result of recursion, return an error to the caller
* to indicate it may need to be logged.
*/
if (resuming &&
(ISC_LIST_EMPTY(client->message->sections[DNS_SECTION_ANSWER]) ||
client->message->rcode != dns_rcode_noerror))
eresult = ISC_R_FAILURE;
query_send(client);
ns_client_detach(&client);
}
CTRACE("query_find: done");
return (eresult);
}
static inline void
......@@ -4905,6 +4948,48 @@ log_query(ns_client_t *client, unsigned int flags, unsigned int extflags) {
((flags & DNS_MESSAGEFLAG_CD) != 0) ? "C" : "");
}
static inline void
log_queryerror(ns_client_t *client, isc_result_t result, int line, int level) {
char namebuf[DNS_NAME_FORMATSIZE];
char typename[DNS_RDATATYPE_FORMATSIZE];
char classname[DNS_RDATACLASS_FORMATSIZE];
const char *namep, *typep, *classp, *sep1, *sep2;
dns_rdataset_t *rdataset;
if (!isc_log_wouldlog(ns_g_lctx, level))
return;
namep = typep = classp = sep1 = sep2 = "";
/*
* Query errors can happen for various reasons. In some cases we cannot
* even assume the query contains a valid question section, so we should
* expect exceptional cases.
*/
if (client->query.origqname != NULL) {
dns_name_format(client->query.origqname, namebuf,
sizeof(namebuf));
namep = namebuf;
sep1 = " for ";
rdataset = ISC_LIST_HEAD(client->query.origqname->list);
if (rdataset != NULL) {
dns_rdataclass_format(rdataset->rdclass, classname,
sizeof(classname));
classp = classname;
dns_rdatatype_format(rdataset->type, typename,
sizeof(typename));
typep = typename;
sep2 = "/";
}
}
ns_client_log(client, NS_LOGCATEGORY_QUERY_EERRORS, NS_LOGMODULE_QUERY,
level, "query failed (%s)%s%s%s%s%s%s at %s:%d",
isc_result_totext(result), sep1, namep, sep2,
classp, sep2, typep, __FILE__, line);
}
void
ns_query_start(ns_client_t *client) {
isc_result_t result;
......@@ -4973,7 +5058,7 @@ ns_query_start(ns_client_t *client) {
*/
result = dns_message_firstname(message, DNS_SECTION_QUESTION);
if (result != ISC_R_SUCCESS) {
query_error(client, result);
query_error(client, result, __LINE__);
return;
}
dns_message_currentname(message, DNS_SECTION_QUESTION,
......@@ -4986,9 +5071,9 @@ ns_query_start(ns_client_t *client) {
* There's more than one QNAME in the question
* section.
*/
query_error(client, DNS_R_FORMERR);
query_error(client, DNS_R_FORMERR, __LINE__);
} else
query_error(client, result);
query_error(client, result, __LINE__);
return;
}
......@@ -4999,7 +5084,7 @@ ns_query_start(ns_client_t *client) {
* Check for multiple question queries, since edns1 is dead.
*/
if (message->counts[DNS_SECTION_QUESTION] > 1) {
query_error(client, DNS_R_FORMERR);
query_error(client, DNS_R_FORMERR, __LINE__);
return;
}
......@@ -5020,7 +5105,7 @@ ns_query_start(ns_client_t *client) {
return;
case dns_rdatatype_maila:
case dns_rdatatype_mailb:
query_error(client, DNS_R_NOTIMP);
query_error(client, DNS_R_NOTIMP, __LINE__);
return;
case dns_rdatatype_tkey:
result = dns_tkey_processquery(client->message,
......@@ -5029,10 +5114,10 @@ ns_query_start(ns_client_t *client) {
if (result == ISC_R_SUCCESS)
query_send(client);
else
query_error(client, result);
query_error(client, result, __LINE__);
return;
default: /* TSIG, etc. */
query_error(client, DNS_R_FORMERR);
query_error(client, DNS_R_FORMERR, __LINE__);
return;
}
}
......@@ -5100,5 +5185,5 @@ ns_query_start(ns_client_t *client) {
qclient = NULL;
ns_client_attach(client, &qclient);
query_find(qclient, NULL, qtype);
(void)query_find(qclient, NULL, qtype);
}
......@@ -18,7 +18,7 @@
- PERFORMANCE OF THIS SOFTWARE.
-->
<!-- File: $Id: Bv9ARM-book.xml,v 1.382 2008/12/01 17:11:54 jreed Exp $ -->
<!-- File: $Id: Bv9ARM-book.xml,v 1.383 2009/01/07 01:46:40 jinmei Exp $ -->
<book xmlns:xi="http://www.w3.org/2001/XInclude">
<title>BIND 9 Administrator Reference Manual</title>
......@@ -4287,6 +4287,17 @@ category notify { null; };
</para>
</entry>
</row>
<row rowsep="0">
<entry colname="1">
<para><command>query-errors</command></para>
</entry>
<entry colname="2">
<para>
Information about queries that resulted in some
failure.
</para>
</entry>
</row>
<row rowsep="0">
<entry colname="1">
<para><command>dispatch</command></para>
......@@ -4373,6 +4384,226 @@ category notify { null; };
</tgroup>
</informaltable>
</sect3>
<sect3>
<title>The <command>query-errors</command> Category</title>
<para>
The <command>query-errors</command> category is
specifically intended for debugging purposes: To identify
why and how specific queries result in responses which
indicate an error.
Messages of this category are therefore only logged
with <command>debug</command> levels.
</para>
<para>
At the debug levels of 1 or higher, each response with the
rcode of SERVFAIL is logged as follows:
</para>
<para>
<computeroutput>client 127.0.0.1#61502: query failed (SERVFAIL) for www.example.com/IN/AAAA at query.c:3880</computeroutput>
</para>
<para>
This means an error resulting in SERVFAIL was
detected at line 3880 of source file
<filename>query.c</filename>.
Log messages of this level will particularly
help identify the cause of SERVFAIL for an
authoritative server.
</para>
<para>
At the debug levels of 2 or higher, detailed context
information of recursive resolutions that resulted in
SERVFAIL is logged.
The log message will look like as follows:
</para>
<para>
<computeroutput>fetch completed at resolver.c:2970 for www.example.com/A in 30.000183: timed out/success [domain:example.com,referral:2,restart:7,qrysent:8,timeout:5,lame:0,neterr:0,badresp:1,adberr:0,findfail:0,valfail:0]</computeroutput>
</para>
<para>
The first part before the colon shows that a recursive
resolution for AAAA records of www.example.com completed
in 30.000183 seconds and the final result that led to the
SERVFAIL was determined at line 2970 of source file
<filename>resolver.c</filename>.
</para>
<para>
The following part shows the detected final result and the
latest result of DNSSEC validation.
The latter is always success when no validation attempt
is made.
In this example, this query resulted in SERVFAIL probably
because all name servers are down or unreachable, leading
to a timeout in 30 seconds.
DNSSEC validation was probably not attempted.
</para>
<para>
The last part enclosed in square brackets shows statistics
information collected for this particular resolution
attempt.
The <varname>domain</varname> field shows the deepest zone
that the resolver reached;
it is the zone where the error was finally detected.
The meaning of the other fields is summarized in the
following table.
</para>
<informaltable colsep="0" rowsep="0">
<tgroup cols="2" colsep="0" rowsep="0" tgroupstyle="4Level-table">
<colspec colname="1" colnum="1" colsep="0" />
<colspec colname="2" colnum="2" colsep="0" />
<tbody>
<row rowsep="0">
<entry colname="1">
<para><varname>referral</varname></para>
</entry>
<entry colname="2">
<para>
The number of referrals the resolver received
throughout the resolution process.
In the above example this is 2, which are most
likely com and example.com.
</para>
</entry>
</row>
<row rowsep="0">
<entry colname="1">
<para><varname>restart</varname></para>
</entry>
<entry colname="2">
<para>
The number of cycles that the resolver tried
remote servers at the <varname>domain</varname>
zone.
In each cycle the resolver sends one query
(possibly resending it, depending on the response)
to each known name server of
the <varname>domain</varname> zone.
</para>
</entry>
</row>
<row rowsep="0">
<entry colname="1">
<para><varname>qrysent</varname></para>
</entry>
<entry colname="2">
<para>
The number of queries the resolver sent at the
<varname>domain</varname> zone.
</para>
</entry>
</row>
<row rowsep="0">
<entry colname="1">
<para><varname>timeout</varname></para>
</entry>
<entry colname="2">
<para>
The number of timeouts since the resolver
received the last response.
</para>
</entry>
</row>
<row rowsep="0">
<entry colname="1">
<para><varname>lame</varname></para>
</entry>
<entry colname="2">
<para>
The number of lame servers the resolver detected
at the <varname>domain</varname> zone.
A server is detected to be lame either by an
invalid response or as a result of lookup in
BIND9's address database (ADB), where lame
servers are cached.
</para>
</entry>
</row>
<row rowsep="0">
<entry colname="1">
<para><varname>neterr</varname></para>
</entry>
<entry colname="2">
<para>
The number of erroneous results that the
resolver encountered in sending queries
at the <varname>domain</varname> zone.
One common case is the remote server is
unreachable and the resolver receives an ICMP
unreachable error message.
</para>
</entry>
</row>
<row rowsep="0">
<entry colname="1">
<para><varname>badresp</varname></para>
</entry>
<entry colname="2">
<para>
The number of unexpected responses (other than
<varname>lame</varname>) to queries sent by the
resolver at the <varname>domain</varname> zone.
</para>
</entry>
</row>
<row rowsep="0">
<entry colname="1">
<para><varname>adberr</varname></para>
</entry>
<entry colname="2">
<para>
Failures in finding remote server addresses
of the <varname>domain</varname> zone in the ADB.
One common case of this is that the remote
server's name does not have any address records.
</para>
</entry>
</row>
<row rowsep="0">
<entry colname="1">
<para><varname>findfail</varname></para>
</entry>
<entry colname="2">
<para>
Failures of resolving remote server addresses.
This is a total number of failures throughout
the resolution process.
</para>
</entry>
</row>
<row rowsep="0">
<entry colname="1">
<para><varname>valfail</varname></para>
</entry>
<entry colname="2">
<para>
Failures of DNSSEC validation.
Validation failures are counted throughout
the resolution process (not limited to
the <varname>domain</varname> zone), but should
only happen in <varname>domain</varname>.
</para>
</entry>
</row>
</tbody>
</tgroup>
</informaltable>
<para>
At the debug levels of 3 or higher, the same messages
as those at the debug 1 level are logged for other errors
than SERVFAIL.
Note that negative responses such as NXDOMAIN are not
regarded as errors here.
</para>
<para>
At the debug levels of 4 or higher, the same messages
as those at the debug 2 level are logged for other errors
than SERVFAIL.
Unlike the above case of level 3, messages are logged for
negative responses.
This is because any unexpected results can be difficult to
debug in the recursion case.
</para>
</sect3>
</sect2>
<sect2>
......
......@@ -15,7 +15,7 @@
* PERFORMANCE OF THIS SOFTWARE.
*/
/* $Id: resolver.h,v 1.60 2008/09/08 05:41:22 marka Exp $ */
/* $Id: resolver.h,v 1.61 2009/01/07 01:46:40 jinmei Exp $ */
#ifndef DNS_RESOLVER_H
#define DNS_RESOLVER_H 1
......@@ -347,6 +347,23 @@ dns_resolver_destroyfetch(dns_fetch_t **fetchp);
*\li *fetchp == NULL.
*/
void
dns_resolver_logfetch(dns_fetch_t *fetch, isc_log_t *lctx,
isc_logcategory_t *category, isc_logmodule_t *module,
int level, isc_boolean_t duplicateok);
/*%<
* Dump a log message on internal state at the completion of given 'fetch'.
* 'lctx', 'category', 'module', and 'level' are used to write the log message.
* By default, only one log message is written even if the corresponding fetch
* context serves multiple clients; if 'duplicateok' is true the suppression
* is disabled and the message can be written every time this function is
* called.
*
* Requires:
*
*\li 'fetch' is a valid fetch, and has completed.
*/
dns_dispatchmgr_t *
dns_resolver_dispatchmgr(dns_resolver_t *resolver);
......
......@@ -15,7 +15,7 @@
* PERFORMANCE OF THIS SOFTWARE.
*/
/* $Id: resolver.c,v 1.386 2009/01/05 23:47:53 tbox Exp $ */
/* $Id: resolver.c,v 1.387 2009/01/07 01:46:40 jinmei Exp $ */
/*! \file */
......@@ -157,6 +157,12 @@ typedef enum {
fetchstate_done /*%< FETCHDONE events posted. */
} fetchstate;
typedef enum {
badns_unreachable = 0,
badns_response,
badns_validation
} badnstype_t;
struct fetchctx {
/*% Not locked. */
unsigned int magic;
......@@ -245,6 +251,24 @@ struct fetchctx {
*/
isc_uint32_t rand_buf;
isc_uint32_t rand_bits;
/*%
* Fetch-local statistics for detailed logging.
*/
isc_result_t result; /*%< fetch result */
isc_result_t vresult; /*%< validation result */
int exitline;
isc_time_t start;
isc_uint64_t duration;
isc_boolean_t logged;
unsigned int querysent;
unsigned int referrals;
unsigned int lamecount;
unsigned int neterr;
unsigned int badresp;
unsigned int adberr;
unsigned int findfail;
unsigned int valfail;
};
#define FCTX_MAGIC ISC_MAGIC('F', '!', '!', '!')
......@@ -395,7 +419,7 @@ static isc_result_t ncache_adderesult(dns_message_t *message,
static void validated(isc_task_t *task, isc_event_t *event);
static void maybe_destroy(fetchctx_t *fctx);
static void add_bad(fetchctx_t *fctx, dns_adbaddrinfo_t *addrinfo,
isc_result_t reason);
isc_result_t reason, badnstype_t badtype);
/*%
* Increment resolver-related statistics counters.
......@@ -825,12 +849,13 @@ fctx_stopeverything(fetchctx_t *fctx, isc_boolean_t no_response) {
}
static inline void
fctx_sendevents(fetchctx_t *fctx, isc_result_t result) {
fctx_sendevents(fetchctx_t *fctx, isc_result_t result, int line) {
dns_fetchevent_t *event, *next_event;
isc_task_t *task;