Commit f0217670 authored by Shawn Routhier's avatar Shawn Routhier
Browse files

[v4_1_esv] Patch the failover code to avoid deadlocks

Patch for 36810 & 20352
This coves several related problems
1) When the primary is in conflict done it allows the secondary to
transition around resolution interrupted and potentical conflict previously
the primary would die on an illegal state.

2) It allows the servers to restart a bind update request.  Previously if
one of the servers sent an udpate request and there died (or had the communications
interrupted) in some states the first server wouldn't retransmit a new
update request and the other server wouldn't send any bind updates. This
was noticed in potential conflict.

3) Updated the state transitions to move the leases on the ack queue
back to the update queue in case of conflict-done as we might need to
retransmit them all.

4) Updated a transition from startup to potentical conflict instead
of resolution interrupted when the servers reconnect during the startup
phase in order to avoid a diffferent dead lock.
parent a54bc0da
......@@ -128,6 +128,15 @@ by Eric Young (eay@cryptsoft.com).
and no other value is available.
[ISC-Bugs #21323]
- In the failover code, handle the case of communications being interrupted
when the servers are dealing with POTENTIAL-CONFLICT. This patch allows
the primary to accept the secondary moving from POTENTIAL-CONFLICT to
RESOLUTION-INTERRUPTED as well as handling the bind update process better.
In addition the code to resend update or update all requests has been
modified to send requests more often.
[ISC-Bugs #36810]
[ISC-Bugs #20352]
Changes since 4.1-ESV-R10rc1
- None
......
......@@ -3152,6 +3152,7 @@ isc_result_t dhcp_failover_state_signal (omapi_object_t *,
isc_result_t dhcp_failover_state_transition (dhcp_failover_state_t *,
const char *);
isc_result_t dhcp_failover_set_service_state (dhcp_failover_state_t *state);
void dhcp_failover_rescind_updates (dhcp_failover_state_t *);
isc_result_t dhcp_failover_set_state (dhcp_failover_state_t *,
enum failover_state);
isc_result_t dhcp_failover_peer_state_changed (dhcp_failover_state_t *,
......
......@@ -1519,8 +1519,16 @@ isc_result_t dhcp_failover_state_transition (dhcp_failover_state_t *state,
/* In these situations, we remain in the current
* state, or if in startup enter those states.
*/
case communications_interrupted:
case conflict_done:
/* As the peer may not have received or may have
* lost track of updates we sent previously we
* rescind them, causing us to retransmit them
* on an update request.
*/
dhcp_failover_rescind_updates(state);
/* fall through */
case communications_interrupted:
case partner_down:
case paused:
case recover:
......@@ -1703,6 +1711,52 @@ isc_result_t dhcp_failover_set_service_state (dhcp_failover_state_t *state)
return ISC_R_SUCCESS;
}
/*!
* \brief Return any leases on the ack queue back to the update queue
*
* Re-schedule any pending updates by moving them from the ack queue
* (update sent awaiting response) back to the update queue (need to
* send an update for this lease). This will result in a retransmission
* of the update.
*
* \param state is the state block for the failover connection we are
* updating.
*/
void dhcp_failover_rescind_updates (dhcp_failover_state_t *state)
{
struct lease *lp;
if (state->ack_queue_tail == NULL)
return;
/* Zap the flags. */
for (lp = state->ack_queue_head; lp; lp = lp->next_pending)
lp->flags = ((lp->flags & ~ON_ACK_QUEUE) | ON_UPDATE_QUEUE);
/* Now hook the ack queue to the beginning of the update queue. */
if (state->update_queue_head) {
lease_reference(&state->ack_queue_tail->next_pending,
state->update_queue_head, MDL);
lease_dereference(&state->update_queue_head, MDL);
}
lease_reference(&state->update_queue_head, state->ack_queue_head, MDL);
if (!state->update_queue_tail) {
#if defined (POINTER_DEBUG)
if (state->ack_queue_tail->next_pending) {
log_error("next pending on ack queue tail.");
abort();
}
#endif
lease_reference(&state->update_queue_tail,
state->ack_queue_tail, MDL);
}
lease_dereference(&state->ack_queue_tail, MDL);
lease_dereference(&state->ack_queue_head, MDL);
state->cur_unacked_updates = 0;
}
isc_result_t dhcp_failover_set_state (dhcp_failover_state_t *state,
enum failover_state new_state)
{
......@@ -1721,37 +1775,9 @@ isc_result_t dhcp_failover_set_state (dhcp_failover_state_t *state,
case normal:
case potential_conflict:
case partner_down:
if (state -> ack_queue_tail) {
struct lease *lp;
/* Zap the flags. */
for (lp = state -> ack_queue_head; lp; lp = lp -> next_pending)
lp -> flags = ((lp -> flags & ~ON_ACK_QUEUE) |
ON_UPDATE_QUEUE);
/* Now hook the ack queue to the beginning of the update
queue. */
if (state -> update_queue_head) {
lease_reference (&state -> ack_queue_tail -> next_pending,
state -> update_queue_head, MDL);
lease_dereference (&state -> update_queue_head, MDL);
}
lease_reference (&state -> update_queue_head,
state -> ack_queue_head, MDL);
if (!state -> update_queue_tail) {
#if defined (POINTER_DEBUG)
if (state -> ack_queue_tail -> next_pending) {
log_error ("next pending on ack queue tail.");
abort ();
}
#endif
lease_reference (&state -> update_queue_tail,
state -> ack_queue_tail, MDL);
}
lease_dereference (&state -> ack_queue_tail, MDL);
lease_dereference (&state -> ack_queue_head, MDL);
state -> cur_unacked_updates = 0;
}
/* Move the ack queue to the update queue */
dhcp_failover_rescind_updates(state);
/* We will re-queue a timeout later, if applicable. */
cancel_timeout (dhcp_failover_keepalive, state);
break;
......@@ -1826,7 +1852,9 @@ isc_result_t dhcp_failover_set_state (dhcp_failover_state_t *state,
break;
case potential_conflict:
if (state -> i_am == primary)
if ((state->i_am == primary) ||
((state->i_am == secondary) &&
(state->partner.state == conflict_done)))
dhcp_failover_send_update_request (state);
break;
......@@ -1927,7 +1955,18 @@ isc_result_t dhcp_failover_peer_state_changed (dhcp_failover_state_t *state,
if (state -> partner.state == new_state && state -> me.state) {
switch (state -> me.state) {
case startup:
dhcp_failover_set_state (state, state -> saved_state);
/*
* If we have a peer state we must be connected.
* If so we should move to potential_conflict
* instead of resolution_interrupted, otherwise
* back to whereever we were before we stopped.
*/
if (state->saved_state == resolution_interrupted)
dhcp_failover_set_state(state,
potential_conflict);
else
dhcp_failover_set_state(state,
state->saved_state);
return ISC_R_SUCCESS;
case unknown_state:
......@@ -2146,6 +2185,17 @@ isc_result_t dhcp_failover_peer_state_changed (dhcp_failover_state_t *state,
dhcp_failover_set_state(state, new_state);
break;
case potential_conflict:
case resolution_interrupted:
/*
* This can happen when the connection is lost and
* recovered after the primary has moved to
* conflict-done but the secondary is still in
* potential-conflict. In that case, we have to
* remain in conflict-done.
*/
break;
default:
log_fatal("Peer %s: Invalid attempt to move from %s "
"to %s while local state is conflict-done.",
......@@ -4780,35 +4830,42 @@ isc_result_t dhcp_failover_send_update_request (dhcp_failover_state_t *state)
# define FMA (char *)0, (unsigned *)0, 0
#endif
if (!state -> link_to_peer ||
state -> link_to_peer -> type != dhcp_type_failover_link)
return ISC_R_INVALIDARG;
link = (dhcp_failover_link_t *)state -> link_to_peer;
if (!state->link_to_peer ||
state->link_to_peer->type != dhcp_type_failover_link)
return (ISC_R_INVALIDARG);
link = (dhcp_failover_link_t *)state->link_to_peer;
if (!link -> outer || link -> outer -> type != omapi_type_connection)
return ISC_R_INVALIDARG;
if (!link->outer || link->outer->type != omapi_type_connection)
return (ISC_R_INVALIDARG);
if (state -> curUPD)
return ISC_R_ALREADYRUNNING;
/* We allow an update to be restarted in case we requested an update
* and were interrupted by something. If we had an ALL going we need
* to restart that. Otherwise we simply continue with the request */
if (state->curUPD == FTM_UPDREQALL) {
return (dhcp_failover_send_update_request_all(state));
}
status = (dhcp_failover_put_message
(link, link -> outer,
FTM_UPDREQ, link->xid++,
(failover_option_t *)0));
status = (dhcp_failover_put_message(link, link->outer, FTM_UPDREQ,
link->xid++, NULL));
if (status == ISC_R_SUCCESS)
state -> curUPD = FTM_UPDREQ;
state->curUPD = FTM_UPDREQ;
#if defined (DEBUG_FAILOVER_MESSAGES)
if (status != ISC_R_SUCCESS)
failover_print (FMA, " (failed)");
failover_print (FMA, ")");
failover_print(FMA, " (failed)");
failover_print(FMA, ")");
if (obufix) {
log_debug ("%s", obuf);
log_debug("%s", obuf);
}
#endif
log_info ("Sent update request message to %s", state -> name);
return status;
if (status == ISC_R_SUCCESS) {
log_info("Sent update request message to %s", state->name);
} else {
log_error("Failed to send update request all message to %s: %s",
state->name, isc_result_totext(status));
}
return (status);
}
isc_result_t dhcp_failover_send_update_request_all (dhcp_failover_state_t
......@@ -4826,36 +4883,39 @@ isc_result_t dhcp_failover_send_update_request_all (dhcp_failover_state_t
# define FMA (char *)0, (unsigned *)0, 0
#endif
if (!state -> link_to_peer ||
state -> link_to_peer -> type != dhcp_type_failover_link)
return ISC_R_INVALIDARG;
link = (dhcp_failover_link_t *)state -> link_to_peer;
if (!state->link_to_peer ||
state->link_to_peer->type != dhcp_type_failover_link)
return (ISC_R_INVALIDARG);
link = (dhcp_failover_link_t *)state->link_to_peer;
if (!link -> outer || link -> outer -> type != omapi_type_connection)
return ISC_R_INVALIDARG;
if (!link->outer || link->outer->type != omapi_type_connection)
return (ISC_R_INVALIDARG);
/* If there is an UPDREQ in progress, then upgrade to UPDREQALL. */
if (state -> curUPD && (state -> curUPD != FTM_UPDREQ))
return ISC_R_ALREADYRUNNING;
/* We allow an update to be restarted in case we requested an update
* and were interrupted by something.
*/
status = (dhcp_failover_put_message
(link, link -> outer,
FTM_UPDREQALL, link->xid++,
(failover_option_t *)0));
status = (dhcp_failover_put_message(link, link->outer, FTM_UPDREQALL,
link->xid++, NULL));
if (status == ISC_R_SUCCESS)
state -> curUPD = FTM_UPDREQALL;
state->curUPD = FTM_UPDREQALL;
#if defined (DEBUG_FAILOVER_MESSAGES)
if (status != ISC_R_SUCCESS)
failover_print (FMA, " (failed)");
failover_print (FMA, ")");
failover_print(FMA, " (failed)");
failover_print(FMA, ")");
if (obufix) {
log_debug ("%s", obuf);
log_debug("%s", obuf);
}
#endif
log_info ("Sent update request all message to %s", state -> name);
return status;
if (status == ISC_R_SUCCESS) {
log_info("Sent update request all message to %s", state->name);
} else {
log_error("Failed to send update request all message to %s: %s",
state->name, isc_result_totext(status));
}
return (status);
}
isc_result_t dhcp_failover_send_update_done (dhcp_failover_state_t *state)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment