HDFS-10761: libhdfs++: Fix broken logic in HA retry policy. Contributed by James Clampffer

This commit is contained in:
James 2016-08-22 17:34:59 -04:00 committed by James Clampffer
parent b9cf0e932d
commit 05ddb31081
2 changed files with 28 additions and 33 deletions

View File

@ -56,20 +56,21 @@ RetryAction FixedDelayWithFailover::ShouldRetry(const Status &s, uint64_t retrie
(void)isIdempotentOrAtMostOnce;
LOG_TRACE(kRPC, << "FixedDelayWithFailover::ShouldRetry(retries=" << retries << ", failovers=" << failovers << ")");
if(s.code() == ::asio::error::timed_out && failovers < max_failover_retries_) {
if(failovers < max_failover_retries_ && (s.code() == ::asio::error::timed_out || s.get_server_exception_type() == Status::kStandbyException) )
{
// Try connecting to another NN in case this one keeps timing out
// Can add the backoff wait specified by dfs.client.failover.sleep.base.millis here
return RetryAction::failover(delay_);
}
if(retries < max_retries_) {
LOG_TRACE(kRPC, << "FixedDelayWithFailover::ShouldRetry: retries < max_retries_");
if(retries < max_retries_ && failovers < max_failover_retries_) {
LOG_TRACE(kRPC, << "FixedDelayWithFailover::ShouldRetry: retries < max_retries_ && failovers < max_failover_retries_");
return RetryAction::retry(delay_);
} else if (retries >= max_retries_ && failovers < max_failover_retries_) {
LOG_TRACE(kRPC, << "FixedDelayWithFailover::ShouldRetry: retries >= max_retries_ && failovers < max_failover_retries_");
return RetryAction::failover(delay_);
} else if (retries >= max_retries_ && failovers == max_failover_retries_) {
LOG_TRACE(kRPC, << "FixedDelayWithFailover::ShouldRetry: retries >= max_retries_ && failovers == max_failover_retries_");
} else if (retries <= max_retries_ && failovers == max_failover_retries_) {
LOG_TRACE(kRPC, << "FixedDelayWithFailover::ShouldRetry: retries <= max_retries_ && failovers == max_failover_retries_");
// 1 last retry on new connection
return RetryAction::retry(delay_);
}

View File

@ -291,11 +291,6 @@ void RpcEngine::RpcCommsError(
optional<RetryAction> head_action = optional<RetryAction>();
//We are talking to the Standby NN, let's talk to the active one instead.
if(ha_persisted_info_ && status.get_server_exception_type() == Status::kStandbyException) {
LOG_INFO(kRPC, << "Received StandbyException. Failing over.");
head_action = RetryAction::failover(std::max(0,options_.rpc_retry_delay_ms));
} else {
// Filter out anything with too many retries already
for (auto it = pendingRequests.begin(); it < pendingRequests.end();) {
auto req = *it;
@ -325,7 +320,6 @@ void RpcEngine::RpcCommsError(
++it;
}
}
}
// If we have requests that need to be re-sent, ensure that we have a connection
// and send the requests to it