HDFS-17290: Adds disconnected client rpc backoff metrics (#6359)

This commit is contained in:
Lei Yang 2024-01-04 20:24:10 -08:00 committed by GitHub
parent 7d3b6a36b8
commit 661c784662
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 27 additions and 0 deletions

View File

@ -3133,6 +3133,13 @@ private void internalQueueCall(Call call, boolean blocking)
// For example, IPC clients using FailoverOnNetworkExceptionRetry handle // For example, IPC clients using FailoverOnNetworkExceptionRetry handle
// RetriableException. // RetriableException.
rpcMetrics.incrClientBackoff(); rpcMetrics.incrClientBackoff();
// Clients that are put directly into the lowest-priority queue are backed off and disconnected.
if (cqe.getCause() instanceof RpcServerException) {
RpcServerException ex = (RpcServerException) cqe.getCause();
if (ex.getRpcStatusProto() == RpcStatusProto.FATAL) {
rpcMetrics.incrClientBackoffDisconnected();
}
}
// unwrap retriable exception. // unwrap retriable exception.
throw cqe.getCause(); throw cqe.getCause();
} }

View File

@ -141,6 +141,8 @@ public static RpcMetrics create(Server server, Configuration conf) {
MutableCounterLong rpcAuthorizationSuccesses; MutableCounterLong rpcAuthorizationSuccesses;
@Metric("Number of client backoff requests") @Metric("Number of client backoff requests")
MutableCounterLong rpcClientBackoff; MutableCounterLong rpcClientBackoff;
@Metric("Number of disconnected client backoff requests")
MutableCounterLong rpcClientBackoffDisconnected;
@Metric("Number of slow RPC calls") @Metric("Number of slow RPC calls")
MutableCounterLong rpcSlowCalls; MutableCounterLong rpcSlowCalls;
@Metric("Number of requeue calls") @Metric("Number of requeue calls")
@ -342,6 +344,22 @@ public void incrClientBackoff() {
rpcClientBackoff.incr(); rpcClientBackoff.incr();
} }
/**
 * Increments the counter of clients that were disconnected due to backoff.
 */
public void incrClientBackoffDisconnected() {
  this.rpcClientBackoffDisconnected.incr();
}
/**
 * Returns the number of disconnected backoffs.
 *
 * @return the current value of the {@code rpcClientBackoffDisconnected}
 *         counter
 */
public long getClientBackoffDisconnected() {
  final long disconnectedBackoffs = this.rpcClientBackoffDisconnected.value();
  return disconnectedBackoffs;
}
/** /**
* Increments the Slow RPC counter. * Increments the Slow RPC counter.
*/ */

View File

@ -87,6 +87,7 @@ The default timeunit used for RPC metrics is milliseconds (as per the below desc
| `RpcAuthorizationFailures` | Total number of authorization failures | | `RpcAuthorizationFailures` | Total number of authorization failures |
| `RpcAuthorizationSuccesses` | Total number of authorization successes | | `RpcAuthorizationSuccesses` | Total number of authorization successes |
| `RpcClientBackoff` | Total number of client backoff requests | | `RpcClientBackoff` | Total number of client backoff requests |
| `RpcClientBackoffDisconnected` | Total number of client backoff requests in which the client was disconnected. This is a subset of `RpcClientBackoff` |
| `RpcSlowCalls` | Total number of slow RPC calls | | `RpcSlowCalls` | Total number of slow RPC calls |
| `RpcRequeueCalls` | Total number of requeue RPC calls | | `RpcRequeueCalls` | Total number of requeue RPC calls |
| `RpcCallsSuccesses` | Total number of RPC calls that are successfully processed | | `RpcCallsSuccesses` | Total number of RPC calls that are successfully processed |

View File

@ -1528,6 +1528,7 @@ public Void call() throws ServiceException, InterruptedException {
IOException unwrapExeption = re.unwrapRemoteException(); IOException unwrapExeption = re.unwrapRemoteException();
if (unwrapExeption instanceof RetriableException) { if (unwrapExeption instanceof RetriableException) {
succeeded = true; succeeded = true;
assertEquals(1L, server.getRpcMetrics().getClientBackoffDisconnected());
} else { } else {
lastException = unwrapExeption; lastException = unwrapExeption;
} }