From 661c784662cd755463e709f2f1ea96d0df3de70d Mon Sep 17 00:00:00 2001 From: Lei Yang Date: Thu, 4 Jan 2024 20:24:10 -0800 Subject: [PATCH] HDFS-17290: Adds disconnected client rpc backoff metrics (#6359) --- .../java/org/apache/hadoop/ipc/Server.java | 7 +++++++ .../apache/hadoop/ipc/metrics/RpcMetrics.java | 18 ++++++++++++++++++ .../hadoop-common/src/site/markdown/Metrics.md | 1 + .../java/org/apache/hadoop/ipc/TestRPC.java | 1 + 4 files changed, 27 insertions(+) diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/Server.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/Server.java index 53497e9707..f33f5dc4a3 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/Server.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/Server.java @@ -3133,6 +3133,13 @@ private void internalQueueCall(Call call, boolean blocking) // For example, IPC clients using FailoverOnNetworkExceptionRetry handle // RetriableException. rpcMetrics.incrClientBackoff(); + // Clients that are directly put into lowest priority queue are backed off and disconnected. + if (cqe.getCause() instanceof RpcServerException) { + RpcServerException ex = (RpcServerException) cqe.getCause(); + if (ex.getRpcStatusProto() == RpcStatusProto.FATAL) { + rpcMetrics.incrClientBackoffDisconnected(); + } + } // unwrap retriable exception. 
throw cqe.getCause(); } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/metrics/RpcMetrics.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/metrics/RpcMetrics.java index b9be973204..2ca6693e10 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/metrics/RpcMetrics.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ipc/metrics/RpcMetrics.java @@ -141,6 +141,8 @@ public static RpcMetrics create(Server server, Configuration conf) { MutableCounterLong rpcAuthorizationSuccesses; @Metric("Number of client backoff requests") MutableCounterLong rpcClientBackoff; + @Metric("Number of disconnected client backoff requests") + MutableCounterLong rpcClientBackoffDisconnected; @Metric("Number of slow RPC calls") MutableCounterLong rpcSlowCalls; @Metric("Number of requeue calls") @@ -342,6 +344,22 @@ public void incrClientBackoff() { rpcClientBackoff.incr(); } + /** + * Increments the count of client backoff requests where the client was disconnected. + */ + public void incrClientBackoffDisconnected() { + rpcClientBackoffDisconnected.incr(); + } + + /** + * Returns the number of client backoff requests where the client was disconnected. + * @return the current value of the disconnected client backoff counter + */ + public long getClientBackoffDisconnected() { + return rpcClientBackoffDisconnected.value(); + } + + /** * Increments the Slow RPC counter. 
*/ diff --git a/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md b/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md index 10f5624a76..cb2b48d60f 100644 --- a/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md +++ b/hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md @@ -87,6 +87,7 @@ The default timeunit used for RPC metrics is milliseconds (as per the below desc | `RpcAuthorizationFailures` | Total number of authorization failures | | `RpcAuthorizationSuccesses` | Total number of authorization successes | | `RpcClientBackoff` | Total number of client backoff requests | +| `RpcClientBackoffDisconnected` | Total number of client backoff requests for which the client was disconnected. This is a subset of `RpcClientBackoff` | | `RpcSlowCalls` | Total number of slow RPC calls | | `RpcRequeueCalls` | Total number of requeue RPC calls | | `RpcCallsSuccesses` | Total number of RPC calls that are successfully processed | diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ipc/TestRPC.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ipc/TestRPC.java index 88d9204f69..f9b03721b5 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ipc/TestRPC.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ipc/TestRPC.java @@ -1528,6 +1528,7 @@ public Void call() throws ServiceException, InterruptedException { IOException unwrapExeption = re.unwrapRemoteException(); if (unwrapExeption instanceof RetriableException) { succeeded = true; + assertEquals(1L, server.getRpcMetrics().getClientBackoffDisconnected()); } else { lastException = unwrapExeption; }