From b53d19a343e110dbcf0ec710e9d491ec6bd77a51 Mon Sep 17 00:00:00 2001 From: avijayanhwx <14299376+avijayanhwx@users.noreply.github.com> Date: Tue, 3 Sep 2019 02:48:50 -0700 Subject: [PATCH] HDDS-1783 : Latency metric for applyTransaction in ContainerStateMachine (#1363). --- .../transport/server/ratis/CSMMetrics.java | 14 ++++++++++++++ .../server/ratis/ContainerStateMachine.java | 15 ++++++++++++--- .../main/compose/ozonesecure-mr/docker-config | 2 +- .../transport/server/ratis/TestCSMMetrics.java | 16 ++++++++++++++++ 4 files changed, 43 insertions(+), 4 deletions(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/CSMMetrics.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/CSMMetrics.java index 104a43368a..9893ae4834 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/CSMMetrics.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/CSMMetrics.java @@ -60,6 +60,9 @@ public class CSMMetrics { private @Metric MutableCounterLong numStartTransactionVerifyFailures; private @Metric MutableCounterLong numContainerNotOpenVerifyFailures; + private @Metric MutableRate applyTransaction; + private @Metric MutableRate writeStateMachineData; + public CSMMetrics() { int numCmdTypes = ContainerProtos.Type.values().length; this.opsLatency = new MutableRate[numCmdTypes]; @@ -186,6 +189,10 @@ public long getNumBytesCommittedCount() { return numBytesCommittedCount.value(); } + public MutableRate getApplyTransactionLatency() { + return applyTransaction; + } + public void incPipelineLatency(ContainerProtos.Type type, long latencyNanos) { opsLatency[type.ordinal()].add(latencyNanos); transactionLatency.add(latencyNanos); @@ -199,6 +206,13 @@ public void incNumContainerNotOpenVerifyFailures() { numContainerNotOpenVerifyFailures.incr(); } + public void recordApplyTransactionCompletion(long latencyNanos) { + applyTransaction.add(latencyNanos); + } + + public void recordWriteStateMachineCompletion(long latencyNanos) { + writeStateMachineData.add(latencyNanos); + } public void unRegister() { MetricsSystem ms = DefaultMetricsSystem.instance(); diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java index 4f876bc020..0780f84988 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java @@ -411,7 +411,8 @@ private ExecutorService getCommandExecutor( } private CompletableFuture handleWriteChunk( - ContainerCommandRequestProto requestProto, long entryIndex, long term) { + ContainerCommandRequestProto requestProto, long entryIndex, long term, + long startTime) { final WriteChunkRequestProto write = requestProto.getWriteChunk(); RaftServer server = ratisServer.getServer(); Preconditions.checkState(server instanceof RaftServerProxy); @@ -461,6 +462,8 @@ private CompletableFuture handleWriteChunk( write.getBlockID() + " logIndex " + entryIndex + " chunkName " + write.getChunkData().getChunkName()); raftFuture.complete(r::toByteString); + metrics.recordWriteStateMachineCompletion( + Time.monotonicNowNanos() - startTime); } writeChunkFutureMap.remove(entryIndex); @@ -477,6 +480,7 @@ private CompletableFuture handleWriteChunk( public CompletableFuture writeStateMachineData(LogEntryProto entry) { try { metrics.incNumWriteStateMachineOps(); + long writeStateMachineStartTime = Time.monotonicNowNanos(); ContainerCommandRequestProto requestProto = getContainerCommandRequestProto( entry.getStateMachineLogEntry().getLogData()); @@ -493,7 +497,7 @@ public CompletableFuture writeStateMachineData(LogEntryProto entry) { switch (cmdType) { case WriteChunk: return handleWriteChunk(requestProto, entry.getIndex(), - entry.getTerm()); + entry.getTerm(), writeStateMachineStartTime); default: throw new IllegalStateException("Cmd Type:" + cmdType + " should not have state machine data"); @@ -673,6 +677,7 @@ public CompletableFuture applyTransaction(TransactionContext trx) { .setTerm(trx.getLogEntry().getTerm()) .setLogIndex(index); + long applyTxnStartTime = Time.monotonicNowNanos(); try { applyTransactionSemaphore.acquire(); metrics.incNumApplyTransactionsOps(); @@ -740,7 +745,11 @@ public CompletableFuture applyTransaction(TransactionContext trx) { } } return applyTransactionFuture; - }).whenComplete((r, t) -> applyTransactionSemaphore.release()); + }).whenComplete((r, t) -> { + applyTransactionSemaphore.release(); + metrics.recordApplyTransactionCompletion( + Time.monotonicNowNanos() - applyTxnStartTime); + }); return applyTransactionFuture; } catch (IOException | InterruptedException e) { metrics.incNumApplyTransactionsFails(); diff --git a/hadoop-ozone/dist/src/main/compose/ozonesecure-mr/docker-config b/hadoop-ozone/dist/src/main/compose/ozonesecure-mr/docker-config index 6565eefeac..f5c5fbd230 100644 --- a/hadoop-ozone/dist/src/main/compose/ozonesecure-mr/docker-config +++ b/hadoop-ozone/dist/src/main/compose/ozonesecure-mr/docker-config @@ -47,7 +47,7 @@ HDFS-SITE.XML_dfs.datanode.address=0.0.0.0:1019 HDFS-SITE.XML_dfs.datanode.http.address=0.0.0.0:1012 CORE-SITE.XML_dfs.data.transfer.protection=authentication CORE-SITE.XML_hadoop.security.authentication=kerberos -COER-SITE.XML_hadoop.security.auth_to_local=RULE:[2:$1@$0](.*@EXAMPLE.COM)s/@.*///L +CORE-SITE.XML_hadoop.security.auth_to_local=RULE:[2:$1@$0](.*@EXAMPLE.COM)s/@.*///L CORE-SITE.XML_hadoop.security.key.provider.path=kms://http@kms:9600/kms #temporary disable authorization as org.apache.hadoop.yarn.server.api.ResourceTrackerPB is not properly annotated to support it diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/TestCSMMetrics.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/TestCSMMetrics.java index 4853059a52..8e3c9578fd 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/TestCSMMetrics.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/TestCSMMetrics.java @@ -19,6 +19,7 @@ package org.apache.hadoop.ozone.container.common.transport.server.ratis; import static org.apache.hadoop.test.MetricsAsserts.assertCounter; +import static org.apache.hadoop.test.MetricsAsserts.getDoubleGauge; import static org.apache.hadoop.test.MetricsAsserts.getMetrics; import java.io.File; @@ -49,6 +50,8 @@ import org.apache.hadoop.hdds.conf.OzoneConfiguration; import static org.apache.ratis.rpc.SupportedRpcType.GRPC; +import static org.junit.Assert.assertTrue; + import org.apache.ratis.protocol.RaftGroupId; import org.apache.ratis.util.function.CheckedBiConsumer; @@ -118,6 +121,12 @@ static void runContainerStateMachineMetrics( assertCounter("NumStartTransactionVerifyFailures", 0L, metric); assertCounter("NumContainerNotOpenVerifyFailures", 0L, metric); assertCounter("WriteChunkNumOps", 0L, metric); + double applyTransactionLatency = getDoubleGauge( + "ApplyTransactionAvgTime", metric); + assertTrue(applyTransactionLatency == 0.0); + double writeStateMachineLatency = getDoubleGauge( + "WriteStateMachineDataAvgTime", metric); + assertTrue(writeStateMachineLatency == 0.0); // Write Chunk BlockID blockID = ContainerTestHelper.getTestBlockID(ContainerTestHelper. @@ -152,6 +161,13 @@ static void runContainerStateMachineMetrics( RaftGroupId.valueOf(pipeline.getId().getId()).toString()); assertCounter("NumQueryStateMachineOps", 1L, metric); assertCounter("NumApplyTransactionOps", 1L, metric); + applyTransactionLatency = getDoubleGauge( + "ApplyTransactionAvgTime", metric); + assertTrue(applyTransactionLatency > 0.0); + writeStateMachineLatency = getDoubleGauge( + "WriteStateMachineDataAvgTime", metric); + assertTrue(writeStateMachineLatency > 0.0); + } finally { if (client != null) { client.close();