From 6ccf4fbf8a8374c289370f67b26ac05abad30ebc Mon Sep 17 00:00:00 2001
From: Haohui Mai
Date: Wed, 1 Apr 2015 16:54:46 -0700
Subject: [PATCH] HDFS-8008. Support client-side back off when the datanodes
 are congested. Contributed by Haohui Mai.

---
 hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt    |  3 +
 .../org/apache/hadoop/hdfs/DataStreamer.java   | 63 +++++++++++++++++++
 .../protocol/datatransfer/PipelineAck.java     |  4 ++
 .../hadoop/hdfs/TestDFSOutputStream.java       | 42 +++++++++++++
 4 files changed, 112 insertions(+)

diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
index 1d9e200044..34c05565cf 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
+++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
@@ -868,6 +868,9 @@ Release 2.7.0 - UNRELEASED
     HDFS-7742. Favoring decommissioning node for replication can cause a block
     to stay underreplicated for long periods (Nathan Roberts via kihwal)
 
+    HDFS-8008. Support client-side back off when the datanodes are congested.
+    (wheat9)
+
   OPTIMIZATIONS
 
     HDFS-7454. Reduce memory footprint for AclEntries in NameNode.
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DataStreamer.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DataStreamer.java
index 9c437babe8..6ff4c2427a 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DataStreamer.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DataStreamer.java
@@ -218,6 +218,13 @@ private static void releaseBuffer(List<DFSPacket> packets, ByteArrayManager bam)
   private boolean failPacket = false;
   private final long dfsclientSlowLogThresholdMs;
   private long artificialSlowdown = 0;
+  // List of congested data nodes. The stream will back off if the DataNodes
+  // are congested
+  private final ArrayList<DatanodeInfo> congestedNodes = new ArrayList<>();
+  private static final int CONGESTION_BACKOFF_MEAN_TIME_IN_MS = 5000;
+  private static final int CONGESTION_BACK_OFF_MAX_TIME_IN_MS =
+      CONGESTION_BACKOFF_MEAN_TIME_IN_MS * 10;
+  private int lastCongestionBackoffTime;
 
   private final LoadingCache<DatanodeInfo, DatanodeInfo> excludedNodes;
 
@@ -386,6 +393,11 @@ public void run() {
             one = createHeartbeatPacket();
             assert one != null;
           } else {
+            try {
+              backOffIfNecessary();
+            } catch (InterruptedException e) {
+              DFSClient.LOG.warn("Caught exception ", e);
+            }
             one = dataQueue.getFirst(); // regular data packet
             long parents[] = one.getTraceParents();
             if (parents.length > 0) {
@@ -815,9 +827,14 @@ public void run() {
           long seqno = ack.getSeqno();
 
           // processes response status from datanodes.
+          ArrayList<DatanodeInfo> congestedNodesFromAck = new ArrayList<>();
           for (int i = ack.getNumOfReplies()-1; i >=0  && dfsClient.clientRunning; i--) {
             final Status reply = PipelineAck.getStatusFromHeader(ack
                 .getHeaderFlag(i));
+            if (PipelineAck.getECNFromHeader(ack.getHeaderFlag(i)) ==
+                PipelineAck.ECN.CONGESTED) {
+              congestedNodesFromAck.add(targets[i]);
+            }
             // Restart will not be treated differently unless it is
             // the local node or the only one in the pipeline.
             if (PipelineAck.isRestartOOBStatus(reply) &&
@@ -839,6 +856,18 @@ public void run() {
             }
           }
 
+          if (!congestedNodesFromAck.isEmpty()) {
+            synchronized (congestedNodes) {
+              congestedNodes.clear();
+              congestedNodes.addAll(congestedNodesFromAck);
+            }
+          } else {
+            synchronized (congestedNodes) {
+              congestedNodes.clear();
+              lastCongestionBackoffTime = 0;
+            }
+          }
+
           assert seqno != PipelineAck.UNKOWN_SEQNO :
               "Ack for unknown seqno should be a failed ack: " + ack;
           if (seqno == DFSPacket.HEART_BEAT_SEQNO) {  // a heartbeat ack
@@ -1543,6 +1572,40 @@ private LocatedBlock locateFollowingBlock(DatanodeInfo[] excludedNodes)
     }
   }
 
+  /**
+   * This function sleeps for a certain amount of time when the writing
+   * pipeline is congested. The function calculates the time based on a
+   * decorrelated filter.
+   *
+   * @see
+   * <a href="http://www.awsarchitectureblog.com/2015/03/backoff.html">
+   *   http://www.awsarchitectureblog.com/2015/03/backoff.html</a>
+   */
+  private void backOffIfNecessary() throws InterruptedException {
+    int t = 0;
+    synchronized (congestedNodes) {
+      if (!congestedNodes.isEmpty()) {
+        StringBuilder sb = new StringBuilder("DataNode");
+        for (DatanodeInfo i : congestedNodes) {
+          sb.append(' ').append(i);
+        }
+        int range = Math.abs(lastCongestionBackoffTime * 3 -
+            CONGESTION_BACKOFF_MEAN_TIME_IN_MS);
+        int base = Math.min(lastCongestionBackoffTime * 3,
+            CONGESTION_BACKOFF_MEAN_TIME_IN_MS);
+        t = Math.min(CONGESTION_BACK_OFF_MAX_TIME_IN_MS,
+            (int)(base + Math.random() * range));
+        lastCongestionBackoffTime = t;
+        sb.append(" are congested. Backing off for ").append(t).append(" ms");
+        DFSClient.LOG.info(sb.toString());
+        congestedNodes.clear();
+      }
+    }
+    if (t != 0) {
+      Thread.sleep(t);
+    }
+  }
+
   /**
    * get the block this streamer is writing to
    *
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/datatransfer/PipelineAck.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/datatransfer/PipelineAck.java
index 9bd4115b59..a811f39ece 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/datatransfer/PipelineAck.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/datatransfer/PipelineAck.java
@@ -257,6 +257,10 @@ public static Status getStatusFromHeader(int header) {
     return StatusFormat.getStatus(header);
   }
 
+  public static ECN getECNFromHeader(int header) {
+    return StatusFormat.getECN(header);
+  }
+
   public static int setStatusForHeader(int old, Status status) {
     return StatusFormat.setStatus(old, status);
   }
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDFSOutputStream.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDFSOutputStream.java
index b47e7f1510..a410e74b13 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDFSOutputStream.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDFSOutputStream.java
@@ -17,20 +17,31 @@
  */
 package org.apache.hadoop.hdfs;
 
+import java.io.DataOutputStream;
 import java.io.IOException;
 import java.lang.reflect.Field;
 import java.lang.reflect.Method;
+import java.util.ArrayList;
+import java.util.LinkedList;
 import java.util.concurrent.atomic.AtomicReference;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
+import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
+import org.apache.hadoop.hdfs.protocol.HdfsFileStatus;
+import org.apache.hadoop.hdfs.protocol.datatransfer.BlockConstructionStage;
 
 import org.junit.AfterClass;
 import org.junit.Assert;
 import org.junit.BeforeClass;
 import org.junit.Test;
 import org.mockito.internal.util.reflection.Whitebox;
 
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.doThrow;
+import static org.mockito.Mockito.when;
+
 public class TestDFSOutputStream {
   static MiniDFSCluster cluster;
@@ -100,6 +111,37 @@ public void testComputePacketChunkSize()
     Assert.assertTrue((Integer) field.get(dos) + 257 < packetSize);
   }
 
+  @Test
+  public void testCongestionBackoff() throws IOException {
+    DFSClient.Conf dfsClientConf = mock(DFSClient.Conf.class);
+    DFSClient client = mock(DFSClient.class);
+    when(client.getConf()).thenReturn(dfsClientConf);
+    client.clientRunning = true;
+    DataStreamer stream = new DataStreamer(
+        mock(HdfsFileStatus.class),
+        mock(ExtendedBlock.class),
+        client,
+        "foo", null, null, null, null);
+
+    DataOutputStream blockStream = mock(DataOutputStream.class);
+    doThrow(new IOException()).when(blockStream).flush();
+    Whitebox.setInternalState(stream, "blockStream", blockStream);
+    Whitebox.setInternalState(stream, "stage",
+        BlockConstructionStage.PIPELINE_CLOSE);
+    @SuppressWarnings("unchecked")
+    LinkedList<DFSPacket> dataQueue = (LinkedList<DFSPacket>)
+        Whitebox.getInternalState(stream, "dataQueue");
+    @SuppressWarnings("unchecked")
+    ArrayList<DatanodeInfo> congestedNodes = (ArrayList<DatanodeInfo>)
+        Whitebox.getInternalState(stream, "congestedNodes");
+    congestedNodes.add(mock(DatanodeInfo.class));
+    DFSPacket packet = mock(DFSPacket.class);
+    when(packet.getTraceParents()).thenReturn(new long[] {});
+    dataQueue.add(packet);
+    stream.run();
+    Assert.assertTrue(congestedNodes.isEmpty());
+  }
+
   @AfterClass
   public static void tearDown() {
     cluster.shutdown();
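
Background note on the back-off added above: backOffIfNecessary() uses the
"decorrelated jitter" scheme described in the AWS Architecture Blog post linked
in its javadoc. Each sleep is drawn from a window derived from the previous
sleep (base = min(3 * previous, mean), range = |3 * previous - mean|) and is
capped at ten times the 5-second mean. Below is a minimal standalone sketch of
that calculation; the class and method names are illustrative only and are not
part of the patch.

    import java.util.Random;

    /** Illustrative sketch of the decorrelated-jitter back-off used by DataStreamer. */
    public class DecorrelatedBackoffSketch {
      private static final int MEAN_MS = 5000;         // mirrors CONGESTION_BACKOFF_MEAN_TIME_IN_MS
      private static final int MAX_MS = MEAN_MS * 10;  // mirrors CONGESTION_BACK_OFF_MAX_TIME_IN_MS
      private final Random random = new Random();
      private int lastBackoffMs = 0;

      /** Returns the next sleep time; grows roughly 3x per congested round, capped at MAX_MS. */
      int nextBackoffMs() {
        int range = Math.abs(lastBackoffMs * 3 - MEAN_MS);
        int base = Math.min(lastBackoffMs * 3, MEAN_MS);
        lastBackoffMs = Math.min(MAX_MS, (int) (base + random.nextDouble() * range));
        return lastBackoffMs;
      }

      public static void main(String[] args) {
        DecorrelatedBackoffSketch backoff = new DecorrelatedBackoffSketch();
        for (int round = 1; round <= 5; round++) {
          System.out.println("round " + round + ": back off " + backoff.nextBackoffMs() + " ms");
        }
      }
    }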