diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
index 385a07d154..6e5d3f5f79 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
+++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
@@ -317,6 +317,9 @@ Trunk (Unreleased)
     HDFS-6374. setXAttr should require the user to be the owner of the file
     or directory (Charles Lamb via wang)
 
+    HDFS-6110. Add more slow-action logging in the critical write path
+    (Liang Xie via stack)
+
 Release 2.5.0 - UNRELEASED
 
   INCOMPATIBLE CHANGES
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSClient.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSClient.java
index 5c7095931a..4d2da799e1 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSClient.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSClient.java
@@ -276,6 +276,7 @@ public static class Conf {
     final int retryTimesForGetLastBlockLength;
     final int retryIntervalForGetLastBlockLength;
     final long datanodeRestartTimeout;
+    final long dfsclientSlowIoWarningThresholdMs;
 
     final boolean useLegacyBlockReader;
     final boolean useLegacyBlockReaderLocal;
@@ -430,6 +431,9 @@ public Conf(Configuration conf) {
       datanodeRestartTimeout = conf.getLong(
           DFS_CLIENT_DATANODE_RESTART_TIMEOUT_KEY,
           DFS_CLIENT_DATANODE_RESTART_TIMEOUT_DEFAULT) * 1000;
+      dfsclientSlowIoWarningThresholdMs = conf.getLong(
+          DFSConfigKeys.DFS_CLIENT_SLOW_IO_WARNING_THRESHOLD_KEY,
+          DFSConfigKeys.DFS_CLIENT_SLOW_IO_WARNING_THRESHOLD_DEFAULT);
     }
 
     public boolean isUseLegacyBlockReaderLocal() {
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java
index e9a435873c..5835f21131 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java
@@ -645,5 +645,12 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
   public static final int DFS_NFS_REGISTRATION_PORT_DEFAULT = 40; // Currently unassigned.
   public static final String DFS_NFS_ALLOW_INSECURE_PORTS_KEY = "dfs.nfs.allow.insecure.ports";
   public static final boolean DFS_NFS_ALLOW_INSECURE_PORTS_DEFAULT = true;
-
+
+  // Slow I/O warning log threshold settings for the DFSClient and the DataNode.
+  public static final String DFS_CLIENT_SLOW_IO_WARNING_THRESHOLD_KEY =
+      "dfs.client.slow.io.warning.threshold.ms";
+  public static final long DFS_CLIENT_SLOW_IO_WARNING_THRESHOLD_DEFAULT = 30000;
+  public static final String DFS_DATANODE_SLOW_IO_WARNING_THRESHOLD_KEY =
+      "dfs.datanode.slow.io.warning.threshold.ms";
+  public static final long DFS_DATANODE_SLOW_IO_WARNING_THRESHOLD_DEFAULT = 300;
 }
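
The two keys above resolve through Hadoop's standard Configuration lookup, so the compiled-in defaults only apply when no site configuration overrides them. A minimal sketch of that resolution, assuming only what the DFSConfigKeys hunk defines (the standalone class and main() are illustrative, not part of the patch):

    import org.apache.hadoop.conf.Configuration;

    public class SlowIoThresholds {
      public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Falls back to the compiled-in default when the key is unset.
        long clientMs = conf.getLong("dfs.client.slow.io.warning.threshold.ms", 30000L);
        // The datanode default is deliberately much lower: a single local
        // disk or mirror write should normally finish well under 300 ms.
        long datanodeMs = conf.getLong("dfs.datanode.slow.io.warning.threshold.ms", 300L);
        System.out.println("client=" + clientMs + "ms, datanode=" + datanodeMs + "ms");
      }
    }
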
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSOutputStream.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSOutputStream.java
index e2bc316f02..8ae750d8e8 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSOutputStream.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSOutputStream.java
@@ -122,6 +122,7 @@ public class DFSOutputStream extends FSOutputSummer
     implements Syncable, CanSetDropBehind {
   private static final int MAX_PACKETS = 80; // each packet 64K, total 5MB
   private final DFSClient dfsClient;
+  private final long dfsclientSlowLogThresholdMs;
   private Socket s;
   // closed is accessed by different threads under different locks.
   private volatile boolean closed = false;
@@ -788,11 +789,19 @@ public void run() {
           // process responses from datanodes.
           try {
             // read an ack from the pipeline
+            long begin = Time.monotonicNow();
             ack.readFields(blockReplyStream);
-            if (DFSClient.LOG.isDebugEnabled()) {
+            long duration = Time.monotonicNow() - begin;
+            if (duration > dfsclientSlowLogThresholdMs
+                && ack.getSeqno() != Packet.HEART_BEAT_SEQNO) {
+              DFSClient.LOG
+                  .warn("Slow ReadProcessor read fields took " + duration
+                      + "ms (threshold=" + dfsclientSlowLogThresholdMs + "ms); ack: "
+                      + ack + ", targets: " + Arrays.asList(targets));
+            } else if (DFSClient.LOG.isDebugEnabled()) {
               DFSClient.LOG.debug("DFSClient " + ack);
             }
-
+
             long seqno = ack.getSeqno();
             // processes response status from datanodes.
             for (int i = ack.getNumOfReplies()-1; i >= 0 && dfsClient.clientRunning; i--) {
@@ -1570,6 +1579,8 @@ private DFSOutputStream(DFSClient dfsClient, String src, Progressable progress,
     }
     this.checksum = checksum;
+    this.dfsclientSlowLogThresholdMs =
+        dfsClient.getConf().dfsclientSlowIoWarningThresholdMs;
   }
 
   /** Construct a new output stream for creating a file.
   */
@@ -2001,6 +2012,7 @@ private void waitForAckedSeqno(long seqno) throws IOException {
     if (DFSClient.LOG.isDebugEnabled()) {
       DFSClient.LOG.debug("Waiting for ack for: " + seqno);
     }
+    long begin = Time.monotonicNow();
     try {
       synchronized (dataQueue) {
        while (!closed) {
@@ -2020,6 +2032,11 @@ private void waitForAckedSeqno(long seqno) throws IOException {
      checkClosed();
    } catch (ClosedChannelException e) {
    }
+    long duration = Time.monotonicNow() - begin;
+    if (duration > dfsclientSlowLogThresholdMs) {
+      DFSClient.LOG.warn("Slow waitForAckedSeqno took " + duration
+          + "ms (threshold=" + dfsclientSlowLogThresholdMs + "ms)");
+    }
   }
 
   private synchronized void start() {
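
The heartbeat exclusion in the ReadProcessor hunk above matters: readFields() blocks until the next ack arrives, and on an idle pipeline that ack may be a heartbeat that legitimately shows up long after the timer started, so warning on it would be noise. A hypothetical distillation of that check (HEART_BEAT_SEQNO is assumed here to be -1, matching the sentinel the real Packet class uses):

    public final class AckWarnFilter {
      private static final long HEART_BEAT_SEQNO = -1L; // assumed sentinel value

      // Warn only when a real (non-heartbeat) ack crossed the threshold.
      static boolean shouldWarn(long durationMs, long thresholdMs, long ackSeqno) {
        return durationMs > thresholdMs && ackSeqno != HEART_BEAT_SEQNO;
      }

      public static void main(String[] args) {
        System.out.println(shouldWarn(45000, 30000, 17)); // true: slow real ack
        System.out.println(shouldWarn(45000, 30000, -1)); // false: idle heartbeat
      }
    }
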
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockReceiver.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockReceiver.java
index a7643b571e..e36d40ccae 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockReceiver.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockReceiver.java
@@ -69,7 +69,7 @@ class BlockReceiver implements Closeable {
 
   @VisibleForTesting
   static long CACHE_DROP_LAG_BYTES = 8 * 1024 * 1024;
-
+  private final long datanodeSlowLogThresholdMs;
   private DataInputStream in = null; // from where data are read
   private DataChecksum clientChecksum; // checksum used by client
   private DataChecksum diskChecksum; // checksum we write to disk
@@ -140,7 +140,7 @@ class BlockReceiver implements Closeable {
     this.isDatanode = clientname.length() == 0;
     this.isClient = !this.isDatanode;
     this.restartBudget = datanode.getDnConf().restartReplicaExpiry;
-
+    this.datanodeSlowLogThresholdMs = datanode.getDnConf().datanodeSlowIoWarningThresholdMs;
     //for datanode, we have
     //1: clientName.length() == 0, and
     //2: stage == null or PIPELINE_SETUP_CREATE
@@ -335,6 +335,7 @@ public void close() throws IOException {
    */
   void flushOrSync(boolean isSync) throws IOException {
     long flushTotalNanos = 0;
+    long begin = Time.monotonicNow();
     if (checksumOut != null) {
       long flushStartNanos = System.nanoTime();
       checksumOut.flush();
@@ -363,6 +364,12 @@ void flushOrSync(boolean isSync) throws IOException {
         datanode.metrics.incrFsyncCount();
       }
     }
+    long duration = Time.monotonicNow() - begin;
+    if (duration > datanodeSlowLogThresholdMs) {
+      LOG.warn("Slow flushOrSync took " + duration + "ms (threshold="
+          + datanodeSlowLogThresholdMs + "ms), isSync:" + isSync + ", flushTotalNanos="
+          + flushTotalNanos + "ns");
+    }
   }
 
   /**
@@ -488,8 +495,14 @@ private int receivePacket() throws IOException {
     //First write the packet to the mirror:
     if (mirrorOut != null && !mirrorError) {
       try {
+        long begin = Time.monotonicNow();
         packetReceiver.mirrorPacketTo(mirrorOut);
         mirrorOut.flush();
+        long duration = Time.monotonicNow() - begin;
+        if (duration > datanodeSlowLogThresholdMs) {
+          LOG.warn("Slow BlockReceiver write packet to mirror took " + duration
+              + "ms (threshold=" + datanodeSlowLogThresholdMs + "ms)");
+        }
       } catch (IOException e) {
         handleMirrorOutError(e);
       }
@@ -572,7 +585,13 @@ private int receivePacket() throws IOException {
           int numBytesToDisk = (int)(offsetInBlock-onDiskLen);
 
           // Write data to disk.
+          long begin = Time.monotonicNow();
           out.write(dataBuf.array(), startByteToDisk, numBytesToDisk);
+          long duration = Time.monotonicNow() - begin;
+          if (duration > datanodeSlowLogThresholdMs) {
+            LOG.warn("Slow BlockReceiver write data to disk took " + duration
+                + "ms (threshold=" + datanodeSlowLogThresholdMs + "ms)");
+          }
 
           // If this is a partial chunk, then verify that this is the only
           // chunk in the packet. Calculate new crc for this chunk.
@@ -638,6 +657,7 @@ private void manageWriterOsCache(long offsetInBlock) {
     try {
       if (outFd != null &&
           offsetInBlock > lastCacheManagementOffset + CACHE_DROP_LAG_BYTES) {
+        long begin = Time.monotonicNow();
        //
        // For SYNC_FILE_RANGE_WRITE, we want to sync from
        // lastCacheManagementOffset to a position "two windows ago"
@@ -670,6 +690,11 @@ private void manageWriterOsCache(long offsetInBlock) {
               NativeIO.POSIX.POSIX_FADV_DONTNEED);
         }
         lastCacheManagementOffset = offsetInBlock;
+        long duration = Time.monotonicNow() - begin;
+        if (duration > datanodeSlowLogThresholdMs) {
+          LOG.warn("Slow manageWriterOsCache took " + duration
+              + "ms (threshold=" + datanodeSlowLogThresholdMs + "ms)");
+        }
       }
     } catch (Throwable t) {
       LOG.warn("Error managing cache for writer of block " + block, t);
@@ -1299,9 +1324,15 @@ private void sendAckUpstreamUnprotected(PipelineAck ack, long seqno,
       replicaInfo.setBytesAcked(offsetInBlock);
     }
     // send my ack back to upstream datanode
+    long begin = Time.monotonicNow();
     replyAck.write(upstreamOut);
     upstreamOut.flush();
-    if (LOG.isDebugEnabled()) {
+    long duration = Time.monotonicNow() - begin;
+    if (duration > datanodeSlowLogThresholdMs) {
+      LOG.warn("Slow PacketResponder send ack to upstream took " + duration
+          + "ms (threshold=" + datanodeSlowLogThresholdMs + "ms), " + myString
+          + ", replyAck=" + replyAck);
+    } else if (LOG.isDebugEnabled()) {
       LOG.debug(myString + ", replyAck=" + replyAck);
     }
 
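
Every BlockReceiver hunk above repeats one idiom: snapshot a monotonic clock, perform the I/O, and warn when the elapsed time exceeds the threshold. The sketch below shows the idiom in isolation; the helper name is invented here, and System.nanoTime() stands in for Hadoop's Time.monotonicNow(), which is likewise monotonic (wall-clock time would be wrong because it can jump during NTP adjustments):

    import java.io.IOException;

    public class TimedIo {
      interface IoAction { void run() throws IOException; }

      static void runTimed(String label, long thresholdMs, IoAction action)
          throws IOException {
        long begin = System.nanoTime(); // monotonic, immune to clock jumps
        action.run();
        long durationMs = (System.nanoTime() - begin) / 1_000_000L;
        if (durationMs > thresholdMs) {
          System.err.println("Slow " + label + " took " + durationMs
              + "ms (threshold=" + thresholdMs + "ms)");
        }
      }

      public static void main(String[] args) throws IOException {
        runTimed("flushOrSync", 300, () -> { /* disk flush would go here */ });
      }
    }
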
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DNConf.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DNConf.java
index d72c3b692c..27dba1baca 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DNConf.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DNConf.java
@@ -79,6 +79,8 @@ public class DNConf {
   final long deleteReportInterval;
   final long initialBlockReportDelay;
   final long cacheReportInterval;
+  final long dfsclientSlowIoWarningThresholdMs;
+  final long datanodeSlowIoWarningThresholdMs;
   final int writePacketSize;
 
   final String minimumNameNodeVersion;
@@ -129,7 +131,14 @@ public DNConf(Configuration conf) {
         DFS_BLOCKREPORT_SPLIT_THRESHOLD_DEFAULT);
     this.cacheReportInterval = conf.getLong(DFS_CACHEREPORT_INTERVAL_MSEC_KEY,
         DFS_CACHEREPORT_INTERVAL_MSEC_DEFAULT);
-
+
+    this.dfsclientSlowIoWarningThresholdMs = conf.getLong(
+        DFSConfigKeys.DFS_CLIENT_SLOW_IO_WARNING_THRESHOLD_KEY,
+        DFSConfigKeys.DFS_CLIENT_SLOW_IO_WARNING_THRESHOLD_DEFAULT);
+    this.datanodeSlowIoWarningThresholdMs = conf.getLong(
+        DFSConfigKeys.DFS_DATANODE_SLOW_IO_WARNING_THRESHOLD_KEY,
+        DFSConfigKeys.DFS_DATANODE_SLOW_IO_WARNING_THRESHOLD_DEFAULT);
+
     long initBRDelay = conf.getLong(
         DFS_BLOCKREPORT_INITIAL_DELAY_KEY,
         DFS_BLOCKREPORT_INITIAL_DELAY_DEFAULT) * 1000L;
@@ -168,7 +177,7 @@ public DNConf(Configuration conf) {
         DFS_DATANODE_RESTART_REPLICA_EXPIRY_KEY,
         DFS_DATANODE_RESTART_REPLICA_EXPIRY_DEFAULT) * 1000L;
   }
-
+
   // We get minimumNameNodeVersion via a method so it can be mocked out in tests.
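
Since DNConf reads both thresholds straight from the Configuration, operators can tune them per cluster without code changes. A sketch of overriding them programmatically follows after the DNConf hunks; the values are illustrative, not recommendations from the patch:

    import org.apache.hadoop.conf.Configuration;

    public class TuneSlowIoWarnings {
      public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Surface datanode-side stalls earlier on a latency-sensitive cluster.
        conf.setLong("dfs.datanode.slow.io.warning.threshold.ms", 100L);
        // Quiet client-side warnings on a heavily loaded batch cluster.
        conf.setLong("dfs.client.slow.io.warning.threshold.ms", 60000L);
        System.out.println(conf.get("dfs.datanode.slow.io.warning.threshold.ms"));
      }
    }
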
  String getMinimumNameNodeVersion() {
    return this.minimumNameNodeVersion;
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml b/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml
index e1ca17d564..34b36a5c38 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml
@@ -1930,4 +1930,22 @@
   </description>
 </property>
 
+<property>
+  <name>dfs.client.slow.io.warning.threshold.ms</name>
+  <value>30000</value>
+  <description>The threshold in milliseconds at which we will log a slow
+    I/O warning in the DFSClient. By default, this parameter is set to 30000
+    milliseconds (30 seconds).
+  </description>
+</property>
+
+<property>
+  <name>dfs.datanode.slow.io.warning.threshold.ms</name>
+  <value>300</value>
+  <description>The threshold in milliseconds at which we will log a slow
+    I/O warning in a DataNode. By default, this parameter is set to 300
+    milliseconds.
+  </description>
+</property>
+
 </configuration>
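
The XML defaults above duplicate the compiled-in constants from DFSConfigKeys, and the two need to stay in sync so that documentation and behavior agree. A sanity-check sketch, assuming the HDFS jars and hdfs-default.xml are on the classpath:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hdfs.DFSConfigKeys;

    public class DefaultsInSyncCheck {
      public static void main(String[] args) {
        Configuration conf = new Configuration();
        conf.addResource("hdfs-default.xml"); // pull in the XML defaults
        long fromXml = conf.getLong(
            DFSConfigKeys.DFS_CLIENT_SLOW_IO_WARNING_THRESHOLD_KEY,
            DFSConfigKeys.DFS_CLIENT_SLOW_IO_WARNING_THRESHOLD_DEFAULT);
        // Both sources should agree on 30000 ms for the client threshold.
        if (fromXml != DFSConfigKeys.DFS_CLIENT_SLOW_IO_WARNING_THRESHOLD_DEFAULT) {
          throw new IllegalStateException("hdfs-default.xml and DFSConfigKeys disagree");
        }
        System.out.println("client threshold consistent: " + fromXml + "ms");
      }
    }
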