From 81192e4e415d359ca832eff50d6f64c3da7acb73 Mon Sep 17 00:00:00 2001
From: Todd Lipcon <todd@apache.org>
Date: Wed, 27 Mar 2013 18:42:43 +0000
Subject: [PATCH] HDFS-4621. Additional logging to help diagnose slow QJM
 syncs. Contributed by Todd Lipcon.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1461777 13f79535-47bb-0310-9956-ffa450edef68
---
 hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt     |  2 ++
 .../qjournal/client/IPCLoggerChannel.java       |  8 ++++++
 .../hdfs/qjournal/client/QuorumCall.java        | 28 +++++++++++++++++++
 .../hadoop/hdfs/qjournal/server/Journal.java    |  8 ++++++
 4 files changed, 46 insertions(+)

diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
index b79493c676..856a1c74cf 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
+++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
@@ -371,6 +371,8 @@ Release 2.0.5-beta - UNRELEASED
 
     HDFS-4635. Move BlockManager#computeCapacity to LightWeightGSet. (suresh)
 
+    HDFS-4621. Additional logging to help diagnose slow QJM syncs. (todd)
+
   OPTIMIZATIONS
 
   BUG FIXES
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/client/IPCLoggerChannel.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/client/IPCLoggerChannel.java
index 1c82af858c..c64acd85d7 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/client/IPCLoggerChannel.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/client/IPCLoggerChannel.java
@@ -133,6 +133,8 @@ public class IPCLoggerChannel implements AsyncLogger {
   private Stopwatch lastHeartbeatStopwatch = new Stopwatch();
 
   private static final long HEARTBEAT_INTERVAL_MILLIS = 1000;
+
+  private static final long WARN_JOURNAL_MILLIS_THRESHOLD = 1000;
 
   static final Factory FACTORY = new AsyncLogger.Factory() {
     @Override
@@ -371,6 +373,12 @@ public Void call() throws IOException {
               now - submitNanos, TimeUnit.NANOSECONDS);
           metrics.addWriteEndToEndLatency(endToEndTime);
           metrics.addWriteRpcLatency(rpcTime);
+          if (rpcTime / 1000 > WARN_JOURNAL_MILLIS_THRESHOLD) {
+            QuorumJournalManager.LOG.warn(
+                "Took " + (rpcTime / 1000) + "ms to send a batch of " +
+                numTxns + " edits (" + data.length + " bytes) to " +
+                "remote journal " + IPCLoggerChannel.this);
+          }
         }
         synchronized (IPCLoggerChannel.this) {
           highestAckedTxId = firstTxnId + numTxns - 1;
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/client/QuorumCall.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/client/QuorumCall.java
index c42f6b917a..f15e4626f4 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/client/QuorumCall.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/client/QuorumCall.java
@@ -24,6 +24,7 @@
 import org.apache.hadoop.ipc.RemoteException;
 import org.apache.hadoop.util.Time;
 
+import com.google.common.base.Joiner;
 import com.google.common.base.Preconditions;
 import com.google.common.collect.Maps;
 import com.google.common.util.concurrent.FutureCallback;
@@ -120,6 +121,15 @@ public synchronized void waitFor(
       String msg = String.format(
           "Waited %s ms (timeout=%s ms) for a response for %s",
           waited, millis, operationName);
+      if (!successes.isEmpty()) {
+        msg += ". Succeeded so far: [" + Joiner.on(",").join(successes.keySet()) + "]";
+      }
+      if (!exceptions.isEmpty()) {
Exceptions so far: [" + getExceptionMapString() + "]"; + } + if (successes.isEmpty() && exceptions.isEmpty()) { + msg += ". No responses yet."; + } if (waited > millis * WAIT_PROGRESS_WARN_THRESHOLD) { QuorumJournalManager.LOG.warn(msg); } else { @@ -227,4 +237,22 @@ public static String mapToString( } return sb.toString(); } + + /** + * Return a string suitable for displaying to the user, containing + * any exceptions that have been received so far. + */ + private String getExceptionMapString() { + StringBuilder sb = new StringBuilder(); + boolean first = true; + for (Map.Entry e : exceptions.entrySet()) { + if (!first) { + sb.append(", "); + } + first = false; + sb.append(e.getKey()).append(": ") + .append(e.getValue().getLocalizedMessage()); + } + return sb.toString(); + } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/server/Journal.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/server/Journal.java index e6b179009c..58965ce02a 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/server/Journal.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/server/Journal.java @@ -128,6 +128,10 @@ class Journal implements Closeable { private final JournalMetrics metrics; + /** + * Time threshold for sync calls, beyond which a warning should be logged to the console. + */ + private static final int WARN_SYNC_MILLIS_THRESHOLD = 1000; Journal(File logDir, String journalId, StorageErrorReporter errorReporter) throws IOException { @@ -370,6 +374,10 @@ synchronized void journal(RequestInfo reqInfo, sw.stop(); metrics.addSync(sw.elapsedTime(TimeUnit.MICROSECONDS)); + if (sw.elapsedTime(TimeUnit.MILLISECONDS) > WARN_SYNC_MILLIS_THRESHOLD) { + LOG.warn("Sync of transaction range " + firstTxnId + "-" + lastTxnId + + " took " + sw.elapsedTime(TimeUnit.MILLISECONDS) + "ms"); + } if (isLagging) { // This batch of edits has already been committed on a quorum of other