diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.HDFS-3077.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.HDFS-3077.txt index 065a21edfb..0a47ee2f14 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.HDFS-3077.txt +++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.HDFS-3077.txt @@ -54,3 +54,5 @@ HDFS-3893. QJM: Make QJM work with security enabled. (atm) HDFS-3897. QJM: TestBlockToken fails after HDFS-3893. (atm) HDFS-3898. QJM: enable TCP_NODELAY for IPC (todd) + +HDFS-3885. QJM: optimize log sync when JN is lagging behind (todd) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/contrib/bkjournal/src/main/java/org/apache/hadoop/contrib/bkjournal/BookKeeperEditLogOutputStream.java b/hadoop-hdfs-project/hadoop-hdfs/src/contrib/bkjournal/src/main/java/org/apache/hadoop/contrib/bkjournal/BookKeeperEditLogOutputStream.java index 6267871aae..4cc4b2d960 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/contrib/bkjournal/src/main/java/org/apache/hadoop/contrib/bkjournal/BookKeeperEditLogOutputStream.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/contrib/bkjournal/src/main/java/org/apache/hadoop/contrib/bkjournal/BookKeeperEditLogOutputStream.java @@ -84,7 +84,7 @@ public void create() throws IOException { @Override public void close() throws IOException { setReadyToFlush(); - flushAndSync(); + flushAndSync(true); try { lh.close(); } catch (InterruptedException ie) { @@ -130,7 +130,7 @@ public void setReadyToFlush() throws IOException { } @Override - public void flushAndSync() throws IOException { + public void flushAndSync(boolean durable) throws IOException { assert(syncLatch != null); try { syncLatch.await(); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/client/QuorumOutputStream.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/client/QuorumOutputStream.java index 4ea95ee2a8..90633b4c5e 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/client/QuorumOutputStream.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/client/QuorumOutputStream.java @@ -77,7 +77,7 @@ public void setReadyToFlush() throws IOException { } @Override - protected void flushAndSync() throws IOException { + protected void flushAndSync(boolean durable) throws IOException { int numReadyBytes = buf.countReadyBytes(); if (numReadyBytes > 0) { int numReadyTxns = buf.countReadyTxns(); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/server/Journal.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/server/Journal.java index 61bb22c5a4..99ba41bea6 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/server/Journal.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/server/Journal.java @@ -301,17 +301,23 @@ synchronized void journal(RequestInfo reqInfo, if (LOG.isTraceEnabled()) { LOG.trace("Writing txid " + firstTxnId + "-" + lastTxnId); } + + // If the edit has already been marked as committed, we know + // it has been fsynced on a quorum of other nodes, and we are + // "catching up" with the rest. Hence we do not need to fsync. + boolean isLagging = lastTxnId <= committedTxnId.get(); + boolean shouldFsync = !isLagging; curSegment.writeRaw(records, 0, records.length); curSegment.setReadyToFlush(); Stopwatch sw = new Stopwatch(); sw.start(); - curSegment.flush(); + curSegment.flush(shouldFsync); sw.stop(); metrics.addSync(sw.elapsedTime(TimeUnit.MICROSECONDS)); - - if (committedTxnId.get() > lastTxnId) { + + if (isLagging) { // This batch of edits has already been committed on a quorum of other // nodes. So, we are in "catch up" mode. This gets its own metric. metrics.batchesWrittenWhileLagging.incr(1); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogBackupOutputStream.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogBackupOutputStream.java index 5a28f7c512..97cfe136e4 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogBackupOutputStream.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogBackupOutputStream.java @@ -114,7 +114,7 @@ public void setReadyToFlush() throws IOException { } @Override // EditLogOutputStream - protected void flushAndSync() throws IOException { + protected void flushAndSync(boolean durable) throws IOException { assert out.getLength() == 0 : "Output buffer is not empty"; int numReadyTxns = doubleBuf.countReadyTxns(); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogFileOutputStream.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogFileOutputStream.java index f7a8b337a6..fb11ae0a37 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogFileOutputStream.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogFileOutputStream.java @@ -176,7 +176,7 @@ public void setReadyToFlush() throws IOException { * accumulates new log records while readyBuffer will be flushed and synced. */ @Override - public void flushAndSync() throws IOException { + public void flushAndSync(boolean durable) throws IOException { if (fp == null) { throw new IOException("Trying to use aborted output stream"); } @@ -186,7 +186,7 @@ public void flushAndSync() throws IOException { } preallocate(); // preallocate file if necessay doubleBuf.flushTo(fp); - if (!shouldSkipFsyncForTests) { + if (durable && !shouldSkipFsyncForTests) { fc.force(false); // metadata updates not needed } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogOutputStream.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogOutputStream.java index ec418f532e..d5b7bffd10 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogOutputStream.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/EditLogOutputStream.java @@ -93,18 +93,24 @@ abstract public void writeRaw(byte[] bytes, int offset, int length) /** * Flush and sync all data that is ready to be flush * {@link #setReadyToFlush()} into underlying persistent store. + * @param durable if true, the edits should be made truly durable before + * returning * @throws IOException */ - abstract protected void flushAndSync() throws IOException; + abstract protected void flushAndSync(boolean durable) throws IOException; /** * Flush data to persistent store. * Collect sync metrics. */ public void flush() throws IOException { + flush(true); + } + + public void flush(boolean durable) throws IOException { numSync++; long start = now(); - flushAndSync(); + flushAndSync(durable); long end = now(); totalTimeSync += (end - start); } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/JournalSet.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/JournalSet.java index cbdb8a113d..8ed073d10d 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/JournalSet.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/JournalSet.java @@ -471,12 +471,12 @@ public void apply(JournalAndStream jas) throws IOException { } @Override - protected void flushAndSync() throws IOException { + protected void flushAndSync(final boolean durable) throws IOException { mapJournalsAndReportErrors(new JournalClosure() { @Override public void apply(JournalAndStream jas) throws IOException { if (jas.isActive()) { - jas.getCurrentStream().flushAndSync(); + jas.getCurrentStream().flushAndSync(durable); } } }, "flushAndSync"); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/offlineEditsViewer/BinaryEditsVisitor.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/offlineEditsViewer/BinaryEditsVisitor.java index 1911ef71d2..8c4fcbbbe3 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/offlineEditsViewer/BinaryEditsVisitor.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/tools/offlineEditsViewer/BinaryEditsVisitor.java @@ -56,7 +56,7 @@ public void start(int version) throws IOException { @Override public void close(Throwable error) throws IOException { elfos.setReadyToFlush(); - elfos.flushAndSync(); + elfos.flushAndSync(true); elfos.close(); } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestEditLog.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestEditLog.java index 06af8a9f1f..28e601b241 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestEditLog.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestEditLog.java @@ -1222,7 +1222,7 @@ static void validateNoCrash(byte garbage[]) throws IOException { elfos.create(); elfos.writeRaw(garbage, 0, garbage.length); elfos.setReadyToFlush(); - elfos.flushAndSync(); + elfos.flushAndSync(true); elfos.close(); elfos = null; file = new File(TEST_LOG_NAME); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestEditLogFileOutputStream.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestEditLogFileOutputStream.java index 22ab02d2a9..24446d655d 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestEditLogFileOutputStream.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestEditLogFileOutputStream.java @@ -55,7 +55,7 @@ public void deleteEditsFile() { static void flushAndCheckLength(EditLogFileOutputStream elos, long expectedLength) throws IOException { elos.setReadyToFlush(); - elos.flushAndSync(); + elos.flushAndSync(true); assertEquals(expectedLength, elos.getFile().length()); } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNameNodeRecovery.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNameNodeRecovery.java index 23fd3b51a7..a8dac5701e 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNameNodeRecovery.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNameNodeRecovery.java @@ -74,7 +74,7 @@ static void runEditLogTest(EditLogTestSetup elts) throws IOException { elts.addTransactionsToLog(elfos, cache); elfos.setReadyToFlush(); - elfos.flushAndSync(); + elfos.flushAndSync(true); elfos.close(); elfos = null; file = new File(TEST_LOG_NAME);