HDFS-3863. Track last "committed" txid in QJM. Contributed by Todd Lipcon.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/HDFS-3077@1380976 13f79535-47bb-0310-9956-ffa450edef68
Todd Lipcon 2012-09-05 04:13:19 +00:00
parent 6f5ae49063
commit 8021d9199f
16 changed files with 421 additions and 43 deletions

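In outline, the change threads one piece of state, the highest txid known to be durable on a quorum of JournalNodes, from the writer out to every logger: it is recomputed after each quorum-acknowledged batch, piggybacked on every subsequent RPC, persisted loosely on each JournalNode, and cross-checked during segment recovery. A rough, self-contained sketch of the writer-side bookkeeping (names loosely mirror QuorumOutputStream and IPCLoggerChannel in the hunks below; this is an illustration, not the actual implementation):

    // Sketch only: how the writer derives and remembers the committed txid.
    public class CommittedTxidSketch {
      // Stand-in for HdfsConstants.INVALID_TXID; means "nothing committed yet".
      static final long INVALID_TXID = -12345;

      private long committedTxId = INVALID_TXID;

      // Called once a batch starting at firstTxToFlush with numReadyTxns entries
      // has been acknowledged by a quorum of loggers.
      public void onQuorumAck(long firstTxToFlush, int numReadyTxns) {
        committedTxId = firstTxToFlush + numReadyTxns - 1;
      }

      // Every later request carries this value, so even a lagging JournalNode
      // learns how far the quorum has advanced.
      public long committedTxIdForNextRequest() {
        return committedTxId;
      }
    }
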
View File

@@ -36,3 +36,5 @@ HDFS-3839. QJM: hadoop-daemon.sh should be updated to accept "journalnode" (eli)
 HDFS-3845. Fixes for edge cases in QJM recovery protocol (todd)
 
 HDFS-3877. QJM: Provide defaults for dfs.journalnode.*address (eli)
+
+HDFS-3863. Track last "committed" txid in QJM (todd)

View File

@@ -123,6 +123,13 @@ public ListenableFuture<Void> acceptRecovery(SegmentStateProto log,
    */
   public void setEpoch(long e);
 
+  /**
+   * Let the logger know the highest committed txid across all loggers in the
+   * set. This txid may be higher than the last committed txid for <em>this</em>
+   * logger. See HDFS-3863 for details.
+   */
+  public void setCommittedTxId(long txid);
+
   /**
    * Build an HTTP URL to fetch the log segment with the given startTxId.
    */

View File

@@ -99,6 +99,18 @@ private void setEpoch(long e) {
     }
   }
 
+  /**
+   * Set the highest successfully committed txid seen by the writer.
+   * This should be called after a successful write to a quorum, and is used
+   * for extra sanity checks against the protocol. See HDFS-3863.
+   */
+  public void setCommittedTxId(long txid) {
+    for (AsyncLogger logger : loggers) {
+      logger.setCommittedTxId(txid);
+    }
+  }
+
   /**
    * @return the epoch number for this writer. This may only be called after
    * a successful call to {@link #createNewUniqueEpoch(NamespaceInfo)}.

View File

@@ -30,6 +30,7 @@
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hdfs.DFSConfigKeys;
+import org.apache.hadoop.hdfs.protocol.HdfsConstants;
 import org.apache.hadoop.hdfs.protocolPB.PBHelper;
 import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocol;
 import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.GetEditLogManifestResponseProto;
@@ -73,6 +74,8 @@ public class IPCLoggerChannel implements AsyncLogger {
   private final ListeningExecutorService executor;
   private long ipcSerial = 0;
   private long epoch = -1;
+  private long committedTxId = HdfsConstants.INVALID_TXID;
+
   private final String journalId;
   private final NamespaceInfo nsInfo;
   private int httpPort = -1;
@@ -115,11 +118,20 @@ public IPCLoggerChannel(Configuration conf,
     executor = MoreExecutors.listeningDecorator(
         createExecutor());
   }
 
   @Override
   public synchronized void setEpoch(long epoch) {
     this.epoch = epoch;
   }
+
+  @Override
+  public synchronized void setCommittedTxId(long txid) {
+    Preconditions.checkArgument(txid >= committedTxId,
+        "Trying to move committed txid backwards in client " +
+        "old: %s new: %s", committedTxId, txid);
+    this.committedTxId = txid;
+  }
 
   @Override
   public void close() {
     // No more tasks may be submitted after this point.
@@ -183,7 +195,8 @@ public URL buildURLToFetchLogs(long segmentTxId) {
   private synchronized RequestInfo createReqInfo() {
     Preconditions.checkState(epoch > 0, "bad epoch: " + epoch);
-    return new RequestInfo(journalId, epoch, ipcSerial++);
+    return new RequestInfo(journalId, epoch, ipcSerial++,
+        committedTxId);
   }
 
   @VisibleForTesting

View File

@@ -265,6 +265,21 @@ private void recoverUnclosedSegment(long segmentTxId) throws IOException {
     SegmentStateProto logToSync = bestResponse.getSegmentState();
     assert segmentTxId == logToSync.getStartTxId();
 
+    // Sanity check: none of the loggers should be aware of a higher
+    // txid than the txid we intend to truncate to
+    for (Map.Entry<AsyncLogger, PrepareRecoveryResponseProto> e :
+        prepareResponses.entrySet()) {
+      AsyncLogger logger = e.getKey();
+      PrepareRecoveryResponseProto resp = e.getValue();
+      if (resp.hasLastCommittedTxId() &&
+          resp.getLastCommittedTxId() > logToSync.getEndTxId()) {
+        throw new AssertionError("Decided to synchronize log to " + logToSync +
+            " but logger " + logger + " had seen txid " +
+            resp.getLastCommittedTxId() + " committed");
+      }
+    }
+
     URL syncFromUrl = bestLogger.buildURLToFetchLogs(segmentTxId);
 
     QuorumCall<AsyncLogger,Void> accept = loggers.acceptRecovery(logToSync, syncFromUrl);

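The new loop above enforces the invariant that recovery never settles on a segment shorter than something a logger has already reported as committed. A self-contained illustration of the same containment test, with made-up txid values (the real code iterates PrepareRecoveryResponseProto entries rather than a plain array):

    // Illustration only: if any logger reported a committed txid beyond the end
    // of the segment chosen for synchronization, recovery must abort rather than
    // silently truncate committed transactions.
    public class RecoverySanityCheckSketch {
      static void check(long chosenSegmentEndTxId, long[] lastCommittedPerLogger) {
        for (long lastCommitted : lastCommittedPerLogger) {
          if (lastCommitted > chosenSegmentEndTxId) {
            throw new AssertionError("logger saw txid " + lastCommitted +
                " committed, but recovery chose a segment ending at " +
                chosenSegmentEndTxId);
          }
        }
      }

      public static void main(String[] args) {
        check(150, new long[] { 120, 150, 150 }); // fine: nothing past 150 committed
        check(120, new long[] { 120, 150, 150 }); // throws: txid 150 was committed
      }
    }
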
View File

@@ -102,6 +102,11 @@ protected void flushAndSync() throws IOException {
           segmentTxId, firstTxToFlush,
           numReadyTxns, data);
       loggers.waitForWriteQuorum(qcall, 20000); // TODO: configurable timeout
+
+      // Since we successfully wrote this batch, let the loggers know. Any future
+      // RPCs will thus let the loggers know of the most recent transaction, even
+      // if a logger has fallen behind.
+      loggers.setCommittedTxId(firstTxToFlush + numReadyTxns - 1);
     }
   }
 }

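The commit point passed to setCommittedTxId() is simply the last txid of the batch that just reached a quorum, i.e. firstTxToFlush + numReadyTxns - 1. A tiny worked example with illustrative numbers (not taken from the commit):

    // Illustrative values only: a batch of 25 txns starting at txid 101.
    public class CommitPointExample {
      public static void main(String[] args) {
        long firstTxToFlush = 101;
        int numReadyTxns = 25;
        long committed = firstTxToFlush + numReadyTxns - 1;
        System.out.println("committed txid = " + committed); // prints 125
      }
    }
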
View File

@@ -18,17 +18,21 @@
 package org.apache.hadoop.hdfs.qjournal.protocol;
 
 import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.hdfs.protocol.HdfsConstants;
 
 @InterfaceAudience.Private
 public class RequestInfo {
   private String jid;
   private long epoch;
   private long ipcSerialNumber;
+  private long committedTxId;
 
-  public RequestInfo(String jid, long epoch, long ipcSerialNumber) {
+  public RequestInfo(String jid, long epoch, long ipcSerialNumber,
+      long committedTxId) {
     this.jid = jid;
     this.epoch = epoch;
     this.ipcSerialNumber = ipcSerialNumber;
+    this.committedTxId = committedTxId;
   }
 
   public long getEpoch() {
@@ -51,4 +55,11 @@ public void setIpcSerialNumber(long ipcSerialNumber) {
     this.ipcSerialNumber = ipcSerialNumber;
   }
 
+  public long getCommittedTxId() {
+    return committedTxId;
+  }
+
+  public boolean hasCommittedTxId() {
+    return (committedTxId != HdfsConstants.INVALID_TXID);
+  }
 }

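RequestInfo uses HdfsConstants.INVALID_TXID as a sentinel: until the writer has committed something in its current session, hasCommittedTxId() is false and the translators leave the optional protobuf field unset. A minimal sketch of that sentinel behaviour, using a stand-in constant rather than the real HdfsConstants:

    // Stand-in for the real class; the actual constant lives in
    // org.apache.hadoop.hdfs.protocol.HdfsConstants.
    public class RequestInfoSketch {
      static final long INVALID_TXID = -12345;

      private final long committedTxId;

      RequestInfoSketch(long committedTxId) {
        this.committedTxId = committedTxId;
      }

      boolean hasCommittedTxId() {
        return committedTxId != INVALID_TXID;
      }

      public static void main(String[] args) {
        System.out.println(new RequestInfoSketch(INVALID_TXID).hasCommittedTxId()); // false
        System.out.println(new RequestInfoSketch(42L).hasCommittedTxId());          // true
      }
    }
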
View File

@@ -18,6 +18,7 @@
 package org.apache.hadoop.hdfs.qjournal.protocolPB;
 
 import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.hdfs.protocol.HdfsConstants;
 import org.apache.hadoop.hdfs.protocolPB.PBHelper;
 import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocol;
 import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos;
@@ -199,6 +200,8 @@ private RequestInfo convert(
     return new RequestInfo(
         reqInfo.getJournalId().getIdentifier(),
         reqInfo.getEpoch(),
-        reqInfo.getIpcSerialNumber());
+        reqInfo.getIpcSerialNumber(),
+        reqInfo.hasCommittedTxId() ?
+            reqInfo.getCommittedTxId() : HdfsConstants.INVALID_TXID);
   }
 
 }

View File

@@ -40,6 +40,7 @@
 import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.PrepareRecoveryRequestProto;
 import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.PrepareRecoveryResponseProto;
 import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.PurgeLogsRequestProto;
+import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.RequestInfoProto;
 import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.SegmentStateProto;
 import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.StartLogSegmentRequestProto;
 import org.apache.hadoop.hdfs.qjournal.protocol.RequestInfo;
@@ -143,11 +144,14 @@ public void journal(RequestInfo reqInfo,
   private QJournalProtocolProtos.RequestInfoProto convert(
       RequestInfo reqInfo) {
-    return QJournalProtocolProtos.RequestInfoProto.newBuilder()
+    RequestInfoProto.Builder builder = RequestInfoProto.newBuilder()
         .setJournalId(convertJournalId(reqInfo.getJournalId()))
         .setEpoch(reqInfo.getEpoch())
-        .setIpcSerialNumber(reqInfo.getIpcSerialNumber())
-        .build();
+        .setIpcSerialNumber(reqInfo.getIpcSerialNumber());
+    if (reqInfo.hasCommittedTxId()) {
+      builder.setCommittedTxId(reqInfo.getCommittedTxId());
+    }
+    return builder.build();
   }
 
   @Override

View File

@@ -46,10 +46,13 @@
 import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
 import org.apache.hadoop.hdfs.server.protocol.RemoteEditLogManifest;
 import org.apache.hadoop.hdfs.util.AtomicFileOutputStream;
+import org.apache.hadoop.hdfs.util.BestEffortLongFile;
 import org.apache.hadoop.hdfs.util.PersistentLongFile;
 import org.apache.hadoop.io.IOUtils;
 
 import com.google.common.base.Preconditions;
+import com.google.common.collect.Range;
+import com.google.common.collect.Ranges;
 import com.google.protobuf.ByteString;
 import com.google.protobuf.TextFormat;
@@ -85,8 +88,17 @@ class Journal implements Closeable {
    */
   private PersistentLongFile lastWriterEpoch;
 
+  /**
+   * Lower-bound on the last committed transaction ID. This is not
+   * depended upon for correctness, but acts as a sanity check
+   * during the recovery procedures, and as a visibility mark
+   * for clients reading in-progress logs.
+   */
+  private BestEffortLongFile committedTxnId;
+
   private static final String LAST_PROMISED_FILENAME = "last-promised-epoch";
   private static final String LAST_WRITER_EPOCH = "last-writer-epoch";
+  private static final String COMMITTED_TXID_FILENAME = "committed-txid";
 
   private final FileJournalManager fjm;
@@ -98,6 +110,9 @@ class Journal implements Closeable {
         new File(currentDir, LAST_PROMISED_FILENAME), 0);
     this.lastWriterEpoch = new PersistentLongFile(
         new File(currentDir, LAST_WRITER_EPOCH), 0);
+    this.committedTxnId = new BestEffortLongFile(
+        new File(currentDir, COMMITTED_TXID_FILENAME),
+        HdfsConstants.INVALID_TXID);
 
     this.fjm = storage.getJournalManager();
   }
@@ -113,22 +128,21 @@ private synchronized void scanStorage() throws IOException {
     }
     LOG.info("Scanning storage " + fjm);
     List<EditLogFile> files = fjm.getLogFiles(0);
-    if (files.isEmpty()) {
-      curSegmentTxId = HdfsConstants.INVALID_TXID;
-      return;
-    }
 
-    EditLogFile latestLog = files.get(files.size() - 1);
-    latestLog.validateLog();
-    LOG.info("Latest log is " + latestLog);
-    if (latestLog.getLastTxId() == HdfsConstants.INVALID_TXID) {
-      // the log contains no transactions
-      LOG.warn("Latest log " + latestLog + " has no transactions. " +
-          "moving it aside");
-      latestLog.moveAsideEmptyFile();
-      curSegmentTxId = HdfsConstants.INVALID_TXID;
-    } else {
-      curSegmentTxId = latestLog.getFirstTxId();
+    curSegmentTxId = HdfsConstants.INVALID_TXID;
+
+    while (!files.isEmpty()) {
+      EditLogFile latestLog = files.remove(files.size() - 1);
+      latestLog.validateLog();
+      LOG.info("Latest log is " + latestLog);
+      if (latestLog.getLastTxId() == HdfsConstants.INVALID_TXID) {
+        // the log contains no transactions
+        LOG.warn("Latest log " + latestLog + " has no transactions. " +
+            "moving it aside and looking for previous log");
+        latestLog.moveAsideEmptyFile();
+      } else {
+        curSegmentTxId = latestLog.getFirstTxId();
+        break;
+      }
     }
   }
@@ -150,6 +164,8 @@ void format(NamespaceInfo nsInfo) throws IOException {
   @Override // Closeable
   public void close() throws IOException {
     storage.close();
+
+    IOUtils.closeStream(committedTxnId);
   }
 
   JNStorage getStorage() {
@@ -165,6 +181,10 @@ synchronized long getLastPromisedEpoch() throws IOException {
     return lastPromisedEpoch.get();
   }
 
+  synchronized long getCommittedTxnIdForTests() throws IOException {
+    return committedTxnId.get();
+  }
+
   /**
    * Try to create a new epoch for this journal.
    * @param nsInfo the namespace, which is verified for consistency or used to
@@ -213,8 +233,8 @@ synchronized NewEpochResponseProto newEpoch(
   synchronized void journal(RequestInfo reqInfo,
       long segmentTxId, long firstTxnId,
       int numTxns, byte[] records) throws IOException {
-    checkWriteRequest(reqInfo);
     checkFormatted();
+    checkWriteRequest(reqInfo);
 
     // TODO: if a JN goes down and comes back up, then it will throw
     // this exception on every edit. We should instead send back
@@ -245,6 +265,7 @@ synchronized void journal(RequestInfo reqInfo,
     if (LOG.isTraceEnabled()) {
       LOG.trace("Writing txid " + firstTxnId + "-" + (firstTxnId + numTxns - 1));
     }
+
     curSegment.writeRaw(records, 0, records.length);
     curSegment.setReadyToFlush();
     curSegment.flush();
@@ -270,6 +291,15 @@ private synchronized void checkRequest(RequestInfo reqInfo) throws IOException {
     // TODO: some check on serial number that they only increase from a given
     // client
 
+    if (reqInfo.hasCommittedTxId()) {
+      Preconditions.checkArgument(
+          reqInfo.getCommittedTxId() >= committedTxnId.get(),
+          "Client trying to move committed txid backward from " +
+          committedTxnId.get() + " to " + reqInfo.getCommittedTxId());
+
+      committedTxnId.set(reqInfo.getCommittedTxId());
+    }
   }
 
   private synchronized void checkWriteRequest(RequestInfo reqInfo) throws IOException {
@@ -296,8 +326,8 @@ private void checkFormatted() throws JournalNotFormattedException {
   public synchronized void startLogSegment(RequestInfo reqInfo, long txid)
       throws IOException {
     assert fjm != null;
-    checkRequest(reqInfo);
     checkFormatted();
+    checkRequest(reqInfo);
 
     if (curSegment != null) {
       LOG.warn("Client is requesting a new log segment " + txid +
@@ -352,8 +382,8 @@ public synchronized void startLogSegment(RequestInfo reqInfo, long txid)
    */
   public synchronized void finalizeLogSegment(RequestInfo reqInfo, long startTxId,
       long endTxId) throws IOException {
-    checkRequest(reqInfo);
     checkFormatted();
+    checkRequest(reqInfo);
 
     if (startTxId == curSegmentTxId) {
       if (curSegment != null) {
@@ -397,8 +427,8 @@ public synchronized void finalizeLogSegment(RequestInfo reqInfo, long startTxId,
    */
   public synchronized void purgeLogsOlderThan(RequestInfo reqInfo,
       long minTxIdToKeep) throws IOException {
-    checkRequest(reqInfo);
     checkFormatted();
+    checkRequest(reqInfo);
 
     fjm.purgeLogsOlderThan(minTxIdToKeep);
     purgePaxosDecisionsOlderThan(minTxIdToKeep);
@@ -492,8 +522,8 @@ private SegmentStateProto getSegmentInfo(long segmentTxId)
    */
   public synchronized PrepareRecoveryResponseProto prepareRecovery(
       RequestInfo reqInfo, long segmentTxId) throws IOException {
-    checkRequest(reqInfo);
     checkFormatted();
+    checkRequest(reqInfo);
 
     PrepareRecoveryResponseProto.Builder builder =
         PrepareRecoveryResponseProto.newBuilder();
@@ -519,6 +549,9 @@ public synchronized PrepareRecoveryResponseProto prepareRecovery(
     }
 
     builder.setLastWriterEpoch(lastWriterEpoch.get());
+    if (committedTxnId.get() != HdfsConstants.INVALID_TXID) {
+      builder.setLastCommittedTxId(committedTxnId.get());
+    }
 
     PrepareRecoveryResponseProto resp = builder.build();
     LOG.info("Prepared recovery for segment " + segmentTxId + ": " +
@@ -532,8 +565,8 @@ public synchronized PrepareRecoveryResponseProto prepareRecovery(
   public synchronized void acceptRecovery(RequestInfo reqInfo,
       SegmentStateProto segment, URL fromUrl)
       throws IOException {
-    checkRequest(reqInfo);
     checkFormatted();
+    checkRequest(reqInfo);
 
     long segmentTxId = segment.getStartTxId();
 
     // TODO: right now, a recovery of a segment when the log is
@@ -563,8 +596,22 @@ public synchronized void acceptRecovery(RequestInfo reqInfo,
             ": no current segment in place");
       } else {
         LOG.info("Synchronizing log " + TextFormat.shortDebugString(segment) +
-            ": old segment " + TextFormat.shortDebugString(segment) + " is " +
-            "not the right length");
+            ": old segment " + TextFormat.shortDebugString(currentSegment) +
+            " is not the right length");
+
+        // Paranoid sanity check: if the new log is shorter than the log we
+        // currently have, we should not end up discarding any transactions
+        // which are already Committed.
+        if (txnRange(currentSegment).contains(committedTxnId.get()) &&
+            !txnRange(segment).contains(committedTxnId.get())) {
+          throw new AssertionError(
+              "Cannot replace segment " +
+              TextFormat.shortDebugString(currentSegment) +
+              " with new segment " +
+              TextFormat.shortDebugString(segment) +
+              ": would discard already-committed txn " +
+              committedTxnId.get());
+        }
       }
       syncLog(reqInfo, segment, fromUrl);
     } else {
@@ -581,6 +628,12 @@ public synchronized void acceptRecovery(RequestInfo reqInfo,
         TextFormat.shortDebugString(newData));
   }
 
+  private Range<Long> txnRange(SegmentStateProto seg) {
+    Preconditions.checkArgument(seg.hasEndTxId(),
+        "invalid segment: %s", seg);
+    return Ranges.closed(seg.getStartTxId(), seg.getEndTxId());
+  }
+
   /**
    * Synchronize a log segment from another JournalNode.
    * @param reqInfo the request info for the recovery IPC

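The acceptRecovery() check above relies on closed-range containment: replacing the current in-progress segment with a shorter recovered segment must never drop a txid this JournalNode has already recorded as committed. A hedged, self-contained sketch of the same logic using plain comparisons instead of Guava's Range/Ranges (example txid values are made up):

    // Sketch of the segment-replacement safety check in acceptRecovery().
    public class SegmentReplaceCheckSketch {
      // Closed-range membership, equivalent to Ranges.closed(start, end).contains(txid).
      static boolean contains(long startTxId, long endTxId, long txid) {
        return txid >= startTxId && txid <= endTxId;
      }

      static void checkReplace(long curStart, long curEnd,
                               long newStart, long newEnd, long committedTxId) {
        if (contains(curStart, curEnd, committedTxId) &&
            !contains(newStart, newEnd, committedTxId)) {
          throw new AssertionError("would discard already-committed txn " + committedTxId);
        }
      }

      public static void main(String[] args) {
        checkReplace(1, 150, 1, 150, 120); // fine: txid 120 is still covered
        checkReplace(1, 150, 1, 100, 120); // throws: txid 120 was committed but would be lost
      }
    }
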
View File

@ -0,0 +1,117 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.util;
import java.io.Closeable;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.io.IOUtils;
import com.google.common.io.Files;
import com.google.common.primitives.Longs;
/**
* Class that represents a file on disk which stores a single <code>long</code>
* value, but does not make any effort to make it truly durable. This is in
* contrast to {@link PersistentLongFile} which fsync()s the value on every
* change.
*
* This should be used for values which are updated frequently (such that
* performance is important) and not required to be up-to-date for correctness.
*
* This class also differs in that it stores the value as binary data instead
* of a textual string.
*/
@InterfaceAudience.Private
public class BestEffortLongFile implements Closeable {
private final File file;
private final long defaultVal;
private long value;
private FileChannel ch = null;
private ByteBuffer buf = ByteBuffer.allocate(Long.SIZE/8);
public BestEffortLongFile(File file, long defaultVal) {
this.file = file;
this.defaultVal = defaultVal;
}
public long get() throws IOException {
lazyOpen();
return value;
}
public void set(long newVal) throws IOException {
lazyOpen();
buf.clear();
buf.putLong(newVal);
buf.flip();
IOUtils.writeFully(ch, buf, 0);
value = newVal;
}
private void lazyOpen() throws IOException {
if (ch != null) {
return;
}
// Load current value.
byte[] data = null;
try {
data = Files.toByteArray(file);
} catch (FileNotFoundException fnfe) {
// Expected - this will use default value.
}
if (data != null && data.length != 0) {
if (data.length != Longs.BYTES) {
throw new IOException("File " + file + " had invalid length: " +
data.length);
}
value = Longs.fromByteArray(data);
} else {
value = defaultVal;
}
// Now open file for future writes.
RandomAccessFile raf = new RandomAccessFile(file, "rw");
try {
ch = raf.getChannel();
} finally {
if (ch == null) {
IOUtils.closeStream(raf);
}
}
}
@Override
public void close() throws IOException {
if (ch != null) {
ch.close();
}
}
}

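A short usage sketch for the new helper (the file path and driver class are made up; the real caller is Journal, which keeps the value under current/committed-txid):

    import java.io.File;
    import java.io.IOException;

    import org.apache.hadoop.hdfs.util.BestEffortLongFile;
    import org.apache.hadoop.io.IOUtils;

    // Hypothetical driver, not part of the commit: demonstrates the lazy-open,
    // non-durable get/set contract described in the class javadoc above.
    public class BestEffortLongFileExample {
      public static void main(String[] args) throws IOException {
        File f = new File("/tmp/committed-txid-example");   // made-up path
        BestEffortLongFile longFile = new BestEffortLongFile(f, -12345L);
        try {
          System.out.println("initial (default) value: " + longFile.get());
          longFile.set(42L);                 // written, but not fsync()ed
          System.out.println("after set: " + longFile.get());
        } finally {
          IOUtils.closeStream(longFile);     // closes the underlying FileChannel
        }
      }
    }
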
View File

@@ -31,6 +31,13 @@ message RequestInfoProto {
   required JournalIdProto journalId = 1;
   required uint64 epoch = 2;
   required uint64 ipcSerialNumber = 3;
+
+  // Whenever a writer makes a request, it informs
+  // the node of the latest committed txid. This may
+  // be higher than the transaction data included in the
+  // request itself, eg in the case that the node has
+  // fallen behind.
+  optional uint64 committedTxId = 4;
 }
 
 message SegmentStateProto {
@@ -163,6 +170,11 @@ message PrepareRecoveryResponseProto {
   optional SegmentStateProto segmentState = 1;
   optional uint64 acceptedInEpoch = 2;
   required uint64 lastWriterEpoch = 3;
+
+  // The highest committed txid that this logger has ever seen.
+  // This may be higher than the data it actually has, in the case
+  // that it was lagging before the old writer crashed.
+  optional uint64 lastCommittedTxId = 4;
 }
 
 /**

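Because committedTxId is an optional proto2 field, presence is explicit on the wire: the client-side translator only sets it when RequestInfo actually carries a value, and the server side checks hasCommittedTxId() before reading it, falling back to INVALID_TXID otherwise. A hedged model of that round-trip convention, with java.util.Optional standing in for the generated protobuf has/get accessors:

    import java.util.Optional;

    // Sketch only: "unset on the wire" maps to INVALID_TXID in RequestInfo and back.
    public class CommittedTxidWireMappingSketch {
      static final long INVALID_TXID = -12345; // stand-in for HdfsConstants.INVALID_TXID

      // Client side: only put the field on the wire when there is a real value.
      static Optional<Long> toWire(long committedTxId) {
        return committedTxId != INVALID_TXID
            ? Optional.of(committedTxId) : Optional.<Long>empty();
      }

      // Server side: a field absent on the wire becomes INVALID_TXID again.
      static long fromWire(Optional<Long> wireField) {
        return wireField.orElse(INVALID_TXID);
      }

      public static void main(String[] args) {
        System.out.println(fromWire(toWire(INVALID_TXID)) == INVALID_TXID); // true
        System.out.println(fromWire(toWire(42L)));                          // 42
      }
    }
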
View File

@@ -186,6 +186,8 @@ public void testWriteEditsOneSlow() throws Exception {
     Mockito.doReturn(slowLog).when(spyLoggers.get(2)).sendEdits(
         anyLong(), eq(1L), eq(1), Mockito.<byte[]>any());
     stm.flush();
+
+    Mockito.verify(spyLoggers.get(0)).setCommittedTxId(1L);
   }
 
   private EditLogOutputStream createLogSegment() throws IOException {

View File

@@ -33,6 +33,7 @@
 import org.apache.hadoop.hdfs.server.common.Storage;
 import org.apache.hadoop.hdfs.server.common.StorageErrorReporter;
 import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
+import org.apache.hadoop.io.IOUtils;
 import org.apache.hadoop.test.GenericTestUtils;
 import org.junit.After;
 import org.junit.Assume;
@@ -70,6 +71,11 @@ public void verifyNoStorageErrors() throws Exception{
         .reportErrorOnFile(Mockito.<File>any());
   }
 
+  @After
+  public void cleanup() {
+    IOUtils.closeStream(journal);
+  }
+
   @Test
   public void testEpochHandling() throws Exception {
     assertEquals(0, journal.getLastPromisedEpoch());
@@ -88,16 +94,14 @@ public void testEpochHandling() throws Exception {
           "Proposed epoch 3 <= last promise 3", ioe);
     }
     try {
-      journal.startLogSegment(new RequestInfo(JID, 1L, 1L),
-          12345L);
+      journal.startLogSegment(makeRI(1), 12345L);
       fail("Should have rejected call from prior epoch");
     } catch (IOException ioe) {
       GenericTestUtils.assertExceptionContains(
           "epoch 1 is less than the last promised epoch 3", ioe);
     }
     try {
-      journal.journal(new RequestInfo(JID, 1L, 1L),
-          12345L, 100L, 0, new byte[0]);
+      journal.journal(makeRI(1), 12345L, 100L, 0, new byte[0]);
       fail("Should have rejected call from prior epoch");
     } catch (IOException ioe) {
       GenericTestUtils.assertExceptionContains(
@@ -105,11 +109,26 @@ public void testEpochHandling() throws Exception {
     }
   }
 
+  @Test
+  public void testMaintainCommittedTxId() throws Exception {
+    journal.newEpoch(FAKE_NSINFO, 1);
+    journal.startLogSegment(makeRI(1), 1);
+    // Send txids 1-3, with a request indicating only 0 committed
+    journal.journal(new RequestInfo(JID, 1, 2, 0), 1, 1, 3,
+        QJMTestUtil.createTxnData(1, 3));
+    assertEquals(0, journal.getCommittedTxnIdForTests());
+
+    // Send 4-6, with request indicating that through 3 is committed.
+    journal.journal(new RequestInfo(JID, 1, 3, 3), 1, 4, 3,
+        QJMTestUtil.createTxnData(4, 6));
+    assertEquals(3, journal.getCommittedTxnIdForTests());
+  }
+
   @Test
   public void testRestartJournal() throws Exception {
     journal.newEpoch(FAKE_NSINFO, 1);
-    journal.startLogSegment(new RequestInfo("j", 1, 1), 1);
-    journal.journal(new RequestInfo("j", 1, 2), 1, 1, 2,
+    journal.startLogSegment(makeRI(1), 1);
+    journal.journal(makeRI(2), 1, 1, 2,
         QJMTestUtil.createTxnData(1, 2));
     // Don't finalize.
@@ -129,6 +148,23 @@ public void testRestartJournal() throws Exception {
     assertEquals(1, newEpoch.getLastSegmentTxId());
   }
 
+  /**
+   * Test that, if the writer crashes at the very beginning of a segment,
+   * before any transactions are written, that the next newEpoch() call
+   * returns the prior segment txid as its most recent segment.
+   */
+  @Test
+  public void testNewEpochAtBeginningOfSegment() throws Exception {
+    journal.newEpoch(FAKE_NSINFO, 1);
+    journal.startLogSegment(makeRI(1), 1);
+    journal.journal(makeRI(2), 1, 1, 2,
+        QJMTestUtil.createTxnData(1, 2));
+    journal.finalizeLogSegment(makeRI(3), 1, 2);
+    journal.startLogSegment(makeRI(3), 3);
+    NewEpochResponseProto resp = journal.newEpoch(FAKE_NSINFO, 2);
+    assertEquals(1, resp.getLastSegmentTxId());
+  }
+
   @Test
   public void testJournalLocking() throws Exception {
     Assume.assumeTrue(journal.getStorage().getStorageDir(0).isLockSupported());
@@ -289,7 +325,7 @@ public void testStartLogSegmentWhenAlreadyExists() throws Exception {
   }
 
   private static RequestInfo makeRI(int serial) {
-    return new RequestInfo(JID, 1, serial);
+    return new RequestInfo(JID, 1, serial, 0);
   }
 
   @Test

View File

@@ -119,8 +119,8 @@ public void testReturnsSegmentInfoAtEpochTransition() throws Exception {
     response = ch.newEpoch(4).get();
     ch.setEpoch(4);
 
     // Because the new segment is empty, it is equivalent to not having
-    // started writing it.
-    assertEquals(0, response.getLastSegmentTxId());
+    // started writing it. Hence, we should return the prior segment txid.
+    assertEquals(1, response.getLastSegmentTxId());
   }
 
   @Test

View File

@ -0,0 +1,86 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.util;
import java.io.File;
import java.io.IOException;
import java.util.Random;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.io.IOUtils;
import org.junit.Before;
import org.junit.Test;
import static org.junit.Assert.*;
public class TestBestEffortLongFile {
private static final File FILE = new File(MiniDFSCluster.getBaseDirectory() +
File.separatorChar + "TestBestEffortLongFile");
@Before
public void cleanup() {
if (FILE.exists()) {
assertTrue(FILE.delete());
}
FILE.getParentFile().mkdirs();
}
@Test
public void testGetSet() throws IOException {
BestEffortLongFile f = new BestEffortLongFile(FILE, 12345L);
try {
// Before the file exists, should return default.
assertEquals(12345L, f.get());
// And first access should open it.
assertTrue(FILE.exists());
Random r = new Random();
for (int i = 0; i < 100; i++) {
long newVal = r.nextLong();
// Changing the value should be reflected in the next get() call.
f.set(newVal);
assertEquals(newVal, f.get());
// And should be reflected in a new instance (ie it actually got
// written to the file)
BestEffortLongFile f2 = new BestEffortLongFile(FILE, 999L);
try {
assertEquals(newVal, f2.get());
} finally {
IOUtils.closeStream(f2);
}
}
} finally {
IOUtils.closeStream(f);
}
}
@Test
public void testTruncatedFileReturnsDefault() throws IOException {
assertTrue(FILE.createNewFile());
assertEquals(0, FILE.length());
BestEffortLongFile f = new BestEffortLongFile(FILE, 12345L);
try {
assertEquals(12345L, f.get());
} finally {
f.close();
}
}
}