HDFS-2212. Refactor double-buffering code out of EditLogOutputStreams. Contributed by Todd Lipcon

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1151736 13f79535-47bb-0310-9956-ffa450edef68
Eli Collins 2011-07-28 05:46:28 +00:00
parent 60c41c55e7
commit 44320eed17
6 changed files with 234 additions and 115 deletions

CHANGES.txt

@@ -616,6 +616,9 @@ Trunk (unreleased changes)
HDFS-2195. Refactor StorageDirectory to not be a non-static inner class.
(todd via eli)
HDFS-2212. Refactor double-buffering code out of EditLogOutputStreams.
(todd via eli)
OPTIMIZATIONS
HDFS-1458. Improve checkpoint performance by avoiding unnecessary image

EditLogBackupOutputStream.java

@@ -17,18 +17,14 @@
*/
package org.apache.hadoop.hdfs.server.namenode;
import java.io.DataOutputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.util.ArrayList;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.server.common.Storage;
import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocol;
import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.net.NetUtils;
@@ -46,21 +42,9 @@ class EditLogBackupOutputStream extends EditLogOutputStream {
private NamenodeProtocol backupNode; // RPC proxy to backup node
private NamenodeRegistration bnRegistration; // backup node registration
private NamenodeRegistration nnRegistration; // active node registration
private ArrayList<BufferedOp> bufCurrent; // current buffer for writing
private ArrayList<BufferedOp> bufReady; // buffer ready for flushing
private EditsDoubleBuffer doubleBuf;
private DataOutputBuffer out; // serialized output sent to backup node
private static class BufferedOp {
public final FSEditLogOpCodes opCode;
public final byte[] bytes;
public BufferedOp(FSEditLogOpCodes opCode, byte[] bytes) {
this.opCode = opCode;
this.bytes = bytes;
}
}
EditLogBackupOutputStream(NamenodeRegistration bnReg, // backup node
NamenodeRegistration nnReg) // active name-node
throws IOException {
@@ -78,8 +62,7 @@ public BufferedOp(FSEditLogOpCodes opCode, byte[] bytes) {
Storage.LOG.error("Error connecting to: " + bnAddress, e);
throw e;
}
this.bufCurrent = new ArrayList<BufferedOp>();
this.bufReady = new ArrayList<BufferedOp>();
this.doubleBuf = new EditsDoubleBuffer(DEFAULT_BUFFER_SIZE);
this.out = new DataOutputBuffer(DEFAULT_BUFFER_SIZE);
}
@@ -95,13 +78,8 @@ public JournalType getType() {
@Override // EditLogOutputStream
void write(FSEditLogOp op) throws IOException {
ByteArrayOutputStream baos = new ByteArrayOutputStream();
DataOutputStream s = new DataOutputStream(baos);
FSEditLogOp.Writer w = new FSEditLogOp.Writer(s);
w.writeOp(op);
bufCurrent.add(new BufferedOp(op.opCode, baos.toByteArray()));
}
doubleBuf.writeOp(op);
}
@Override
void writeRaw(byte[] bytes, int offset, int length) throws IOException {
@@ -113,55 +91,37 @@ void writeRaw(byte[] bytes, int offset, int length) throws IOException {
*/
@Override // EditLogOutputStream
void create() throws IOException {
bufCurrent.clear();
assert bufReady.size() == 0 : "previous data is not flushed yet";
assert doubleBuf.isFlushed() : "previous data is not flushed yet";
this.doubleBuf = new EditsDoubleBuffer(DEFAULT_BUFFER_SIZE);
}
@Override // EditLogOutputStream
public void close() throws IOException {
// close should have been called after all pending transactions
// have been flushed & synced.
int size = bufCurrent.size();
int size = doubleBuf.countBufferedBytes();
if (size != 0) {
throw new IOException("BackupEditStream has " + size +
" bytes still to be flushed and cannot be closed.");
}
RPC.stopProxy(backupNode); // stop the RPC threads
bufCurrent = bufReady = null;
doubleBuf.close();
doubleBuf = null;
}
@Override // EditLogOutputStream
void setReadyToFlush() throws IOException {
assert bufReady.size() == 0 : "previous data is not flushed yet";
ArrayList<BufferedOp> tmp = bufReady;
bufReady = bufCurrent;
bufCurrent = tmp;
doubleBuf.setReadyToFlush();
}
@Override // EditLogOutputStream
protected void flushAndSync() throws IOException {
assert out.size() == 0 : "Output buffer is not empty";
int bufReadySize = bufReady.size();
for(int idx = 0; idx < bufReadySize; idx++) {
BufferedOp jRec = null;
for(; idx < bufReadySize; idx++) {
jRec = bufReady.get(idx);
if(jRec.opCode.getOpCode()
>= FSEditLogOpCodes.OP_JSPOOL_START.getOpCode())
break; // special operation should be sent in a separate call to BN
out.write(jRec.bytes, 0, jRec.bytes.length);
}
if(out.size() > 0)
send(NamenodeProtocol.JA_JOURNAL);
if(idx == bufReadySize)
break;
// operations like starting the journal spool or incrementing the
// checkpoint time are sent to the BN in a separate call
out.write(jRec.bytes, 0, jRec.bytes.length);
send(jRec.opCode.getOpCode());
// XXX: this code won't work in trunk, but it's redone
// in HDFS-1073 where it's simpler.
doubleBuf.flushTo(out);
if (out.size() > 0) {
send(NamenodeProtocol.JA_JOURNAL);
}
bufReady.clear(); // erase all data in the buffer
out.reset(); // reset buffer to the start position
}
/**

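The net effect for the backup stream: all buffering now lives in EditsDoubleBuffer, and a flush ships the whole batch to the backup node in a single JA_JOURNAL call rather than splitting it op by op at OP_JSPOOL_START boundaries. Below is a minimal sketch of one flush round (swap, then drain); it is illustrative only, and send() stands in for the stream's private RPC helper, which this diff does not show.

package org.apache.hadoop.hdfs.server.namenode;

import java.io.IOException;
import org.apache.hadoop.io.DataOutputBuffer;

// Illustrative sketch, not committed code. One flush round for the
// backup stream: swap buffers, drain the ready side into the RPC
// staging buffer, ship it, and reuse the staging buffer.
class BackupFlushSketch {
  void flushOnce(EditsDoubleBuffer doubleBuf, DataOutputBuffer out)
      throws IOException {
    doubleBuf.setReadyToFlush();  // swap current/ready (asserts ready is empty)
    doubleBuf.flushTo(out);       // drain the ready buffer into 'out'
    if (out.getLength() > 0) {
      // send(NamenodeProtocol.JA_JOURNAL);  // one RPC carries the whole batch
    }
    out.reset();                  // staging buffer is reused next round
  }
}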
EditLogFileOutputStream.java

@@ -24,12 +24,9 @@
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.util.zip.Checksum;
import org.apache.hadoop.hdfs.protocol.FSConstants;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Writable;
import com.google.common.annotations.VisibleForTesting;
@@ -43,10 +40,7 @@ class EditLogFileOutputStream extends EditLogOutputStream {
private File file;
private FileOutputStream fp; // file stream for storing edit logs
private FileChannel fc; // channel of the file stream for sync
private DataOutputBuffer bufCurrent; // current buffer for writing
private DataOutputBuffer bufReady; // buffer ready for flushing
private FSEditLogOp.Writer writer;
final private int initBufferSize; // initial buffer size
private EditsDoubleBuffer doubleBuf;
static ByteBuffer fill = ByteBuffer.allocateDirect(1024 * 1024); // preallocation, 1MB
static {
@@ -68,10 +62,7 @@ class EditLogFileOutputStream extends EditLogOutputStream {
EditLogFileOutputStream(File name, int size) throws IOException {
super();
file = name;
initBufferSize = size;
bufCurrent = new DataOutputBuffer(size);
bufReady = new DataOutputBuffer(size);
writer = new FSEditLogOp.Writer(bufCurrent);
doubleBuf = new EditsDoubleBuffer(size);
RandomAccessFile rp = new RandomAccessFile(name, "rw");
fp = new FileOutputStream(rp.getFD()); // open for append
fc = rp.getChannel();
@@ -91,23 +82,13 @@ public JournalType getType() {
/** {@inheritDoc} */
@Override
void write(FSEditLogOp op) throws IOException {
int start = bufCurrent.getLength();
writer.writeOp(op);
// write transaction checksum
int end = bufCurrent.getLength();
Checksum checksum = FSEditLog.getChecksum();
checksum.reset();
checksum.update(bufCurrent.getData(), start, end-start);
int sum = (int)checksum.getValue();
bufCurrent.writeInt(sum);
doubleBuf.writeOp(op);
}
/** {@inheritDoc} */
@Override
void writeRaw(byte[] bytes, int offset, int length) throws IOException {
bufCurrent.write(bytes, offset, length);
doubleBuf.writeRaw(bytes, offset, length);
}
/**
@@ -117,7 +98,7 @@ void writeRaw(byte[] bytes, int offset, int length) throws IOException {
void create() throws IOException {
fc.truncate(0);
fc.position(0);
bufCurrent.writeInt(FSConstants.LAYOUT_VERSION);
doubleBuf.getCurrentBuf().writeInt(FSConstants.LAYOUT_VERSION);
setReadyToFlush();
flush();
}
@@ -128,21 +109,9 @@ public void close() throws IOException {
// close should have been called after all pending transactions
// have been flushed & synced.
// if already closed, just skip
if(bufCurrent != null)
{
int bufSize = bufCurrent.size();
if (bufSize != 0) {
throw new IOException("FSEditStream has " + bufSize
+ " bytes still to be flushed and cannot " + "be closed.");
}
bufCurrent.close();
bufCurrent = null;
writer = null;
}
if(bufReady != null) {
bufReady.close();
bufReady = null;
if (doubleBuf != null) {
doubleBuf.close();
doubleBuf = null;
}
// remove the last INVALID marker from transaction log.
@@ -156,9 +125,8 @@ public void close() throws IOException {
fp = null;
}
} finally {
IOUtils.cleanup(FSNamesystem.LOG, bufCurrent, bufReady, fc, fp);
bufCurrent = bufReady = null;
writer = null;
IOUtils.cleanup(FSNamesystem.LOG, fc, fp);
doubleBuf = null;
fc = null;
fp = null;
}
@@ -170,12 +138,8 @@ public void close() throws IOException {
*/
@Override
void setReadyToFlush() throws IOException {
assert bufReady.size() == 0 : "previous data is not flushed yet";
bufCurrent.write(FSEditLogOpCodes.OP_INVALID.getOpCode()); // insert eof marker
DataOutputBuffer tmp = bufReady;
bufReady = bufCurrent;
bufCurrent = tmp;
writer = new FSEditLogOp.Writer(bufCurrent);
doubleBuf.getCurrentBuf().write(FSEditLogOpCodes.OP_INVALID.getOpCode()); // insert eof marker
doubleBuf.setReadyToFlush();
}
/**
@@ -185,8 +149,7 @@ void setReadyToFlush() throws IOException {
@Override
protected void flushAndSync() throws IOException {
preallocate(); // preallocate file if necessary
bufReady.writeTo(fp); // write data to file
bufReady.reset(); // erase all data in the buffer
doubleBuf.flushTo(fp);
fc.force(false); // metadata updates not needed because of preallocation
fc.position(fc.position() - 1); // skip back the end-of-file marker
}
@@ -196,7 +159,7 @@ protected void flushAndSync() throws IOException {
*/
@Override
public boolean shouldForceSync() {
return bufReady.size() >= initBufferSize;
return doubleBuf.shouldForceSync();
}
/**
@@ -205,8 +168,8 @@ public boolean shouldForceSync() {
@Override
long length() throws IOException {
// file size - header size + size of both buffers
return fc.size() - EDITS_FILE_HEADER_SIZE_BYTES + bufReady.size()
+ bufCurrent.size();
return fc.size() - EDITS_FILE_HEADER_SIZE_BYTES +
doubleBuf.countBufferedBytes();
}
// allocate a big chunk of data

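One subtlety in the file stream survives the refactor: setReadyToFlush() appends OP_INVALID as an end-of-file marker before swapping, and flushAndSync() steps the channel position back one byte after the force so the next flush overwrites that marker. A standalone sketch of just the marker trick follows, under assumed names rather than the committed code.

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.channels.FileChannel;

public class EofMarkerSketch {
  static final byte OP_INVALID = -1; // stands in for FSEditLogOpCodes.OP_INVALID

  public static void main(String[] args) throws IOException {
    File f = File.createTempFile("edits", ".tmp");
    RandomAccessFile rp = new RandomAccessFile(f, "rw");
    FileOutputStream fp = new FileOutputStream(rp.getFD());
    FileChannel fc = rp.getChannel();

    byte[] readyBuf = { 1, 2, 3, OP_INVALID }; // flushed ops + eof marker
    fp.write(readyBuf);              // write the "ready" buffer to disk
    fc.force(false);                 // sync data; a reader now sees a clean eof
    fc.position(fc.position() - 1);  // step back so the marker gets overwritten
    // ... the next flush starts here, replacing OP_INVALID with real ops.

    fp.close();
    rp.close();
    f.delete();
  }
}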
EditsDoubleBuffer.java

@@ -0,0 +1,105 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.server.namenode;
import java.io.IOException;
import java.io.OutputStream;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.Writer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.IOUtils;
import com.google.common.base.Preconditions;
/**
* A double-buffer for edits. New edits are written into the first buffer
* while the second is available to be flushed. Each time the double-buffer
* is flushed, the two internal buffers are swapped. This allows edits
* to proceed concurrently with flushes without allocating new buffers each
* time.
*/
class EditsDoubleBuffer {
private DataOutputBuffer bufCurrent; // current buffer for writing
private DataOutputBuffer bufReady; // buffer ready for flushing
private final int initBufferSize;
private Writer writer;
public EditsDoubleBuffer(int defaultBufferSize) {
initBufferSize = defaultBufferSize;
bufCurrent = new DataOutputBuffer(initBufferSize);
bufReady = new DataOutputBuffer(initBufferSize);
writer = new FSEditLogOp.Writer(bufCurrent);
}
public void writeOp(FSEditLogOp op) throws IOException {
writer.writeOp(op);
}
void writeRaw(byte[] bytes, int offset, int length) throws IOException {
bufCurrent.write(bytes, offset, length);
}
void close() throws IOException {
Preconditions.checkNotNull(bufCurrent);
Preconditions.checkNotNull(bufReady);
int bufSize = bufCurrent.size();
if (bufSize != 0) {
throw new IOException("EditsDoubleBuffer has " + bufSize
+ " bytes still to be flushed and cannot be closed.");
}
IOUtils.cleanup(null, bufCurrent, bufReady);
bufCurrent = bufReady = null;
}
void setReadyToFlush() {
assert isFlushed() : "previous data not flushed yet";
DataOutputBuffer tmp = bufReady;
bufReady = bufCurrent;
bufCurrent = tmp;
writer = new FSEditLogOp.Writer(bufCurrent);
}
/**
* Writes the content of the "ready" buffer to the given output stream,
* and resets it. Does not swap any buffers.
*/
void flushTo(OutputStream out) throws IOException {
bufReady.writeTo(out); // write data to file
bufReady.reset(); // erase all data in the buffer
}
boolean shouldForceSync() {
return bufReady.size() >= initBufferSize;
}
DataOutputBuffer getCurrentBuf() {
return bufCurrent;
}
public boolean isFlushed() {
return bufReady.size() == 0;
}
public int countBufferedBytes() {
return bufReady.size() + bufCurrent.size();
}
}
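The point of the extraction is easier to see in isolation: writers only ever touch bufCurrent and the flusher only bufReady, so slow disk or network I/O inside flushTo() never stalls threads appending edits; only the swap itself needs mutual exclusion. Here is a minimal standalone model of that discipline, in plain JDK types rather than Hadoop code.

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStream;

// Illustrative model only. Writers and the flusher overlap because they
// operate on different buffers; only swap() needs the lock.
class DoubleBufferModel {
  private ByteArrayOutputStream current = new ByteArrayOutputStream();
  private ByteArrayOutputStream ready = new ByteArrayOutputStream();

  synchronized void write(byte[] b) throws IOException {
    current.write(b); // cheap in-memory append; never blocks on I/O
  }

  synchronized void swap() {
    assert ready.size() == 0 : "previous data not flushed yet";
    ByteArrayOutputStream tmp = ready;
    ready = current;
    current = tmp;
  }

  // Called without holding the write lock: slow disk/network I/O here
  // does not stall threads appending to 'current'.
  void flushReady(OutputStream out) throws IOException {
    ready.writeTo(out);
    ready.reset();
  }
}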

FSEditLogOp.java

@@ -37,6 +37,7 @@
import static org.apache.hadoop.hdfs.server.namenode.FSEditLogOpCodes.*;
import org.apache.hadoop.security.token.delegation.DelegationKey;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.io.Writable;
@@ -1341,10 +1342,10 @@ private static long readLongWritable(DataInputStream in) throws IOException {
* Class for writing editlog ops
*/
public static class Writer {
private final DataOutputStream out;
private final DataOutputBuffer buf;
public Writer(DataOutputStream out) {
this.out = out;
public Writer(DataOutputBuffer out) {
this.buf = out;
}
/**
@@ -1354,9 +1355,15 @@ public Writer(DataOutputStream out) {
* @throws IOException if an error occurs during writing.
*/
public void writeOp(FSEditLogOp op) throws IOException {
out.writeByte(op.opCode.getOpCode());
op.writeFields(out);
int start = buf.getLength();
buf.writeByte(op.opCode.getOpCode());
op.writeFields(buf);
int end = buf.getLength();
Checksum checksum = FSEditLog.getChecksum();
checksum.reset();
checksum.update(buf.getData(), start, end-start);
int sum = (int)checksum.getValue();
buf.writeInt(sum);
}
}
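With the checksum folded into Writer, every serialized op now has the layout [opcode byte][op fields][4-byte checksum], where the checksum covers the opcode and fields. A reader-side sketch of verifying that trailing int follows; it assumes a CRC32-compatible checksum, since FSEditLog.getChecksum() itself is not shown in this diff.

import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.IOException;
import java.util.zip.CRC32;

// Illustrative sketch, not part of this commit: recompute the checksum
// over [opcode + fields] and compare it to the trailing 4-byte int.
public class OpChecksumCheck {
  static void verify(byte[] record) throws IOException {
    int bodyLen = record.length - 4;  // trailing int is the checksum
    CRC32 crc = new CRC32();
    crc.update(record, 0, bodyLen);   // opcode byte plus serialized fields
    DataInputStream in = new DataInputStream(
        new ByteArrayInputStream(record, bodyLen, 4));
    int stored = in.readInt();        // big-endian, matching writeInt()
    if (stored != (int) crc.getValue()) {
      throw new IOException("Edit log op checksum mismatch");
    }
  }
}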

TestEditsDoubleBuffer.java

@@ -0,0 +1,81 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.server.namenode;
import static org.junit.Assert.*;
import java.io.IOException;
import org.apache.hadoop.io.DataOutputBuffer;
import org.junit.Test;
public class TestEditsDoubleBuffer {
@Test
public void testDoubleBuffer() throws IOException {
EditsDoubleBuffer buf = new EditsDoubleBuffer(1024);
assertTrue(buf.isFlushed());
byte[] data = new byte[100];
buf.writeRaw(data, 0, data.length);
assertEquals("Should count new data correctly",
data.length, buf.countBufferedBytes());
assertTrue("Writing to current buffer should not affect flush state",
buf.isFlushed());
// Swap the buffers
buf.setReadyToFlush();
assertEquals("Swapping buffers should still count buffered bytes",
data.length, buf.countBufferedBytes());
assertFalse(buf.isFlushed());
// Flush to a stream
DataOutputBuffer outBuf = new DataOutputBuffer();
buf.flushTo(outBuf);
assertEquals(data.length, outBuf.getLength());
assertTrue(buf.isFlushed());
assertEquals(0, buf.countBufferedBytes());
// Write some more
buf.writeRaw(data, 0, data.length);
assertEquals("Should count new data correctly",
data.length, buf.countBufferedBytes());
buf.setReadyToFlush();
buf.flushTo(outBuf);
assertEquals(data.length * 2, outBuf.getLength());
assertEquals(0, buf.countBufferedBytes());
outBuf.close();
}
@Test
public void shouldFailToCloseWhenUnflushed() throws IOException {
EditsDoubleBuffer buf = new EditsDoubleBuffer(1024);
buf.writeRaw(new byte[1], 0, 1);
try {
buf.close();
fail("Did not fail to close with unflushed data");
} catch (IOException ioe) {
if (!ioe.toString().contains("still to be flushed")) {
throw ioe;
}
}
}
}