From 401db4fc65140979fe7665983e36905e886df971 Mon Sep 17 00:00:00 2001 From: Zhe Zhang Date: Thu, 8 Sep 2016 11:54:33 -0700 Subject: [PATCH] HDFS-8901. Use ByteBuffer in striping positional read. Contributed by Sammi Chen and Kai Zheng. --- .../org/apache/hadoop/util/DataChecksum.java | 2 +- .../apache/hadoop/hdfs/DFSInputStream.java | 68 ++++--- .../hadoop/hdfs/DFSStripedInputStream.java | 24 ++- .../hadoop/hdfs/util/StripedBlockUtil.java | 185 ++++++++++-------- .../hdfs/TestDFSStripedInputStream.java | 121 +++++++++++- .../hdfs/util/TestStripedBlockUtil.java | 22 ++- 6 files changed, 285 insertions(+), 137 deletions(-) diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/DataChecksum.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/DataChecksum.java index 3a53ed9e6a..6982a920d2 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/DataChecksum.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/DataChecksum.java @@ -304,7 +304,7 @@ public void verifyChunkedSums(ByteBuffer data, ByteBuffer checksums, bytesPerChecksum, checksums.array(), crcsOffset, fileName, basePos); return; } - if (NativeCrc32.isAvailable()) { + if (NativeCrc32.isAvailable() && data.isDirect()) { NativeCrc32.verifyChunkedSums(bytesPerChecksum, type.id, checksums, data, fileName, basePos); } else { diff --git a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DFSInputStream.java b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DFSInputStream.java index 7a10ba4d0c..31fa89757f 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DFSInputStream.java +++ b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DFSInputStream.java @@ -533,7 +533,8 @@ private List<LocatedBlock> getFinalizedBlockRange( * Open a DataInputStream to a DataNode so that it can be read from. * We get block ID and the IDs of the destinations at startup, from the namenode. */ - private synchronized DatanodeInfo blockSeekTo(long target) throws IOException { + private synchronized DatanodeInfo blockSeekTo(long target) + throws IOException { if (target >= getFileLength()) { throw new IOException("Attempted to read past end of file"); } @@ -962,14 +963,14 @@ private static String getBestNodeDNAddrPairErrorString( } protected void fetchBlockByteRange(LocatedBlock block, long start, long end, - byte[] buf, int offset, CorruptedBlocks corruptedBlocks) + ByteBuffer buf, CorruptedBlocks corruptedBlocks) throws IOException { block = refreshLocatedBlock(block); while (true) { DNAddrPair addressPair = chooseDataNode(block, null); try { actualGetFromOneDataNode(addressPair, block, start, end, - buf, offset, corruptedBlocks); + buf, corruptedBlocks); return; } catch (IOException e) { checkInterrupted(e); // check if the read has been interrupted @@ -988,12 +989,10 @@ private Callable<ByteBuffer> getFromOneDataNode(final DNAddrPair datanode, return new Callable<ByteBuffer>() { @Override public ByteBuffer call() throws Exception { - byte[] buf = bb.array(); - int offset = bb.position(); try (TraceScope ignored = dfsClient.getTracer().
newScope("hedgedRead" + hedgedReadId, parentSpanId)) { - actualGetFromOneDataNode(datanode, block, start, end, buf, - offset, corruptedBlocks); + actualGetFromOneDataNode(datanode, block, start, end, bb, + corruptedBlocks); return bb; } } @@ -1007,13 +1006,12 @@ public ByteBuffer call() throws Exception { * @param block the located block containing the requested data * @param startInBlk the startInBlk offset of the block * @param endInBlk the endInBlk offset of the block - * @param buf the given byte array into which the data is read - * @param offset the offset in buf + * @param buf the given byte buffer into which the data is read * @param corruptedBlocks map recording list of datanodes with corrupted * block replica */ void actualGetFromOneDataNode(final DNAddrPair datanode, LocatedBlock block, - final long startInBlk, final long endInBlk, byte[] buf, int offset, + final long startInBlk, final long endInBlk, ByteBuffer buf, CorruptedBlocks corruptedBlocks) throws IOException { DFSClientFaultInjector.get().startFetchFromDatanode(); @@ -1031,7 +1029,22 @@ void actualGetFromOneDataNode(final DNAddrPair datanode, LocatedBlock block, DFSClientFaultInjector.get().fetchFromDatanodeException(); reader = getBlockReader(block, startInBlk, len, datanode.addr, datanode.storageType, datanode.info); - int nread = reader.readAll(buf, offset, len); + + // Behave exactly as the readAll() call + ByteBuffer tmp = buf.duplicate(); + tmp.limit(tmp.position() + len); + tmp = tmp.slice(); + int nread = 0; + int ret; + while (true) { + ret = reader.read(tmp); + if (ret <= 0) { + break; + } + nread += ret; + } + buf.position(buf.position() + nread); + IOUtilsClient.updateReadStatistics(readStatistics, nread, reader); dfsClient.updateFileSystemReadStats( reader.getNetworkDistance(), nread); @@ -1098,7 +1111,7 @@ protected LocatedBlock refreshLocatedBlock(LocatedBlock block) * time. We then wait on which ever read returns first. */ private void hedgedFetchBlockByteRange(LocatedBlock block, long start, - long end, byte[] buf, int offset, CorruptedBlocks corruptedBlocks) + long end, ByteBuffer buf, CorruptedBlocks corruptedBlocks) throws IOException { final DfsClientConf conf = dfsClient.getConf(); ArrayList<Future<ByteBuffer>> futures = new ArrayList<>(); @@ -1130,8 +1143,8 @@ private void hedgedFetchBlockByteRange(LocatedBlock block, long start, conf.getHedgedReadThresholdMillis(), TimeUnit.MILLISECONDS); if (future != null) { ByteBuffer result = future.get(); - System.arraycopy(result.array(), result.position(), buf, offset, - len); + result.flip(); + buf.put(result); return; } DFSClient.LOG.debug("Waited {}ms to read from {}; spawning hedged " @@ -1173,8 +1186,8 @@ private void hedgedFetchBlockByteRange(LocatedBlock block, long start, // cancel the rest. cancelAll(futures); dfsClient.getHedgedReadMetrics().incHedgedReadWins(); - System.arraycopy(result.array(), result.position(), buf, offset, - len); + result.flip(); + buf.put(result); return; } catch (InterruptedException ie) { // Ignore and retry @@ -1244,7 +1257,8 @@ protected static boolean tokenRefetchNeeded(IOException ex, * access key from its memory since it's considered expired based on * the estimated expiration date.
*/ - if (ex instanceof InvalidBlockTokenException || ex instanceof InvalidToken) { + if (ex instanceof InvalidBlockTokenException || + ex instanceof InvalidToken) { DFSClient.LOG.info("Access token was invalid when connecting to " + targetAddr + " : " + ex); return true; @@ -1272,7 +1286,8 @@ public int read(long position, byte[] buffer, int offset, int length) try (TraceScope scope = dfsClient. newReaderTraceScope("DFSInputStream#byteArrayPread", src, position, length)) { - int retLen = pread(position, buffer, offset, length); + ByteBuffer bb = ByteBuffer.wrap(buffer, offset, length); + int retLen = pread(position, bb); if (retLen < length) { dfsClient.addRetLenToReaderScope(scope, retLen); } @@ -1280,7 +1295,7 @@ public int read(long position, byte[] buffer, int offset, int length) } } - private int pread(long position, byte[] buffer, int offset, int length) + private int pread(long position, ByteBuffer buffer) throws IOException { // sanity checks dfsClient.checkOpen(); @@ -1292,6 +1307,7 @@ private int pread(long position, byte[] buffer, int offset, int length) if ((position < 0) || (position >= filelen)) { return -1; } + int length = buffer.remaining(); int realLen = length; if ((position + length) > filelen) { realLen = (int)(filelen - position); @@ -1304,14 +1320,16 @@ private int pread(long position, byte[] buffer, int offset, int length) CorruptedBlocks corruptedBlocks = new CorruptedBlocks(); for (LocatedBlock blk : blockRange) { long targetStart = position - blk.getStartOffset(); - long bytesToRead = Math.min(remaining, blk.getBlockSize() - targetStart); + int bytesToRead = (int) Math.min(remaining, + blk.getBlockSize() - targetStart); + long targetEnd = targetStart + bytesToRead - 1; try { if (dfsClient.isHedgedReadsEnabled() && !blk.isStriped()) { hedgedFetchBlockByteRange(blk, targetStart, - targetStart + bytesToRead - 1, buffer, offset, corruptedBlocks); + targetEnd, buffer, corruptedBlocks); } else { - fetchBlockByteRange(blk, targetStart, targetStart + bytesToRead - 1, - buffer, offset, corruptedBlocks); + fetchBlockByteRange(blk, targetStart, targetEnd, + buffer, corruptedBlocks); } } finally { // Check and report if any block replicas are corrupted. @@ -1323,7 +1341,6 @@ private int pread(long position, byte[] buffer, int offset, int length) remaining -= bytesToRead; position += bytesToRead; - offset += bytesToRead; } assert remaining == 0 : "Wrong number of bytes read."; return realLen; @@ -1457,7 +1474,8 @@ private boolean seekToBlockSource(long targetPos) * If another node could not be found, then returns false. 
*/ @Override - public synchronized boolean seekToNewSource(long targetPos) throws IOException { + public synchronized boolean seekToNewSource(long targetPos) + throws IOException { if (currentNode == null) { return seekToBlockSource(targetPos); } diff --git a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DFSStripedInputStream.java b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DFSStripedInputStream.java index 9ca8005f23..ccaf6a78db 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DFSStripedInputStream.java +++ b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DFSStripedInputStream.java @@ -307,8 +307,8 @@ private void readOneStripe(CorruptedBlocks corruptedBlocks) stripeLimit - stripeBufOffset); LocatedStripedBlock blockGroup = (LocatedStripedBlock) currentLocatedBlock; - AlignedStripe[] stripes = StripedBlockUtil.divideOneStripe(ecPolicy, cellSize, - blockGroup, offsetInBlockGroup, + AlignedStripe[] stripes = StripedBlockUtil.divideOneStripe(ecPolicy, + cellSize, blockGroup, offsetInBlockGroup, offsetInBlockGroup + stripeRange.length - 1, curStripeBuf); final LocatedBlock[] blks = StripedBlockUtil.parseStripedBlockGroup( blockGroup, cellSize, dataBlkNum, parityBlkNum); @@ -523,13 +523,13 @@ private LocatedStripedBlock getBlockGroupAt(long offset) throws IOException { */ @Override protected void fetchBlockByteRange(LocatedBlock block, long start, - long end, byte[] buf, int offset, CorruptedBlocks corruptedBlocks) + long end, ByteBuffer buf, CorruptedBlocks corruptedBlocks) throws IOException { // Refresh the striped block group LocatedStripedBlock blockGroup = getBlockGroupAt(block.getStartOffset()); AlignedStripe[] stripes = StripedBlockUtil.divideByteRangeIntoStripes( - ecPolicy, cellSize, blockGroup, start, end, buf, offset); + ecPolicy, cellSize, blockGroup, start, end, buf); CompletionService<Void> readService = new ExecutorCompletionService<>( dfsClient.getStripedReadsThreadPool()); final LocatedBlock[] blks = StripedBlockUtil.parseStripedBlockGroup( @@ -542,6 +542,7 @@ protected void fetchBlockByteRange(LocatedBlock block, long start, blks, preaderInfos, corruptedBlocks); preader.readStripe(); } + buf.position(buf.position() + (int)(end - start + 1)); } finally { for (BlockReaderInfo preaderInfo : preaderInfos) { closeReader(preaderInfo); @@ -698,16 +699,15 @@ boolean createBlockReader(LocatedBlock block, int chunkIndex) } private ByteBufferStrategy[] getReadStrategies(StripingChunk chunk) { - if (chunk.byteBuffer != null) { - ByteBufferStrategy strategy = - new ByteBufferStrategy(chunk.byteBuffer, readStatistics, dfsClient); + if (chunk.useByteBuffer()) { + ByteBufferStrategy strategy = new ByteBufferStrategy( + chunk.getByteBuffer(), readStatistics, dfsClient); return new ByteBufferStrategy[]{strategy}; } else { ByteBufferStrategy[] strategies = - new ByteBufferStrategy[chunk.byteArray.getOffsets().length]; + new ByteBufferStrategy[chunk.getChunkBuffer().getSlices().size()]; for (int i = 0; i < strategies.length; i++) { - ByteBuffer buffer = ByteBuffer.wrap(chunk.byteArray.buf(), - chunk.byteArray.getOffsets()[i], chunk.byteArray.getLengths()[i]); + ByteBuffer buffer = chunk.getChunkBuffer().getSlice(i); strategies[i] = new ByteBufferStrategy(buffer, readStatistics, dfsClient); } @@ -814,7 +814,7 @@ void readStripe() throws IOException { } class PositionStripeReader extends StripeReader { - private byte[][] decodeInputs = null; + private ByteBuffer[]
decodeInputs = null; PositionStripeReader(CompletionService<Void> service, AlignedStripe alignedStripe, LocatedBlock[] targetBlocks, @@ -836,8 +836,6 @@ boolean prepareParityChunk(int index) { Preconditions.checkState(index >= dataBlkNum && alignedStripe.chunks[index] == null); alignedStripe.chunks[index] = new StripingChunk(decodeInputs[index]); - alignedStripe.chunks[index].addByteArraySlice(0, - (int) alignedStripe.getSpanInBlock()); return true; } diff --git a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/util/StripedBlockUtil.java b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/util/StripedBlockUtil.java index c8827d9b56..4dbbc3dd70 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/util/StripedBlockUtil.java +++ b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/util/StripedBlockUtil.java @@ -73,7 +73,8 @@ @InterfaceAudience.Private public class StripedBlockUtil { - public static final Logger LOG = LoggerFactory.getLogger(StripedBlockUtil.class); + public static final Logger LOG = + LoggerFactory.getLogger(StripedBlockUtil.class); /** * Parses a striped block group into individual blocks. @@ -312,16 +313,17 @@ public static long spaceConsumedByStripedBlock(long numDataBlkBytes, * schedule a new fetch request with the decoding input buffer as transfer * destination. */ - public static byte[][] initDecodeInputs(AlignedStripe alignedStripe, + public static ByteBuffer[] initDecodeInputs(AlignedStripe alignedStripe, int dataBlkNum, int parityBlkNum) { - byte[][] decodeInputs = - new byte[dataBlkNum + parityBlkNum][(int) alignedStripe.getSpanInBlock()]; + ByteBuffer[] decodeInputs = new ByteBuffer[dataBlkNum + parityBlkNum]; + for (int i = 0; i < decodeInputs.length; i++) { + decodeInputs[i] = ByteBuffer.allocate( + (int) alignedStripe.getSpanInBlock()); + } // read the full data aligned stripe for (int i = 0; i < dataBlkNum; i++) { if (alignedStripe.chunks[i] == null) { alignedStripe.chunks[i] = new StripingChunk(decodeInputs[i]); - alignedStripe.chunks[i].addByteArraySlice(0, - (int) alignedStripe.getSpanInBlock()); } } return decodeInputs; @@ -334,14 +336,21 @@ public static byte[][] initDecodeInputs(AlignedStripe alignedStripe, * When all pending requests have returned, this method should be called to * finalize decode input buffers. */ - public static void finalizeDecodeInputs(final byte[][] decodeInputs, + public static void finalizeDecodeInputs(final ByteBuffer[] decodeInputs, AlignedStripe alignedStripe) { for (int i = 0; i < alignedStripe.chunks.length; i++) { final StripingChunk chunk = alignedStripe.chunks[i]; if (chunk != null && chunk.state == StripingChunk.FETCHED) { - chunk.copyTo(decodeInputs[i]); + if (chunk.useChunkBuffer()) { + chunk.getChunkBuffer().copyTo(decodeInputs[i]); + } else { + chunk.getByteBuffer().flip(); + } } else if (chunk != null && chunk.state == StripingChunk.ALLZERO) { - Arrays.fill(decodeInputs[i], (byte) 0); + // Zero it out. This will be handled better in a follow-on issue. + byte[] emptyBytes = new byte[decodeInputs[i].limit()]; + decodeInputs[i].put(emptyBytes); + decodeInputs[i].flip(); } else { decodeInputs[i] = null; } @@ -351,7 +360,7 @@ public static void finalizeDecodeInputs(final byte[][] decodeInputs, /** * Decode based on the given input buffers and erasure coding policy.
*/ - public static void decodeAndFillBuffer(final byte[][] decodeInputs, + public static void decodeAndFillBuffer(final ByteBuffer[] decodeInputs, AlignedStripe alignedStripe, int dataBlkNum, int parityBlkNum, RawErasureDecoder decoder) { // Step 1: prepare indices and output buffers for missing data units @@ -364,8 +373,11 @@ public static void decodeAndFillBuffer(final byte[][] decodeInputs, } } decodeIndices = Arrays.copyOf(decodeIndices, pos); - byte[][] decodeOutputs = - new byte[decodeIndices.length][(int) alignedStripe.getSpanInBlock()]; + ByteBuffer[] decodeOutputs = new ByteBuffer[decodeIndices.length]; + for (int i = 0; i < decodeOutputs.length; i++) { + decodeOutputs[i] = ByteBuffer.allocate( + (int) alignedStripe.getSpanInBlock()); + } // Step 2: decode into prepared output buffers decoder.decode(decodeInputs, decodeIndices, decodeOutputs); @@ -374,8 +386,8 @@ public static void decodeAndFillBuffer(final byte[][] decodeInputs, for (int i = 0; i < decodeIndices.length; i++) { int missingBlkIdx = decodeIndices[i]; StripingChunk chunk = alignedStripe.chunks[missingBlkIdx]; - if (chunk.state == StripingChunk.MISSING) { - chunk.copyFrom(decodeOutputs[i]); + if (chunk.state == StripingChunk.MISSING && chunk.useChunkBuffer()) { + chunk.getChunkBuffer().copyFrom(decodeOutputs[i]); } } } @@ -402,7 +414,8 @@ public static AlignedStripe[] divideOneStripe(ErasureCodingPolicy ecPolicy, // Step 4: calculate each chunk's position in destination buffer. Since the // whole read range is within a single stripe, the logic is simpler here. - int bufOffset = (int) (rangeStartInBlockGroup % ((long) cellSize * dataBlkNum)); + int bufOffset = + (int) (rangeStartInBlockGroup % ((long) cellSize * dataBlkNum)); for (StripingCell cell : cells) { long cellStart = cell.idxInInternalBlk * cellSize + cell.offset; long cellEnd = cellStart + cell.size - 1; @@ -437,15 +450,14 @@ public static AlignedStripe[] divideOneStripe(ErasureCodingPolicy ecPolicy, * @param rangeStartInBlockGroup The byte range's start offset in block group * @param rangeEndInBlockGroup The byte range's end offset in block group * @param buf Destination buffer of the read operation for the byte range - * @param offsetInBuf Start offset into the destination buffer * * At most 5 stripes will be generated from each logical range, as * demonstrated in the header of {@link AlignedStripe}. 
*/ - public static AlignedStripe[] divideByteRangeIntoStripes(ErasureCodingPolicy ecPolicy, + public static AlignedStripe[] divideByteRangeIntoStripes( + ErasureCodingPolicy ecPolicy, int cellSize, LocatedStripedBlock blockGroup, - long rangeStartInBlockGroup, long rangeEndInBlockGroup, byte[] buf, - int offsetInBuf) { + long rangeStartInBlockGroup, long rangeEndInBlockGroup, ByteBuffer buf) { // Step 0: analyze range and calculate basic parameters final int dataBlkNum = ecPolicy.getNumDataUnits(); @@ -462,7 +474,7 @@ public static AlignedStripe[] divideByteRangeIntoStripes(ErasureCodingPolicy ecP AlignedStripe[] stripes = mergeRangesForInternalBlocks(ecPolicy, ranges); // Step 4: calculate each chunk's position in destination buffer - calcualteChunkPositionsInBuf(cellSize, stripes, cells, buf, offsetInBuf); + calcualteChunkPositionsInBuf(cellSize, stripes, cells, buf); // Step 5: prepare ALLZERO blocks prepareAllZeroChunks(blockGroup, stripes, cellSize, dataBlkNum); @@ -476,7 +488,8 @@ public static AlignedStripe[] divideByteRangeIntoStripes(ErasureCodingPolicy ecP * used by {@link DFSStripedOutputStream} in encoding */ @VisibleForTesting - private static StripingCell[] getStripingCellsOfByteRange(ErasureCodingPolicy ecPolicy, + private static StripingCell[] getStripingCellsOfByteRange( + ErasureCodingPolicy ecPolicy, int cellSize, LocatedStripedBlock blockGroup, long rangeStartInBlockGroup, long rangeEndInBlockGroup) { Preconditions.checkArgument( @@ -511,7 +524,8 @@ private static StripingCell[] getStripingCellsOfByteRange(ErasureCodingPolicy ec * the physical byte range (inclusive) on each stored internal block. */ @VisibleForTesting - private static VerticalRange[] getRangesForInternalBlocks(ErasureCodingPolicy ecPolicy, + private static VerticalRange[] getRangesForInternalBlocks( + ErasureCodingPolicy ecPolicy, int cellSize, StripingCell[] cells) { int dataBlkNum = ecPolicy.getNumDataUnits(); int parityBlkNum = ecPolicy.getNumParityUnits(); @@ -575,8 +589,7 @@ private static AlignedStripe[] mergeRangesForInternalBlocks( } private static void calcualteChunkPositionsInBuf(int cellSize, - AlignedStripe[] stripes, StripingCell[] cells, byte[] buf, - int offsetInBuf) { + AlignedStripe[] stripes, StripingCell[] cells, ByteBuffer buf) { /** * | <--------------- AlignedStripe --------------->| * @@ -598,6 +611,7 @@ private static void calcualteChunkPositionsInBuf(int cellSize, for (StripingCell cell : cells) { long cellStart = cell.idxInInternalBlk * cellSize + cell.offset; long cellEnd = cellStart + cell.size - 1; + StripingChunk chunk; for (AlignedStripe s : stripes) { long stripeEnd = s.getOffsetInBlock() + s.getSpanInBlock() - 1; long overlapStart = Math.max(cellStart, s.getOffsetInBlock()); @@ -606,11 +620,13 @@ private static void calcualteChunkPositionsInBuf(int cellSize, if (overLapLen <= 0) { continue; } - if (s.chunks[cell.idxInStripe] == null) { - s.chunks[cell.idxInStripe] = new StripingChunk(buf); + chunk = s.chunks[cell.idxInStripe]; + if (chunk == null) { + chunk = new StripingChunk(); + s.chunks[cell.idxInStripe] = chunk; } - s.chunks[cell.idxInStripe].addByteArraySlice( - (int)(offsetInBuf + done + overlapStart - cellStart), overLapLen); + chunk.getChunkBuffer().addSlice(buf, + (int) (done + overlapStart - cellStart), overLapLen); } done += cell.size; } @@ -833,88 +849,89 @@ public static class StripingChunk { */ public int state = REQUESTED; - public final ChunkByteArray byteArray; - public final ByteBuffer byteBuffer; + private final ChunkByteBuffer chunkBuffer; + private 
final ByteBuffer byteBuffer; - public StripingChunk(byte[] buf) { - this.byteArray = new ChunkByteArray(buf); + public StripingChunk() { + this.chunkBuffer = new ChunkByteBuffer(); byteBuffer = null; } public StripingChunk(ByteBuffer buf) { - this.byteArray = null; + this.chunkBuffer = null; this.byteBuffer = buf; } public StripingChunk(int state) { - this.byteArray = null; + this.chunkBuffer = null; this.byteBuffer = null; this.state = state; } - public void addByteArraySlice(int offset, int length) { - assert byteArray != null; - byteArray.offsetsInBuf.add(offset); - byteArray.lengthsInBuf.add(length); + public boolean useByteBuffer() { + return byteBuffer != null; } - void copyTo(byte[] target) { - assert byteArray != null; - byteArray.copyTo(target); + public boolean useChunkBuffer() { + return chunkBuffer != null; } - void copyFrom(byte[] src) { - assert byteArray != null; - byteArray.copyFrom(src); + public ByteBuffer getByteBuffer() { + assert byteBuffer != null; + return byteBuffer; + } + + public ChunkByteBuffer getChunkBuffer() { + assert chunkBuffer != null; + return chunkBuffer; } } - public static class ChunkByteArray { - private final byte[] buf; - private final List<Integer> offsetsInBuf; - private final List<Integer> lengthsInBuf; + /** + * A utility to manage ByteBuffer slices for a reader. + */ + public static class ChunkByteBuffer { + private final List<ByteBuffer> slices; - ChunkByteArray(byte[] buf) { - this.buf = buf; - this.offsetsInBuf = new ArrayList<>(); - this.lengthsInBuf = new ArrayList<>(); + ChunkByteBuffer() { + this.slices = new ArrayList<>(); } - public int[] getOffsets() { - int[] offsets = new int[offsetsInBuf.size()]; - for (int i = 0; i < offsets.length; i++) { - offsets[i] = offsetsInBuf.get(i); + public void addSlice(ByteBuffer buffer, int offset, int len) { + ByteBuffer tmp = buffer.duplicate(); + tmp.position(buffer.position() + offset); + tmp.limit(buffer.position() + offset + len); + slices.add(tmp.slice()); + } + + public ByteBuffer getSlice(int i) { + return slices.get(i); + } + + public List<ByteBuffer> getSlices() { + return slices; + } + + /** + * Note: target will be in ready-to-read state after the call.
+ */ + void copyTo(ByteBuffer target) { + for (ByteBuffer slice : slices) { + slice.flip(); + target.put(slice); } - return offsets; + target.flip(); } - public int[] getLengths() { - int[] lens = new int[this.lengthsInBuf.size()]; - for (int i = 0; i < lens.length; i++) { - lens[i] = this.lengthsInBuf.get(i); - } - return lens; - } - - public byte[] buf() { - return buf; - } - - void copyTo(byte[] target) { - int posInBuf = 0; - for (int i = 0; i < offsetsInBuf.size(); i++) { - System.arraycopy(buf, offsetsInBuf.get(i), - target, posInBuf, lengthsInBuf.get(i)); - posInBuf += lengthsInBuf.get(i); - } - } - - void copyFrom(byte[] src) { - int srcPos = 0; - for (int j = 0; j < offsetsInBuf.size(); j++) { - System.arraycopy(src, srcPos, buf, offsetsInBuf.get(j), - lengthsInBuf.get(j)); - srcPos += lengthsInBuf.get(j); + void copyFrom(ByteBuffer src) { + ByteBuffer tmp; + int len; + for (ByteBuffer slice : slices) { + len = slice.remaining(); + tmp = src.duplicate(); + tmp.limit(tmp.position() + len); + slice.put(tmp); + src.position(src.position() + len); } } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDFSStripedInputStream.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDFSStripedInputStream.java index 18c2de91d7..1e27745e49 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDFSStripedInputStream.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDFSStripedInputStream.java @@ -57,7 +57,8 @@ public class TestDFSStripedInputStream { - public static final Log LOG = LogFactory.getLog(TestDFSStripedInputStream.class); + public static final Log LOG = + LogFactory.getLog(TestDFSStripedInputStream.class); private MiniDFSCluster cluster; private Configuration conf = new Configuration(); @@ -272,12 +273,16 @@ public void testPreadWithDNFailure() throws Exception { // |10 | done += in.read(0, readBuffer, 0, delta); assertEquals(delta, done); + assertArrayEquals(Arrays.copyOf(expected, done), + Arrays.copyOf(readBuffer, done)); // both head and trail cells are partial // |c_0 |c_1 |c_2 |c_3 |c_4 |c_5 | // |256K - 10|missing|256K|256K|256K - 10|not in range| done += in.read(delta, readBuffer, delta, CELLSIZE * (DATA_BLK_NUM - 1) - 2 * delta); assertEquals(CELLSIZE * (DATA_BLK_NUM - 1) - delta, done); + assertArrayEquals(Arrays.copyOf(expected, done), + Arrays.copyOf(readBuffer, done)); // read the rest done += in.read(done, readBuffer, done, readSize - done); assertEquals(readSize, done); @@ -291,8 +296,8 @@ public void testStatefulRead() throws Exception { testStatefulRead(true, true); } - private void testStatefulRead(boolean useByteBuffer, boolean cellMisalignPacket) - throws Exception { + private void testStatefulRead(boolean useByteBuffer, + boolean cellMisalignPacket) throws Exception { final int numBlocks = 2; final int fileSize = numBlocks * BLOCK_GROUP_SIZE; if (cellMisalignPacket) { @@ -302,7 +307,8 @@ private void testStatefulRead(boolean useByteBuffer, boolean cellMisalignPacket) } DFSTestUtil.createStripedFile(cluster, filePath, null, numBlocks, NUM_STRIPE_PER_BLOCK, false); - LocatedBlocks lbs = fs.getClient().namenode.getBlockLocations(filePath.toString(), 0, fileSize); + LocatedBlocks lbs = fs.getClient().namenode. 
+ getBlockLocations(filePath.toString(), 0, fileSize); assert lbs.getLocatedBlocks().size() == numBlocks; for (LocatedBlock lb : lbs.getLocatedBlocks()) { @@ -360,4 +366,111 @@ private void testStatefulRead(boolean useByteBuffer, boolean cellMisalignPacket) } fs.delete(filePath, true); } + + @Test + public void testStatefulReadWithDNFailure() throws Exception { + final int numBlocks = 4; + final int failedDNIdx = DATA_BLK_NUM - 1; + DFSTestUtil.createStripedFile(cluster, filePath, null, numBlocks, + NUM_STRIPE_PER_BLOCK, false); + LocatedBlocks lbs = fs.getClient().namenode.getBlockLocations( + filePath.toString(), 0, BLOCK_GROUP_SIZE); + + assert lbs.get(0) instanceof LocatedStripedBlock; + LocatedStripedBlock bg = (LocatedStripedBlock) (lbs.get(0)); + for (int i = 0; i < DATA_BLK_NUM + PARITY_BLK_NUM; i++) { + Block blk = new Block(bg.getBlock().getBlockId() + i, + NUM_STRIPE_PER_BLOCK * CELLSIZE, + bg.getBlock().getGenerationStamp()); + blk.setGenerationStamp(bg.getBlock().getGenerationStamp()); + cluster.injectBlocks(i, Arrays.asList(blk), + bg.getBlock().getBlockPoolId()); + } + DFSStripedInputStream in = + new DFSStripedInputStream(fs.getClient(), filePath.toString(), false, + ecPolicy, null); + int readSize = BLOCK_GROUP_SIZE; + byte[] readBuffer = new byte[readSize]; + byte[] expected = new byte[readSize]; + /** A variation of {@link DFSTestUtil#fillExpectedBuf} for striped blocks */ + for (int i = 0; i < NUM_STRIPE_PER_BLOCK; i++) { + for (int j = 0; j < DATA_BLK_NUM; j++) { + for (int k = 0; k < CELLSIZE; k++) { + int posInBlk = i * CELLSIZE + k; + int posInFile = i * CELLSIZE * DATA_BLK_NUM + j * CELLSIZE + k; + expected[posInFile] = SimulatedFSDataset.simulatedByte( + new Block(bg.getBlock().getBlockId() + j), posInBlk); + } + } + } + + ErasureCoderOptions coderOptions = new ErasureCoderOptions( + DATA_BLK_NUM, PARITY_BLK_NUM); + RawErasureDecoder rawDecoder = CodecUtil.createRawDecoder(conf, + ecPolicy.getCodecName(), coderOptions); + + // Update the expected content for decoded data + int[] missingBlkIdx = new int[PARITY_BLK_NUM]; + for (int i = 0; i < missingBlkIdx.length; i++) { + if (i == 0) { + missingBlkIdx[i] = failedDNIdx; + } else { + missingBlkIdx[i] = DATA_BLK_NUM + i; + } + } + cluster.stopDataNode(failedDNIdx); + for (int i = 0; i < NUM_STRIPE_PER_BLOCK; i++) { + byte[][] decodeInputs = new byte[DATA_BLK_NUM + PARITY_BLK_NUM][CELLSIZE]; + byte[][] decodeOutputs = new byte[missingBlkIdx.length][CELLSIZE]; + for (int j = 0; j < DATA_BLK_NUM; j++) { + int posInBuf = i * CELLSIZE * DATA_BLK_NUM + j * CELLSIZE; + if (j != failedDNIdx) { + System.arraycopy(expected, posInBuf, decodeInputs[j], 0, CELLSIZE); + } + } + for (int j = DATA_BLK_NUM; j < DATA_BLK_NUM + PARITY_BLK_NUM; j++) { + for (int k = 0; k < CELLSIZE; k++) { + int posInBlk = i * CELLSIZE + k; + decodeInputs[j][k] = SimulatedFSDataset.simulatedByte( + new Block(bg.getBlock().getBlockId() + j), posInBlk); + } + } + for (int m : missingBlkIdx) { + decodeInputs[m] = null; + } + rawDecoder.decode(decodeInputs, missingBlkIdx, decodeOutputs); + int posInBuf = i * CELLSIZE * DATA_BLK_NUM + failedDNIdx * CELLSIZE; + System.arraycopy(decodeOutputs[0], 0, expected, posInBuf, CELLSIZE); + } + + int delta = 10; + int done = 0; + // read a small delta, shouldn't trigger decode + // |cell_0 | + // |10 | + done += in.read(readBuffer, 0, delta); + assertEquals(delta, done); + // both head and trail cells are partial + // |c_0 |c_1 |c_2 |c_3 |c_4 |c_5 | + // |256K - 10|missing|256K|256K|256K - 10|not in range| + while (done 
< (CELLSIZE * (DATA_BLK_NUM - 1) - 2 * delta)) { + int ret = in.read(readBuffer, delta, + CELLSIZE * (DATA_BLK_NUM - 1) - 2 * delta); + assertTrue(ret > 0); + done += ret; + } + assertEquals(CELLSIZE * (DATA_BLK_NUM - 1) - delta, done); + // read the rest + + int restSize; + restSize = readSize - done; + while (done < restSize) { + int ret = in.read(readBuffer, done, restSize); + assertTrue(ret > 0); + done += ret; + } + + assertEquals(readSize, done); + assertArrayEquals(expected, readBuffer); + } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/util/TestStripedBlockUtil.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/util/TestStripedBlockUtil.java index 96fc79c46c..7d9d7dc540 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/util/TestStripedBlockUtil.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/util/TestStripedBlockUtil.java @@ -36,6 +36,7 @@ import org.junit.Test; import org.junit.rules.Timeout; +import java.nio.ByteBuffer; import java.util.Random; import static org.junit.Assert.assertEquals; @@ -242,7 +243,8 @@ public void testGetInternalBlockLength () { */ @Test public void testDivideByteRangeIntoStripes() { - byte[] assembled = new byte[BLK_GROUP_STRIPE_NUM * FULL_STRIPE_SIZE]; + ByteBuffer assembled = + ByteBuffer.allocate(BLK_GROUP_STRIPE_NUM * FULL_STRIPE_SIZE); for (int bgSize : blockGroupSizes) { LocatedStripedBlock blockGroup = createDummyLocatedBlock(bgSize); byte[][] internalBlkBufs = createInternalBlkBuffers(bgSize); @@ -252,7 +254,7 @@ public void testDivideByteRangeIntoStripes() { continue; } AlignedStripe[] stripes = divideByteRangeIntoStripes(EC_POLICY, - CELLSIZE, blockGroup, brStart, brStart + brSize - 1, assembled, 0); + CELLSIZE, blockGroup, brStart, brStart + brSize - 1, assembled); for (AlignedStripe stripe : stripes) { for (int i = 0; i < DATA_BLK_NUM; i++) { @@ -261,21 +263,21 @@ public void testDivideByteRangeIntoStripes() { continue; } int done = 0; - for (int j = 0; j < chunk.byteArray.getLengths().length; j++) { - System.arraycopy(internalBlkBufs[i], - (int) stripe.getOffsetInBlock() + done, assembled, - chunk.byteArray.getOffsets()[j], - chunk.byteArray.getLengths()[j]); - done += chunk.byteArray.getLengths()[j]; + int len; + for (ByteBuffer slice : chunk.getChunkBuffer().getSlices()) { + len = slice.remaining(); + slice.put(internalBlkBufs[i], + (int) stripe.getOffsetInBlock() + done, len); + done += len; } } } for (int i = 0; i < brSize; i++) { - if (hashIntToByte(brStart + i) != assembled[i]) { + if (hashIntToByte(brStart + i) != assembled.get(i)) { System.out.println("Oops"); } assertEquals("Byte at " + (brStart + i) + " should be the same", - hashIntToByte(brStart + i), assembled[i]); + hashIntToByte(brStart + i), assembled.get(i)); } } }
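
The sketches below illustrate, in isolation, the ByteBuffer idioms this patch relies on. They are illustrative sketches under stated assumptions, not code from the patch. First, the DataChecksum hunk: native CRC verification is now gated on data.isDirect(), since the native code needs a stable off-heap address, while heap buffers keep the array-based fallback. nativeVerify() and javaVerify() below are hypothetical stand-ins for the two paths.

import java.nio.ByteBuffer;

// Sketch of the DataChecksum gate: take the native CRC path only for
// direct buffers; heap buffers fall back to the array-based path.
public final class ChecksumGateSketch {
  static void verify(ByteBuffer data) {
    if (data.isDirect()) {
      nativeVerify(data);  // native code wants a stable off-heap address
    } else {
      javaVerify(data);    // heap buffer: array()-based verification
    }
  }
  static void nativeVerify(ByteBuffer b) { /* hypothetical stand-in */ }
  static void javaVerify(ByteBuffer b) { /* hypothetical stand-in */ }
}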
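
The loop that replaces reader.readAll(buf, offset, len) in actualGetFromOneDataNode() reads through a sliced duplicate, so the block reader can only ever fill the requested len bytes and the caller's buffer position advances by exactly the bytes read. A minimal sketch of the same pattern, with java.nio.channels.ReadableByteChannel standing in for the HDFS BlockReader (an assumption for illustration):

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.ReadableByteChannel;

// Bounded-read sketch: the duplicate shares content with dst but has an
// independent cursor; capping its limit and slicing yields a window the
// reader cannot overrun.
public final class BoundedReadSketch {
  static int readFully(ReadableByteChannel reader, ByteBuffer dst, int len)
      throws IOException {
    ByteBuffer tmp = dst.duplicate();      // independent position/limit
    tmp.limit(tmp.position() + len);       // cap the view at len bytes
    tmp = tmp.slice();                     // window starting at position 0
    int nread = 0;
    while (nread < len) {
      int ret = reader.read(tmp);
      if (ret <= 0) {
        break;                             // EOF or no progress
      }
      nread += ret;
    }
    dst.position(dst.position() + nread);  // reflect progress to the caller
    return nread;
  }
}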
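
In hedgedFetchBlockByteRange(), result.flip() followed by buf.put(result) replaces System.arraycopy(result.array(), ...): flip() switches the winning future's buffer from fill mode to drain mode, and put() honors the destination's current position and also works for direct buffers, which have no backing array. A self-contained sketch:

import java.nio.ByteBuffer;

// Hedged-read copy sketch: flip the filled buffer, then bulk put() it
// into the destination at the destination's current position.
public final class HedgedCopySketch {
  public static void main(String[] args) {
    ByteBuffer result = ByteBuffer.allocate(8);
    result.put(new byte[] {1, 2, 3, 4});  // pretend a datanode filled 4 bytes
    ByteBuffer dest = ByteBuffer.allocate(16);

    result.flip();                        // position = 0, limit = 4
    dest.put(result);                     // copies exactly the 4 filled bytes
    assert dest.position() == 4;
  }
}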
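
The byte[] overload of read(long, byte[], int, int) now funnels into the ByteBuffer-based pread() via ByteBuffer.wrap(buffer, offset, length). The wrapped view has position() == offset and remaining() == length, which is exactly how pread() derives the request length. A small sketch:

import java.nio.ByteBuffer;

// Wrap sketch: a heap ByteBuffer view over a byte[] lets the array and
// buffer pread paths share one implementation; writes go straight through.
public final class PreadWrapSketch {
  public static void main(String[] args) {
    byte[] buffer = new byte[1024];
    int offset = 100;
    int length = 512;
    ByteBuffer bb = ByteBuffer.wrap(buffer, offset, length);
    assert bb.remaining() == length;      // pread() reads the length this way
    bb.put((byte) 42);                    // writes through to buffer[100]
    assert buffer[offset] == 42;
  }
}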
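
ChunkByteBuffer.addSlice() builds independent windows into the shared destination buffer with the duplicate()/position()/limit()/slice() sequence, so per-chunk reads land at the right offsets without ever moving the destination's own cursor. A sketch of that bookkeeping (the offsets and lengths are made up for illustration):

import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;

// Slice sketch: each slice shares storage with dest but carries its own
// position and limit, so filling one slice never disturbs the others.
public final class SliceSketch {
  public static void main(String[] args) {
    ByteBuffer dest = ByteBuffer.allocate(64);
    List<ByteBuffer> slices = new ArrayList<>();

    for (int offset : new int[] {0, 16, 48}) {  // addSlice(dest, offset, 8)
      ByteBuffer tmp = dest.duplicate();
      tmp.position(dest.position() + offset);
      tmp.limit(dest.position() + offset + 8);
      slices.add(tmp.slice());
    }
    slices.get(1).put(new byte[] {7, 7, 7});    // fills dest[16..18]
    assert dest.get(16) == 7 && dest.position() == 0;
  }
}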
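
Finally, finalizeDecodeInputs() leaves every decode input in a ready-to-read state: a fetched chunk is copied or flipped, an ALLZERO chunk is explicitly zero-filled and flipped, and a missing chunk becomes null so the decoder treats that unit as an erasure. A sketch of the three buffer states in plain java.nio (no Hadoop types):

import java.nio.ByteBuffer;

// Decode-input sketch: after finalization the decoder sees each unit as
// either readable bytes [0, limit) or null (an erasure to reconstruct).
public final class DecodeInputSketch {
  public static void main(String[] args) {
    int span = 8;

    ByteBuffer fetched = ByteBuffer.allocate(span);
    fetched.put(new byte[] {1, 2, 3, 4, 5, 6, 7, 8});
    fetched.flip();                        // readable: position 0, limit 8

    ByteBuffer allZero = ByteBuffer.allocate(span);
    allZero.put(new byte[span]);           // explicit zero fill, as in the patch
    allZero.flip();

    ByteBuffer[] decodeInputs = {fetched, allZero, null /* erased unit */};
    assert decodeInputs[0].remaining() == span;
  }
}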