HDFS-8320. Erasure coding: consolidate striping-related terminologies. Contributed by Zhe Zhang and Jing Zhao.

Zhe Zhang 2015-05-18 22:55:27 -07:00 committed by Zhe Zhang
parent b008348dbf
commit 7434c44b16
7 changed files with 403 additions and 417 deletions


@ -223,3 +223,5 @@
    HDFS-8418. Fix the isNeededReplication calculation for Striped block in NN.
    (Yi Liu via jing9)

+   HDFS-8320. Erasure coding: consolidate striping-related terminologies. (zhz)


@ -23,19 +23,18 @@
import org.apache.hadoop.fs.StorageType;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
-import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedStripedBlock;
import org.apache.hadoop.hdfs.protocol.datatransfer.InvalidEncryptionKeyException;
import org.apache.hadoop.hdfs.util.StripedBlockUtil;
import org.apache.hadoop.io.ByteBufferPool;
-import static org.apache.hadoop.hdfs.util.StripedBlockUtil.planReadPortions;
import static org.apache.hadoop.hdfs.util.StripedBlockUtil.divideByteRangeIntoStripes;
import static org.apache.hadoop.hdfs.util.StripedBlockUtil.initDecodeInputs;
import static org.apache.hadoop.hdfs.util.StripedBlockUtil.decodeAndFillBuffer;
import static org.apache.hadoop.hdfs.util.StripedBlockUtil.getNextCompletedStripedRead;
-import static org.apache.hadoop.hdfs.util.StripedBlockUtil.ReadPortion;
+import static org.apache.hadoop.hdfs.util.StripedBlockUtil.getStartOffsetsForInternalBlocks;
+import static org.apache.hadoop.hdfs.util.StripedBlockUtil.parseStripedBlockGroup;
import static org.apache.hadoop.hdfs.util.StripedBlockUtil.AlignedStripe;
import static org.apache.hadoop.hdfs.util.StripedBlockUtil.StripingChunk;
import static org.apache.hadoop.hdfs.util.StripedBlockUtil.StripingChunkReadResult;
@ -65,30 +64,9 @@
import java.util.concurrent.Callable;
import java.util.concurrent.Future;

-/******************************************************************************
- * DFSStripedInputStream reads from striped block groups, illustrated below:
- *
- * | <- Striped Block Group -> |
- *  blk_0      blk_1      blk_2    <- A striped block group has
- *    |          |          |           {@link #dataBlkNum} blocks
- *    v          v          v
- * +------+   +------+   +------+
- * |cell_0|   |cell_1|   |cell_2|  <- The logical read order should be
- * +------+   +------+   +------+       cell_0, cell_1, ...
- * |cell_3|   |cell_4|   |cell_5|
- * +------+   +------+   +------+
- * |cell_6|   |cell_7|   |cell_8|
- * +------+   +------+   +------+
- * |cell_9|
- * +------+  <- A cell contains {@link #cellSize} bytes of data
- *
- * Three styles of read will eventually be supported:
- *   1. Stateful read
- *   2. pread without decode support
- *      This is implemented by calculating the portion of read from each block and
- *      issuing requests to each DataNode in parallel.
- *   3. pread with decode support: TODO: will be supported after HDFS-7678
- *****************************************************************************/
+/**
+ * DFSStripedInputStream reads from striped block groups
+ */
public class DFSStripedInputStream extends DFSInputStream {

  private static class ReaderRetryPolicy {
@ -207,22 +185,24 @@ private synchronized void blockSeekTo(long target) throws IOException {
      currentLocatedBlock = targetBlockGroup;

      final long offsetIntoBlockGroup = getOffsetInBlockGroup();
-     LocatedBlock[] targetBlocks = StripedBlockUtil.parseStripedBlockGroup(
+     LocatedBlock[] targetBlocks = parseStripedBlockGroup(
          targetBlockGroup, cellSize, dataBlkNum, parityBlkNum);
-     // The purpose is to get start offset into each block
-     ReadPortion[] readPortions = planReadPortions(groupSize, cellSize,
-         offsetIntoBlockGroup, 0, 0);
+     // The purpose is to get start offset into each block.
+     long[] offsetsForInternalBlocks = getStartOffsetsForInternalBlocks(schema,
+         targetBlockGroup, offsetIntoBlockGroup);
+     Preconditions.checkNotNull(offsetsForInternalBlocks);

      final ReaderRetryPolicy retry = new ReaderRetryPolicy();
      for (int i = 0; i < groupSize; i++) {
        LocatedBlock targetBlock = targetBlocks[i];
        if (targetBlock != null) {
+         long offsetInBlock = offsetsForInternalBlocks[i] < 0 ?
+             0 : offsetsForInternalBlocks[i];
          DNAddrPair retval = getBestNodeDNAddrPair(targetBlock, null);
          if (retval != null) {
            currentNodes[i] = retval.info;
            blockReaders[i] = getBlockReaderWithRetry(targetBlock,
-               readPortions[i].getStartOffsetInBlock(),
-               targetBlock.getBlockSize() - readPortions[i].getStartOffsetInBlock(),
+               offsetInBlock, targetBlock.getBlockSize() - offsetInBlock,
                retval.addr, retval.storageType, retval.info, target, retry);
          }
        }


@ -301,12 +301,12 @@ private class ReconstructAndTransferBlock implements Runnable {
    }

    private ExtendedBlock getBlock(ExtendedBlock blockGroup, int i) {
-     return StripedBlockUtil.constructStripedBlock(blockGroup, cellSize,
+     return StripedBlockUtil.constructInternalBlock(blockGroup, cellSize,
          dataBlkNum, i);
    }

    private long getBlockLen(ExtendedBlock blockGroup, int i) {
-     return StripedBlockUtil.getStripedBlockLength(blockGroup.getNumBytes(),
+     return StripedBlockUtil.getInternalBlockLength(blockGroup.getNumBytes(),
          cellSize, dataBlkNum, i);
    }


@ -41,7 +41,28 @@
import java.util.concurrent.TimeUnit;

/**
- * Utility class for analyzing striped block groups
+ * When accessing a file in striped layout, operations on logical byte ranges
+ * in the file need to be mapped to physical byte ranges on block files stored
+ * on DataNodes. This utility class facilitates this mapping by defining and
+ * exposing a number of striping-related concepts. The most basic ones are
+ * illustrated in the following diagram. Unless otherwise specified, all
+ * range-related calculations are inclusive (the end offset of the previous
+ * range should be 1 byte lower than the start offset of the next one).
+ *
+ * | <----  Block Group ----> |   <- Block Group: logical unit composing
+ * |                          |        striped HDFS files.
+ *  blk_0      blk_1     blk_2   <- Internal Blocks: each internal block
+ *    |          |         |          represents a physically stored local
+ *    v          v         v          block file
+ * +------+   +------+   +------+
+ * |cell_0|   |cell_1|   |cell_2|  <- {@link StripingCell} represents the
+ * +------+   +------+   +------+       logical order that a Block Group should
+ * |cell_3|   |cell_4|   |cell_5|       be accessed: cell_0, cell_1, ...
+ * +------+   +------+   +------+
+ * |cell_6|   |cell_7|   |cell_8|
+ * +------+   +------+   +------+
+ * |cell_9|
+ * +------+  <- A cell contains cellSize bytes of data
 */
@InterfaceAudience.Private
public class StripedBlockUtil {
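The diagram above is easiest to read with a concrete number attached. The sketch below mirrors the index arithmetic that StripingCell performs in this patch; the 3-data-block layout and the 64 KB cell size are assumed example values for illustration, not the HdfsConstants defaults.

// Standalone sketch of the cell arithmetic illustrated above; the schema
// parameters (3 data blocks, 64 KB cells) are assumed for illustration only.
public class StripingMappingExample {
  public static void main(String[] args) {
    final int numDataBlocks = 3;     // blk_0 .. blk_2 in the diagram
    final int cellSize = 64 * 1024;  // assumed cell size

    long offsetInBlockGroup = 5L * cellSize + 100;               // inside cell_5

    int idxInBlkGroup = (int) (offsetInBlockGroup / cellSize);   // 5
    int idxInInternalBlk = idxInBlkGroup / numDataBlocks;        // 1 (second row)
    int idxInStripe = idxInBlkGroup % numDataBlocks;             // 2 (blk_2)
    long offsetInCell = offsetInBlockGroup % cellSize;           // 100
    long offsetInInternalBlk = (long) idxInInternalBlk * cellSize + offsetInCell;

    // Prints: cell_5 is stored in blk_2 at offset 65636
    System.out.println("cell_" + idxInBlkGroup + " is stored in blk_"
        + idxInStripe + " at offset " + offsetInInternalBlk);
  }
}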
@ -103,31 +124,6 @@ public static ExtendedBlock constructInternalBlock(ExtendedBlock blockGroup,
        cellSize, dataBlkNum, idxInBlockGroup));
    return block;
  }
/**
* This method creates an internal {@link ExtendedBlock} at the given index
* of a block group, for both data and parity block.
*/
public static ExtendedBlock constructStripedBlock(ExtendedBlock blockGroup,
int cellSize, int dataBlkNum, int idxInBlockGroup) {
ExtendedBlock block = new ExtendedBlock(blockGroup);
block.setBlockId(blockGroup.getBlockId() + idxInBlockGroup);
block.setNumBytes(getStripedBlockLength(blockGroup.getNumBytes(), cellSize,
dataBlkNum, idxInBlockGroup));
return block;
}
/**
* Returns an internal block length at the given index of a block group,
* for both data and parity block.
*/
public static long getStripedBlockLength(long numBytes, int cellSize,
int dataBlkNum, int idxInBlockGroup) {
// parity block length is the same as the first striped block length.
return StripedBlockUtil.getInternalBlockLength(
numBytes, cellSize, dataBlkNum,
idxInBlockGroup < dataBlkNum ? idxInBlockGroup : 0);
}
  /**
   * Get the size of an internal block at the given index of a block group
@ -157,7 +153,7 @@ public static long getInternalBlockLength(long dataSize,
    return (numStripes - 1L)*cellSize
        + lastCellSize(lastStripeDataLen, cellSize, numDataBlocks, i);
  }

  private static int lastCellSize(int size, int cellSize, int numDataBlocks,
      int i) {
    if (i < numDataBlocks) {
@ -183,60 +179,6 @@ public static long offsetInBlkToOffsetInBG(int cellSize, int dataBlkNum,
        + offsetInBlk % cellSize; // partial cell
  }
/**
* This method plans the read portion from each block in the stripe
* @param dataBlkNum The number of data blocks in the striping group
* @param cellSize The size of each striping cell
* @param startInBlk Starting offset in the striped block
* @param len Length of the read request
* @param bufOffset Initial offset in the result buffer
* @return array of {@link ReadPortion}, each representing the portion of I/O
* for an individual block in the group
*/
@VisibleForTesting
public static ReadPortion[] planReadPortions(final int dataBlkNum,
final int cellSize, final long startInBlk, final int len, int bufOffset) {
ReadPortion[] results = new ReadPortion[dataBlkNum];
for (int i = 0; i < dataBlkNum; i++) {
results[i] = new ReadPortion();
}
// cellIdxInBlk is the index of the cell in the block
// E.g., cell_3 is the 2nd cell in blk_0
int cellIdxInBlk = (int) (startInBlk / (cellSize * dataBlkNum));
// blkIdxInGroup is the index of the block in the striped block group
// E.g., blk_2 is the 3rd block in the group
final int blkIdxInGroup = (int) (startInBlk / cellSize % dataBlkNum);
results[blkIdxInGroup].setStartOffsetInBlock(cellSize * cellIdxInBlk +
startInBlk % cellSize);
boolean crossStripe = false;
for (int i = 1; i < dataBlkNum; i++) {
if (blkIdxInGroup + i >= dataBlkNum && !crossStripe) {
cellIdxInBlk++;
crossStripe = true;
}
results[(blkIdxInGroup + i) % dataBlkNum].setStartOffsetInBlock(
cellSize * cellIdxInBlk);
}
int firstCellLen = Math.min(cellSize - (int) (startInBlk % cellSize), len);
results[blkIdxInGroup].offsetsInBuf.add(bufOffset);
results[blkIdxInGroup].lengths.add(firstCellLen);
results[blkIdxInGroup].addReadLength(firstCellLen);
int i = (blkIdxInGroup + 1) % dataBlkNum;
for (int done = firstCellLen; done < len; done += cellSize) {
ReadPortion rp = results[i];
rp.offsetsInBuf.add(done + bufOffset);
final int readLen = Math.min(len - done, cellSize);
rp.lengths.add(readLen);
rp.addReadLength(readLen);
i = (i + 1) % dataBlkNum;
}
return results;
}
  /**
   * Get the next completed striped read task
   *
@ -360,84 +302,167 @@ public static void decodeAndFillBuffer(final byte[][] decodeInputs, byte[] buf,
  }

  /**
-  * This method divides a requested byte range into an array of
-  * {@link AlignedStripe}
+  * This method divides a requested byte range into an array of inclusive
+  * {@link AlignedStripe}.
+  * @param ecSchema The codec schema for the file, which carries the numbers
+  *                 of data / parity blocks, as well as cell size
+  * @param blockGroup The striped block group
+  * @param rangeStartInBlockGroup The byte range's start offset in block group
+  * @param rangeEndInBlockGroup The byte range's end offset in block group
+  * @param buf Destination buffer of the read operation for the byte range
+  * @param offsetInBuf Start offset into the destination buffer
   *
-  * At most 5 stripes will be generated from each logical range
-  * TODO: cleanup and get rid of planReadPortions
+  * At most 5 stripes will be generated from each logical range, as
+  * demonstrated in the header of {@link AlignedStripe}.
   */
  public static AlignedStripe[] divideByteRangeIntoStripes (
-     ECSchema ecSchema, LocatedStripedBlock blockGroup, long start, long end,
-     byte[] buf, int offsetInBuf) {
+     ECSchema ecSchema, LocatedStripedBlock blockGroup,
+     long rangeStartInBlockGroup, long rangeEndInBlockGroup, byte[] buf,
+     int offsetInBuf) {
    // TODO: change ECSchema naming to use cell size instead of chunk size

    // Step 0: analyze range and calculate basic parameters
    int cellSize = ecSchema.getChunkSize();
    int dataBlkNum = ecSchema.getNumDataUnits();
-   int len = (int) (end - start + 1);
-   int firstCellIdxInBG = (int) (start / cellSize);
-   int lastCellIdxInBG = (int) (end / cellSize);
-   int firstCellSize = Math.min(cellSize - (int) (start % cellSize), len);
-   long firstCellOffsetInBlk = firstCellIdxInBG / dataBlkNum * cellSize +
-       start % cellSize;
-   int lastCellSize = lastCellIdxInBG == firstCellIdxInBG ?
-       firstCellSize : (int) (end % cellSize) + 1;

-   // Step 1: get the unmerged ranges on each internal block
-   // TODO: StripingCell should carry info on size and start offset (HDFS-8320)
-   VerticalRange[] ranges = getRangesForInternalBlocks(ecSchema,
-       firstCellIdxInBG, lastCellIdxInBG, firstCellSize, firstCellOffsetInBlk,
-       lastCellSize);
+   // Step 1: map the byte range to StripingCells
+   StripingCell[] cells = getStripingCellsOfByteRange(ecSchema, blockGroup,
+       rangeStartInBlockGroup, rangeEndInBlockGroup);

-   // Step 2: merge into at most 5 stripes
+   // Step 2: get the unmerged ranges on each internal block
+   VerticalRange[] ranges = getRangesForInternalBlocks(ecSchema, cells);

+   // Step 3: merge into at most 5 stripes
    AlignedStripe[] stripes = mergeRangesForInternalBlocks(ecSchema, ranges);

-   // Step 3: calculate each chunk's position in destination buffer
-   calcualteChunkPositionsInBuf(ecSchema, blockGroup, buf, offsetInBuf,
-       firstCellIdxInBG, lastCellIdxInBG, firstCellSize, firstCellOffsetInBlk,
-       lastCellSize, stripes);
+   // Step 4: calculate each chunk's position in destination buffer
+   calcualteChunkPositionsInBuf(ecSchema, stripes, cells, buf, offsetInBuf);

-   // Step 4: prepare ALLZERO blocks
+   // Step 5: prepare ALLZERO blocks
    prepareAllZeroChunks(blockGroup, buf, stripes, cellSize, dataBlkNum);

    return stripes;
  }
- private static VerticalRange[] getRangesForInternalBlocks (ECSchema ecSchema,
-     int firstCellIdxInBG, int lastCellIdxInBG, int firstCellSize,
-     long firstCellOffsetInBlk, int lastCellSize) {
-   int cellSize = ecSchema.getChunkSize();
-   int dataBlkNum = ecSchema.getNumDataUnits();
-
-   StripingCell firstCell = new StripingCell(ecSchema, firstCellIdxInBG);
-   StripingCell lastCell = new StripingCell(ecSchema, lastCellIdxInBG);
-
-   VerticalRange ranges[] = new VerticalRange[dataBlkNum];
-   ranges[firstCell.idxInStripe] =
-       new VerticalRange(firstCellOffsetInBlk, firstCellSize);
-   for (int i = firstCellIdxInBG + 1; i < lastCellIdxInBG; i++) {
-     // iterate through all cells and update the list of StripeRanges
-     StripingCell cell = new StripingCell(ecSchema, i);
-     if (ranges[cell.idxInStripe] == null) {
-       ranges[cell.idxInStripe] = new VerticalRange(
-           cell.idxInInternalBlk * cellSize, cellSize);
-     } else {
-       ranges[cell.idxInStripe].spanInBlock += cellSize;
-     }
-   }
-   if (ranges[lastCell.idxInStripe] == null) {
-     ranges[lastCell.idxInStripe] = new VerticalRange(
-         lastCell.idxInInternalBlk * cellSize, lastCellSize);
-   } else if (lastCell.idxInBlkGroup != firstCell.idxInBlkGroup) {
-     ranges[lastCell.idxInStripe].spanInBlock += lastCellSize;
-   }
-
-   return ranges;
- }

+ /**
+  * Map the logical byte range to a set of inclusive {@link StripingCell}
+  * instances, each representing the overlap of the byte range to a cell
+  * used by {@link DFSStripedOutputStream} in encoding
+  */
+ @VisibleForTesting
+ private static StripingCell[] getStripingCellsOfByteRange(ECSchema ecSchema,
+     LocatedStripedBlock blockGroup,
+     long rangeStartInBlockGroup, long rangeEndInBlockGroup) {
+   Preconditions.checkArgument(
+       rangeStartInBlockGroup <= rangeEndInBlockGroup &&
+       rangeEndInBlockGroup < blockGroup.getBlockSize());
+   int cellSize = ecSchema.getChunkSize();
+   int len = (int) (rangeEndInBlockGroup - rangeStartInBlockGroup + 1);
+   int firstCellIdxInBG = (int) (rangeStartInBlockGroup / cellSize);
+   int lastCellIdxInBG = (int) (rangeEndInBlockGroup / cellSize);
+   int numCells = lastCellIdxInBG - firstCellIdxInBG + 1;
+   StripingCell[] cells = new StripingCell[numCells];
+   cells[0] = new StripingCell(ecSchema, firstCellIdxInBG);
+   cells[numCells - 1] = new StripingCell(ecSchema, lastCellIdxInBG);
+   cells[0].offset = (int) (rangeStartInBlockGroup % cellSize);
+   cells[0].size =
+       Math.min(cellSize - (int) (rangeStartInBlockGroup % cellSize), len);
+   if (lastCellIdxInBG != firstCellIdxInBG) {
+     cells[numCells - 1].size = (int) (rangeEndInBlockGroup % cellSize) + 1;
+   }
+   for (int i = 1; i < numCells - 1; i++) {
+     cells[i] = new StripingCell(ecSchema, i + firstCellIdxInBG);
+   }
+   return cells;
+ }
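To make the trimming of the first and last cell concrete, here is the same arithmetic run by hand; the 64 KB cell size and the byte range are assumed example values rather than schema defaults.

// Worked example of the range-to-cell trimming performed above.
public class CellRangeExample {
  public static void main(String[] args) {
    final int cellSize = 64 * 1024;        // assumed cell size
    long rangeStart = 100;                 // 100 bytes into cell_0
    long rangeEnd = 2L * cellSize + 50;    // ends at offset 50 within cell_2 (inclusive)

    int len = (int) (rangeEnd - rangeStart + 1);             // 131023
    int firstCellIdxInBG = (int) (rangeStart / cellSize);    // 0
    int lastCellIdxInBG = (int) (rangeEnd / cellSize);       // 2
    int numCells = lastCellIdxInBG - firstCellIdxInBG + 1;   // 3 cells

    int firstCellOffset = (int) (rangeStart % cellSize);             // 100
    int firstCellSize = Math.min(cellSize - firstCellOffset, len);   // 65436
    int lastCellSize = (int) (rangeEnd % cellSize) + 1;              // 51

    // cell_0 covers [100, 65535] of its cell, cell_1 is a full cell,
    // and cell_2 covers [0, 50]; together they add up to len bytes.
    System.out.println(numCells + " cells, sizes: " + firstCellSize + ", "
        + cellSize + ", " + lastCellSize);
  }
}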
+ /**
+  * Given a logical start offset in a block group, calculate the physical
+  * start offset into each stored internal block.
+  */
+ public static long[] getStartOffsetsForInternalBlocks(
+     ECSchema ecSchema, LocatedStripedBlock blockGroup,
+     long rangeStartInBlockGroup) {
+   Preconditions.checkArgument(
+       rangeStartInBlockGroup < blockGroup.getBlockSize());
+   int dataBlkNum = ecSchema.getNumDataUnits();
+   int parityBlkNum = ecSchema.getNumParityUnits();
+   int cellSize = ecSchema.getChunkSize();
+   long[] startOffsets = new long[dataBlkNum + parityBlkNum];
+   Arrays.fill(startOffsets, -1L);
+   int firstCellIdxInBG = (int) (rangeStartInBlockGroup / cellSize);
+   StripingCell firstCell = new StripingCell(ecSchema, firstCellIdxInBG);
+   firstCell.offset = (int) (rangeStartInBlockGroup % cellSize);
+   startOffsets[firstCell.idxInStripe] =
+       firstCell.idxInInternalBlk * cellSize + firstCell.offset;
+   long earliestStart = startOffsets[firstCell.idxInStripe];
+   for (int i = 1; i < dataBlkNum; i++) {
+     int idx = firstCellIdxInBG + i;
+     if (idx * cellSize >= blockGroup.getBlockSize()) {
+       break;
+     }
+     StripingCell cell = new StripingCell(ecSchema, idx);
+     startOffsets[cell.idxInStripe] = cell.idxInInternalBlk * cellSize;
+     if (startOffsets[cell.idxInStripe] < earliestStart) {
+       earliestStart = startOffsets[cell.idxInStripe];
+     }
+   }
+   for (int i = dataBlkNum; i < dataBlkNum + parityBlkNum; i++) {
+     startOffsets[i] = earliestStart;
+   }
+   return startOffsets;
+ }

+ /**
+  * Given a logical byte range, mapped to each {@link StripingCell}, calculate
+  * the physical byte range (inclusive) on each stored internal block.
+  */
+ @VisibleForTesting
+ private static VerticalRange[] getRangesForInternalBlocks(ECSchema ecSchema,
+     StripingCell[] cells) {
+   int cellSize = ecSchema.getChunkSize();
+   int dataBlkNum = ecSchema.getNumDataUnits();
+   int parityBlkNum = ecSchema.getNumParityUnits();
+
+   VerticalRange ranges[] = new VerticalRange[dataBlkNum + parityBlkNum];
+
+   long earliestStart = Long.MAX_VALUE;
+   long latestEnd = -1;
+   for (StripingCell cell : cells) {
+     // iterate through all cells and update the list of StripeRanges
+     if (ranges[cell.idxInStripe] == null) {
+       ranges[cell.idxInStripe] = new VerticalRange(
+           cell.idxInInternalBlk * cellSize + cell.offset, cell.size);
+     } else {
+       ranges[cell.idxInStripe].spanInBlock += cell.size;
+     }
+     VerticalRange range = ranges[cell.idxInStripe];
+     if (range.offsetInBlock < earliestStart) {
+       earliestStart = range.offsetInBlock;
+     }
+     if (range.offsetInBlock + range.spanInBlock - 1 > latestEnd) {
+       latestEnd = range.offsetInBlock + range.spanInBlock - 1;
+     }
+   }
+
+   // Each parity block should be fetched at maximum range of all data blocks
+   for (int i = dataBlkNum; i < dataBlkNum + parityBlkNum; i++) {
+     ranges[i] = new VerticalRange(earliestStart,
+         latestEnd - earliestStart + 1);
+   }
+
+   return ranges;
+ }
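A hand trace helps show why the parity blocks end up with the earliest data-block offset. The numbers below assume a small 3 data + 2 parity layout with 64 KB cells purely for illustration (the real schema comes from ErasureCodingSchemaManager), and the loop omits the block-size bound check that getStartOffsetsForInternalBlocks performs.

// Simplified trace of the start-offset computation for an assumed 3+2 layout.
public class StartOffsetsExample {
  public static void main(String[] args) {
    final int dataBlkNum = 3, parityBlkNum = 2;
    final int cellSize = 64 * 1024;
    long rangeStart = 4L * cellSize + 10;   // 10 bytes into cell_4 (blk_1, 2nd row)

    long[] startOffsets = new long[dataBlkNum + parityBlkNum];
    java.util.Arrays.fill(startOffsets, -1L);
    long earliestStart = Long.MAX_VALUE;

    int firstCellIdx = (int) (rangeStart / cellSize);          // cell_4
    for (int i = 0; i < dataBlkNum; i++) {
      int idx = firstCellIdx + i;                              // cells 4, 5, 6
      int idxInStripe = idx % dataBlkNum;                      // blk_1, blk_2, blk_0
      long offset = (long) (idx / dataBlkNum) * cellSize
          + (i == 0 ? rangeStart % cellSize : 0);              // 65546, 65536, 131072
      startOffsets[idxInStripe] = offset;
      earliestStart = Math.min(earliestStart, offset);
    }
    for (int i = dataBlkNum; i < dataBlkNum + parityBlkNum; i++) {
      startOffsets[i] = earliestStart;   // parity reads start at the earliest data offset
    }
    // Prints: [131072, 65546, 65536, 65536, 65536]
    System.out.println(java.util.Arrays.toString(startOffsets));
  }
}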
- private static AlignedStripe[] mergeRangesForInternalBlocks(ECSchema ecSchema,
-     VerticalRange[] ranges) {
+ /**
+  * Merge byte ranges on each internal block into a set of inclusive
+  * {@link AlignedStripe} instances.
+  */
+ private static AlignedStripe[] mergeRangesForInternalBlocks(
+     ECSchema ecSchema, VerticalRange[] ranges) {
    int dataBlkNum = ecSchema.getNumDataUnits();
    int parityBlkNum = ecSchema.getNumParityUnits();
    List<AlignedStripe> stripes = new ArrayList<>();
@ -461,12 +486,8 @@ private static AlignedStripe[] mergeRangesForInternalBlocks(ECSchema ecSchema,
  }

  private static void calcualteChunkPositionsInBuf(ECSchema ecSchema,
-     LocatedStripedBlock blockGroup, byte[] buf, int offsetInBuf,
-     int firstCellIdxInBG, int lastCellIdxInBG, int firstCellSize,
-     long firstCellOffsetInBlk, int lastCellSize, AlignedStripe[] stripes) {
-   int cellSize = ecSchema.getChunkSize();
-   int dataBlkNum = ecSchema.getNumDataUnits();
-   // Step 3: calculate each chunk's position in destination buffer
+     AlignedStripe[] stripes, StripingCell[] cells, byte[] buf,
+     int offsetInBuf) {
    /**
     * | <--------------- AlignedStripe --------------->|
     *
@ -484,20 +505,11 @@ private static void calcualteChunkPositionsInBuf(ECSchema ecSchema,
     *
     * Cell indexing convention defined in {@link StripingCell}
     */
+   int cellSize = ecSchema.getChunkSize();
    int done = 0;
-   for (int i = firstCellIdxInBG; i <= lastCellIdxInBG; i++) {
-     StripingCell cell = new StripingCell(ecSchema, i);
-     long cellStart = i == firstCellIdxInBG ?
-         firstCellOffsetInBlk : cell.idxInInternalBlk * cellSize;
-     int cellLen;
-     if (i == firstCellIdxInBG) {
-       cellLen = firstCellSize;
-     } else if (i == lastCellIdxInBG) {
-       cellLen = lastCellSize;
-     } else {
-       cellLen = cellSize;
-     }
-     long cellEnd = cellStart + cellLen - 1;
+   for (StripingCell cell : cells) {
+     long cellStart = cell.idxInInternalBlk * cellSize + cell.offset;
+     long cellEnd = cellStart + cell.size - 1;
      for (AlignedStripe s : stripes) {
        long stripeEnd = s.getOffsetInBlock() + s.getSpanInBlock() - 1;
        long overlapStart = Math.max(cellStart, s.getOffsetInBlock());
@ -514,10 +526,14 @@ private static void calcualteChunkPositionsInBuf(ECSchema ecSchema,
            add((int)(offsetInBuf + done + overlapStart - cellStart));
        s.chunks[cell.idxInStripe].lengthsInBuf.add(overLapLen);
      }
-     done += cellLen;
+     done += cell.size;
    }
  }
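The overlap arithmetic inside the double loop is easier to see on its own. The ranges below are made-up numbers, and the length calculation mirrors the Math.max/Math.min pattern implied above (the exact length line falls between the hunks shown here), so treat it as an assumed sketch rather than the method itself.

// Overlap of one cell's vertical range with one AlignedStripe's range.
public class ChunkOverlapExample {
  public static void main(String[] args) {
    long cellStart = 1000, cellEnd = 1999;        // cell covers [1000, 1999]
    long stripeStart = 1500, stripeEnd = 2499;    // stripe covers [1500, 2499]

    long overlapStart = Math.max(cellStart, stripeStart);      // 1500
    long overlapEnd = Math.min(cellEnd, stripeEnd);            // 1999
    int overLapLen = (int) (overlapEnd - overlapStart + 1);    // 500 bytes

    // 'done' bytes of the cell were already mapped to earlier stripes, so the
    // chunk's position in the destination buffer is
    // offsetInBuf + done + (overlapStart - cellStart).
    int offsetInBuf = 0, done = 0;
    int posInBuf = (int) (offsetInBuf + done + overlapStart - cellStart);  // 500
    System.out.println(overLapLen + " bytes at buffer offset " + posInBuf);
  }
}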
/**
* If a {@link StripingChunk} maps to a byte range beyond an internal block's
* size, the chunk should be treated as zero bytes in decoding.
*/
  private static void prepareAllZeroChunks(LocatedStripedBlock blockGroup,
      byte[] buf, AlignedStripe[] stripes, int cellSize, int dataBlkNum) {
    for (AlignedStripe s : stripes) {
@ -534,51 +550,13 @@ private static void prepareAllZeroChunks(LocatedStripedBlock blockGroup,
  }

  /**
-  * This class represents the portion of I/O associated with each block in the
-  * striped block group.
-  * TODO: consolidate ReadPortion with AlignedStripe
-  */
- public static class ReadPortion {
-   private long startOffsetInBlock = 0;
-   private int readLength = 0;
-   public final List<Integer> offsetsInBuf = new ArrayList<>();
-   public final List<Integer> lengths = new ArrayList<>();
-
-   public int[] getOffsets() {
-     int[] offsets = new int[offsetsInBuf.size()];
-     for (int i = 0; i < offsets.length; i++) {
-       offsets[i] = offsetsInBuf.get(i);
-     }
-     return offsets;
-   }
-
-   public int[] getLengths() {
-     int[] lens = new int[this.lengths.size()];
-     for (int i = 0; i < lens.length; i++) {
-       lens[i] = this.lengths.get(i);
-     }
-     return lens;
-   }
-
-   public long getStartOffsetInBlock() {
-     return startOffsetInBlock;
-   }
-
-   public int getReadLength() {
-     return readLength;
-   }
-
-   public void setStartOffsetInBlock(long startOffsetInBlock) {
-     this.startOffsetInBlock = startOffsetInBlock;
-   }
-
-   void addReadLength(int extraLength) {
-     this.readLength += extraLength;
-   }
- }
-
- /**
-  * The unit of encoding used in {@link DFSStripedOutputStream}
+  * Cell is the unit of encoding used in {@link DFSStripedOutputStream}. This
+  * size impacts how a logical offset in the file or block group translates
+  * to physical byte offset in a stored internal block. The StripingCell util
+  * class facilitates this calculation. Each StripingCell is inclusive with
+  * its start and end offsets -- e.g., the end logical offset of cell_0_0_0
+  * should be 1 byte lower than the start logical offset of cell_1_0_1.
+  *
   * | <------- Striped Block Group -------> |
   *    blk_0          blk_1          blk_2
   *      |              |              |
@ -586,43 +564,57 @@ void addReadLength(int extraLength) {
   * +----------+   +----------+   +----------+
   * |cell_0_0_0|   |cell_1_0_1|   |cell_2_0_2|
   * +----------+   +----------+   +----------+
-  * |cell_3_1_0|   |cell_4_1_1|   |cell_5_1_2|   <- {@link idxInBlkGroup} = 5
-  * +----------+   +----------+   +----------+      {@link idxInInternalBlk} = 1
-  *                                                 {@link idxInStripe} = 2
+  * |cell_3_1_0|   |cell_4_1_1|   |cell_5_1_2|   <- {@link #idxInBlkGroup} = 5
+  * +----------+   +----------+   +----------+      {@link #idxInInternalBlk} = 1
+  *                                                 {@link #idxInStripe} = 2
   * A StripingCell is a special instance of {@link StripingChunk} whose offset
   * and size align with the cell used when writing data.
   * TODO: consider parity cells
   */
- public static class StripingCell {
+ @VisibleForTesting
+ static class StripingCell {
    public final ECSchema schema;
    /** Logical order in a block group, used when doing I/O to a block group */
-   public final int idxInBlkGroup;
-   public final int idxInInternalBlk;
-   public final int idxInStripe;
+   final int idxInBlkGroup;
+   final int idxInInternalBlk;
+   final int idxInStripe;
+   /**
+    * When a logical byte range is mapped to a set of cells, it might
+    * partially overlap with the first and last cells. This field and the
+    * {@link #size} variable represent the start offset and size of the
+    * overlap.
+    */
+   int offset;
+   int size;

-   public StripingCell(ECSchema ecSchema, int idxInBlkGroup) {
+   StripingCell(ECSchema ecSchema, int idxInBlkGroup) {
      this.schema = ecSchema;
      this.idxInBlkGroup = idxInBlkGroup;
      this.idxInInternalBlk = idxInBlkGroup / ecSchema.getNumDataUnits();
      this.idxInStripe = idxInBlkGroup -
          this.idxInInternalBlk * ecSchema.getNumDataUnits();
+     this.offset = 0;
+     this.size = ecSchema.getChunkSize();
    }

-   public StripingCell(ECSchema ecSchema, int idxInInternalBlk,
+   StripingCell(ECSchema ecSchema, int idxInInternalBlk,
        int idxInStripe) {
      this.schema = ecSchema;
      this.idxInInternalBlk = idxInInternalBlk;
      this.idxInStripe = idxInStripe;
      this.idxInBlkGroup =
          idxInInternalBlk * ecSchema.getNumDataUnits() + idxInStripe;
+     this.offset = 0;
+     this.size = ecSchema.getChunkSize();
    }
  }
  /**
   * Given a requested byte range on a striped block group, an AlignedStripe
-  * represents a {@link VerticalRange} that is aligned with both the byte range
-  * and boundaries of all internal blocks. As illustrated in the diagram, any
-  * given byte range on a block group leads to 1~5 AlignedStripe's.
+  * represents an inclusive {@link VerticalRange} that is aligned with both
+  * the byte range and boundaries of all internal blocks. As illustrated in
+  * the diagram, any given byte range on a block group leads to 1~5
+  * AlignedStripe's.
   *
   * |<-------- Striped Block Group -------->|
   *   blk_0    blk_1    blk_2    blk_3    blk_4
@ -648,6 +640,7 @@ public StripingCell(ECSchema ecSchema, int idxInInternalBlk,
   *
   * The coverage of an AlignedStripe on an internal block is represented as a
   * {@link StripingChunk}.
+  *
   * To simplify the logic of reading a logical byte range from a block group,
   * a StripingChunk is either completely in the requested byte range or
   * completely outside the requested byte range.
@ -692,19 +685,19 @@ public String toString() {
  /**
   * A simple utility class representing an arbitrary vertical inclusive range
-  * starting at {@link offsetInBlock} and lasting for {@link length} bytes in
-  * an internal block. Note that VerticalRange doesn't necessarily align with
-  * {@link StripingCell}.
+  * starting at {@link #offsetInBlock} and lasting for {@link #spanInBlock}
+  * bytes in an internal block. Note that VerticalRange doesn't necessarily
+  * align with {@link StripingCell}.
   *
   * |<- Striped Block Group ->|
   *     blk_0
   *       |
   *       v
   *    +-----+
-  *    |~~~~~| <-- {@link offsetInBlock}
+  *    |~~~~~| <-- {@link #offsetInBlock}
   *    |     |  ^
   *    |     |  |
-  *    |     |  | {@link spanInBlock}
+  *    |     |  | {@link #spanInBlock}
   *    |     |  v
   *    |~~~~~| ---
   *    |     |
@ -743,9 +736,9 @@ public boolean include(long pos) {
   * +---------+  +---------+  |  +----+  +----+
   * <----------- data blocks ------------> | <--- parity --->
   *
-  * The class also carries {@link buf}, {@link offsetsInBuf}, and
-  * {@link lengthsInBuf} to define how read task for this chunk should deliver
-  * the returned data.
+  * The class also carries {@link #buf}, {@link #offsetsInBuf}, and
+  * {@link #lengthsInBuf} to define how read task for this chunk should
+  * deliver the returned data.
   */
  public static class StripingChunk {
    /** Chunk has been successfully fetched */


@ -1,143 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs;
import org.junit.Test;
import org.apache.hadoop.hdfs.util.StripedBlockUtil;
import static org.apache.hadoop.hdfs.util.StripedBlockUtil.ReadPortion;
import static org.junit.Assert.*;
public class TestPlanReadPortions {
// We only support this as num of data blocks. It might be good enough for now
// for the purpose, even not flexible yet for any number in a schema.
private final short GROUP_SIZE = 3;
private final int CELLSIZE = 128 * 1024;
private void testPlanReadPortions(int startInBlk, int length,
int bufferOffset, int[] readLengths, int[] offsetsInBlock,
int[][] bufferOffsets, int[][] bufferLengths) {
ReadPortion[] results = StripedBlockUtil.planReadPortions(GROUP_SIZE,
CELLSIZE, startInBlk, length, bufferOffset);
assertEquals(GROUP_SIZE, results.length);
for (int i = 0; i < GROUP_SIZE; i++) {
assertEquals(readLengths[i], results[i].getReadLength());
assertEquals(offsetsInBlock[i], results[i].getStartOffsetInBlock());
final int[] bOffsets = results[i].getOffsets();
assertArrayEquals(bufferOffsets[i], bOffsets);
final int[] bLengths = results[i].getLengths();
assertArrayEquals(bufferLengths[i], bLengths);
}
}
/**
* Test {@link StripedBlockUtil#planReadPortions}
*/
@Test
public void testPlanReadPortions() {
/**
* start block offset is 0, read cellSize - 10
*/
testPlanReadPortions(0, CELLSIZE - 10, 0,
new int[]{CELLSIZE - 10, 0, 0}, new int[]{0, 0, 0},
new int[][]{new int[]{0}, new int[]{}, new int[]{}},
new int[][]{new int[]{CELLSIZE - 10}, new int[]{}, new int[]{}});
/**
* start block offset is 0, read 3 * cellSize
*/
testPlanReadPortions(0, GROUP_SIZE * CELLSIZE, 0,
new int[]{CELLSIZE, CELLSIZE, CELLSIZE}, new int[]{0, 0, 0},
new int[][]{new int[]{0}, new int[]{CELLSIZE}, new int[]{CELLSIZE * 2}},
new int[][]{new int[]{CELLSIZE}, new int[]{CELLSIZE}, new int[]{CELLSIZE}});
/**
* start block offset is 0, read cellSize + 10
*/
testPlanReadPortions(0, CELLSIZE + 10, 0,
new int[]{CELLSIZE, 10, 0}, new int[]{0, 0, 0},
new int[][]{new int[]{0}, new int[]{CELLSIZE}, new int[]{}},
new int[][]{new int[]{CELLSIZE}, new int[]{10}, new int[]{}});
/**
* start block offset is 0, read 5 * cellSize + 10, buffer start offset is 100
*/
testPlanReadPortions(0, 5 * CELLSIZE + 10, 100,
new int[]{CELLSIZE * 2, CELLSIZE * 2, CELLSIZE + 10}, new int[]{0, 0, 0},
new int[][]{new int[]{100, 100 + CELLSIZE * GROUP_SIZE},
new int[]{100 + CELLSIZE, 100 + CELLSIZE * 4},
new int[]{100 + CELLSIZE * 2, 100 + CELLSIZE * 5}},
new int[][]{new int[]{CELLSIZE, CELLSIZE},
new int[]{CELLSIZE, CELLSIZE},
new int[]{CELLSIZE, 10}});
/**
* start block offset is 2, read 3 * cellSize
*/
testPlanReadPortions(2, GROUP_SIZE * CELLSIZE, 100,
new int[]{CELLSIZE, CELLSIZE, CELLSIZE},
new int[]{2, 0, 0},
new int[][]{new int[]{100, 100 + GROUP_SIZE * CELLSIZE - 2},
new int[]{100 + CELLSIZE - 2},
new int[]{100 + CELLSIZE * 2 - 2}},
new int[][]{new int[]{CELLSIZE - 2, 2},
new int[]{CELLSIZE},
new int[]{CELLSIZE}});
/**
* start block offset is 2, read 3 * cellSize + 10
*/
testPlanReadPortions(2, GROUP_SIZE * CELLSIZE + 10, 0,
new int[]{CELLSIZE + 10, CELLSIZE, CELLSIZE},
new int[]{2, 0, 0},
new int[][]{new int[]{0, GROUP_SIZE * CELLSIZE - 2},
new int[]{CELLSIZE - 2},
new int[]{CELLSIZE * 2 - 2}},
new int[][]{new int[]{CELLSIZE - 2, 12},
new int[]{CELLSIZE},
new int[]{CELLSIZE}});
/**
* start block offset is cellSize * 2 - 1, read 5 * cellSize + 10
*/
testPlanReadPortions(CELLSIZE * 2 - 1, 5 * CELLSIZE + 10, 0,
new int[]{CELLSIZE * 2, CELLSIZE + 10, CELLSIZE * 2},
new int[]{CELLSIZE, CELLSIZE - 1, 0},
new int[][]{new int[]{CELLSIZE + 1, 4 * CELLSIZE + 1},
new int[]{0, 2 * CELLSIZE + 1, 5 * CELLSIZE + 1},
new int[]{1, 3 * CELLSIZE + 1}},
new int[][]{new int[]{CELLSIZE, CELLSIZE},
new int[]{1, CELLSIZE, 9},
new int[]{CELLSIZE, CELLSIZE}});
/**
* start block offset is cellSize * 6 - 1, read 7 * cellSize + 10
*/
testPlanReadPortions(CELLSIZE * 6 - 1, 7 * CELLSIZE + 10, 0,
new int[]{CELLSIZE * 3, CELLSIZE * 2 + 9, CELLSIZE * 2 + 1},
new int[]{CELLSIZE * 2, CELLSIZE * 2, CELLSIZE * 2 - 1},
new int[][]{new int[]{1, 3 * CELLSIZE + 1, 6 * CELLSIZE + 1},
new int[]{CELLSIZE + 1, 4 * CELLSIZE + 1, 7 * CELLSIZE + 1},
new int[]{0, 2 * CELLSIZE + 1, 5 * CELLSIZE + 1}},
new int[][]{new int[]{CELLSIZE, CELLSIZE, CELLSIZE},
new int[]{CELLSIZE, CELLSIZE, 9},
new int[]{1, CELLSIZE, CELLSIZE}});
}
}


@ -189,13 +189,13 @@ private void assertFileBlocksRecovery(String fileName, int fileLen,
      deadDnIndices[i] = dnMap.get(dataDNs[i]);

      // Check the block replica file on deadDn before it dead.
-     blocks[i] = StripedBlockUtil.constructStripedBlock(
+     blocks[i] = StripedBlockUtil.constructInternalBlock(
          lastBlock.getBlock(), cellSize, dataBlkNum, indices[toDead[i]]);
      replicas[i] = cluster.getBlockFile(deadDnIndices[i], blocks[i]);
      metadatas[i] = cluster.getBlockMetadataFile(deadDnIndices[i], blocks[i]);
      // the block replica on the datanode should be the same as expected
      assertEquals(replicas[i].length(),
-         StripedBlockUtil.getStripedBlockLength(
+         StripedBlockUtil.getInternalBlockLength(
              lastBlock.getBlockSize(), cellSize, dataBlkNum, indices[toDead[i]]));
      assertTrue(metadatas[i].getName().
          endsWith(blocks[i].getGenerationStamp() + ".meta"));


@ -18,6 +18,7 @@
package org.apache.hadoop.hdfs.util;

+import com.google.common.base.Preconditions;
import org.apache.hadoop.fs.StorageType;
import org.apache.hadoop.hdfs.DFSTestUtil;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
@ -26,26 +27,107 @@
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedStripedBlock;
import org.apache.hadoop.hdfs.server.blockmanagement.BlockIdManager;
-import static org.apache.hadoop.hdfs.util.StripedBlockUtil.parseStripedBlockGroup;
-import static org.apache.hadoop.hdfs.util.StripedBlockUtil.getInternalBlockLength;
+import static org.apache.hadoop.hdfs.util.StripedBlockUtil.*;
+import org.apache.hadoop.hdfs.server.namenode.ErasureCodingSchemaManager;
+import org.apache.hadoop.io.erasurecode.ECSchema;
+import org.junit.Before;
import org.junit.Test;
+import java.util.Random;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
/**
* Need to cover the following combinations:
* 1. Block group size:
* 1.1 One byte
* 1.2 Smaller than cell
* 1.3 One full cell
* 1.4 x full cells, where x is smaller than number of data blocks
* 1.5 x full cells plus a partial cell
* 1.6 One full stripe
* 1.7 One full stripe plus a partial cell
* 1.8 One full stripe plus x full cells
* 1.9 One full stripe plus x full cells plus a partial cell
* 1.10 y full stripes, but smaller than full block group size
* 1.11 Full block group size
*
* 2. Byte range start
* 2.1 Zero
* 2.2 Within first cell
* 2.3 End of first cell
 * 2.4 Start of a middle* cell in the first stripe (* neither first nor last)
* 2.5 End of middle cell in the first stripe
* 2.6 Within a middle cell in the first stripe
* 2.7 Start of the last cell in the first stripe
* 2.8 Within the last cell in the first stripe
* 2.9 End of the last cell in the first stripe
* 2.10 Start of a middle stripe
* 2.11 Within a middle stripe
* 2.12 End of a middle stripe
* 2.13 Start of the last stripe
* 2.14 Within the last stripe
* 2.15 End of the last stripe (last byte)
*
* 3. Byte range length: same settings as block group size
*
* We should test in total 11 x 15 x 11 = 1815 combinations
* TODO: test parity block logic
*/
public class TestStripedBlockUtil {
  private final short DATA_BLK_NUM = HdfsConstants.NUM_DATA_BLOCKS;
  private final short PARITY_BLK_NUM = HdfsConstants.NUM_PARITY_BLOCKS;
- private final short BLK_GROUP_SIZE = DATA_BLK_NUM + PARITY_BLK_NUM;
+ private final short BLK_GROUP_WIDTH = DATA_BLK_NUM + PARITY_BLK_NUM;
  private final int CELLSIZE = HdfsConstants.BLOCK_STRIPED_CELL_SIZE;
private final int FULL_STRIPE_SIZE = DATA_BLK_NUM * CELLSIZE;
/** number of full stripes in a full block group */
private final int BLK_GROUP_STRIPE_NUM = 16;
private final ECSchema SCEHMA = ErasureCodingSchemaManager.
getSystemDefaultSchema();
private final Random random = new Random();
- private LocatedStripedBlock createDummyLocatedBlock() {
+ private int[] blockGroupSizes;
private int[] byteRangeStartOffsets;
private int[] byteRangeSizes;
@Before
public void setup(){
blockGroupSizes = new int[]{1, getDelta(CELLSIZE), CELLSIZE,
getDelta(DATA_BLK_NUM) * CELLSIZE,
getDelta(DATA_BLK_NUM) * CELLSIZE + getDelta(CELLSIZE),
FULL_STRIPE_SIZE, FULL_STRIPE_SIZE + getDelta(CELLSIZE),
FULL_STRIPE_SIZE + getDelta(DATA_BLK_NUM) * CELLSIZE,
FULL_STRIPE_SIZE + getDelta(DATA_BLK_NUM) * CELLSIZE + getDelta(CELLSIZE),
getDelta(BLK_GROUP_STRIPE_NUM) * FULL_STRIPE_SIZE,
BLK_GROUP_STRIPE_NUM * FULL_STRIPE_SIZE};
byteRangeStartOffsets = new int[] {0, getDelta(CELLSIZE), CELLSIZE - 1};
byteRangeSizes = new int[]{1, getDelta(CELLSIZE), CELLSIZE,
getDelta(DATA_BLK_NUM) * CELLSIZE,
getDelta(DATA_BLK_NUM) * CELLSIZE + getDelta(CELLSIZE),
FULL_STRIPE_SIZE, FULL_STRIPE_SIZE + getDelta(CELLSIZE),
FULL_STRIPE_SIZE + getDelta(DATA_BLK_NUM) * CELLSIZE,
FULL_STRIPE_SIZE + getDelta(DATA_BLK_NUM) * CELLSIZE + getDelta(CELLSIZE),
getDelta(BLK_GROUP_STRIPE_NUM) * FULL_STRIPE_SIZE,
BLK_GROUP_STRIPE_NUM * FULL_STRIPE_SIZE};
}
private int getDelta(int size) {
return 1 + random.nextInt(size - 2);
}
private byte hashIntToByte(int i) {
int BYTE_MASK = 0xff;
return (byte) (((i + 13) * 29) & BYTE_MASK);
}
private LocatedStripedBlock createDummyLocatedBlock(int bgSize) {
    final long blockGroupID = -1048576;
-   DatanodeInfo[] locs = new DatanodeInfo[BLK_GROUP_SIZE];
-   String[] storageIDs = new String[BLK_GROUP_SIZE];
-   StorageType[] storageTypes = new StorageType[BLK_GROUP_SIZE];
-   int[] indices = new int[BLK_GROUP_SIZE];
-   for (int i = 0; i < BLK_GROUP_SIZE; i++) {
+   DatanodeInfo[] locs = new DatanodeInfo[BLK_GROUP_WIDTH];
+   String[] storageIDs = new String[BLK_GROUP_WIDTH];
+   StorageType[] storageTypes = new StorageType[BLK_GROUP_WIDTH];
+   int[] indices = new int[BLK_GROUP_WIDTH];
+   for (int i = 0; i < BLK_GROUP_WIDTH; i++) {
      indices[i] = (i + 2) % DATA_BLK_NUM;
      // Location port always equal to logical index of a block,
      // for easier verification
@ -53,13 +135,40 @@ private LocatedStripedBlock createDummyLocatedBlock() {
      storageIDs[i] = locs[i].getDatanodeUuid();
      storageTypes[i] = StorageType.DISK;
    }
-   return new LocatedStripedBlock(new ExtendedBlock("pool", blockGroupID),
-       locs, storageIDs, storageTypes, indices, 0, false, null);
+   return new LocatedStripedBlock(new ExtendedBlock("pool", blockGroupID,
+       bgSize, 1001), locs, storageIDs, storageTypes, indices, 0, false,
+       null);
}
private byte[][] createInternalBlkBuffers(int bgSize) {
byte[][] bufs = new byte[DATA_BLK_NUM + PARITY_BLK_NUM][];
int[] pos = new int[DATA_BLK_NUM + PARITY_BLK_NUM];
for (int i = 0; i < DATA_BLK_NUM + PARITY_BLK_NUM; i++) {
int bufSize = (int) getInternalBlockLength(
bgSize, CELLSIZE, DATA_BLK_NUM, i);
bufs[i] = new byte[bufSize];
pos[i] = 0;
}
int done = 0;
while (done < bgSize) {
Preconditions.checkState(done % CELLSIZE == 0);
StripingCell cell = new StripingCell(SCEHMA, done / CELLSIZE);
int idxInStripe = cell.idxInStripe;
int size = Math.min(CELLSIZE, bgSize - done);
for (int i = 0; i < size; i++) {
bufs[idxInStripe][pos[idxInStripe] + i] = hashIntToByte(done + i);
}
done += size;
pos[idxInStripe] += size;
}
return bufs;
} }
  @Test
  public void testParseDummyStripedBlock() {
-   LocatedStripedBlock lsb = createDummyLocatedBlock();
+   LocatedStripedBlock lsb = createDummyLocatedBlock(
+       BLK_GROUP_STRIPE_NUM * FULL_STRIPE_SIZE);
    LocatedBlock[] blocks = parseStripedBlockGroup(
        lsb, CELLSIZE, DATA_BLK_NUM, PARITY_BLK_NUM);
    assertEquals(DATA_BLK_NUM + PARITY_BLK_NUM, blocks.length);
@ -68,14 +177,15 @@ public void testParseDummyStripedBlock() {
      assertEquals(i,
          BlockIdManager.getBlockIndex(blocks[i].getBlock().getLocalBlock()));
      assertEquals(i * CELLSIZE, blocks[i].getStartOffset());
+     /** TODO: properly define {@link LocatedBlock#offset} for internal blocks */
      assertEquals(1, blocks[i].getLocations().length);
      assertEquals(i, blocks[i].getLocations()[0].getIpcPort());
      assertEquals(i, blocks[i].getLocations()[0].getXferPort());
    }
  }
- private void verifyInternalBlocks (long numBytesInGroup, long[] expected) {
-   for (int i = 1; i < BLK_GROUP_SIZE; i++) {
+ private void verifyInternalBlocks (int numBytesInGroup, int[] expected) {
+   for (int i = 1; i < BLK_GROUP_WIDTH; i++) {
      assertEquals(expected[i],
          getInternalBlockLength(numBytesInGroup, CELLSIZE, DATA_BLK_NUM, i));
    }
@ -85,41 +195,85 @@ private void verifyInternalBlocks (long numBytesInGroup, long[] expected) {
  public void testGetInternalBlockLength () {
    // A small delta that is smaller than a cell
    final int delta = 10;
assert delta < CELLSIZE;
    // Block group is smaller than a cell
    verifyInternalBlocks(CELLSIZE - delta,
-       new long[] {CELLSIZE - delta, 0, 0, 0, 0, 0,
+       new int[] {CELLSIZE - delta, 0, 0, 0, 0, 0,
            CELLSIZE - delta, CELLSIZE - delta, CELLSIZE - delta});

    // Block group is exactly as large as a cell
    verifyInternalBlocks(CELLSIZE,
-       new long[] {CELLSIZE, 0, 0, 0, 0, 0,
+       new int[] {CELLSIZE, 0, 0, 0, 0, 0,
            CELLSIZE, CELLSIZE, CELLSIZE});

    // Block group is a little larger than a cell
    verifyInternalBlocks(CELLSIZE + delta,
-       new long[] {CELLSIZE, delta, 0, 0, 0, 0,
+       new int[] {CELLSIZE, delta, 0, 0, 0, 0,
            CELLSIZE, CELLSIZE, CELLSIZE});

    // Block group contains multiple stripes and ends at stripe boundary
    verifyInternalBlocks(2 * DATA_BLK_NUM * CELLSIZE,
-       new long[] {2 * CELLSIZE, 2 * CELLSIZE, 2 * CELLSIZE,
+       new int[] {2 * CELLSIZE, 2 * CELLSIZE, 2 * CELLSIZE,
            2 * CELLSIZE, 2 * CELLSIZE, 2 * CELLSIZE,
            2 * CELLSIZE, 2 * CELLSIZE, 2 * CELLSIZE});

    // Block group contains multiple stripes and ends at cell boundary
    // (not ending at stripe boundary)
    verifyInternalBlocks(2 * DATA_BLK_NUM * CELLSIZE + CELLSIZE,
-       new long[] {3 * CELLSIZE, 2 * CELLSIZE, 2 * CELLSIZE,
+       new int[] {3 * CELLSIZE, 2 * CELLSIZE, 2 * CELLSIZE,
            2 * CELLSIZE, 2 * CELLSIZE, 2 * CELLSIZE,
            3 * CELLSIZE, 3 * CELLSIZE, 3 * CELLSIZE});

    // Block group contains multiple stripes and doesn't end at cell boundary
    verifyInternalBlocks(2 * DATA_BLK_NUM * CELLSIZE - delta,
-       new long[] {2 * CELLSIZE, 2 * CELLSIZE, 2 * CELLSIZE,
+       new int[] {2 * CELLSIZE, 2 * CELLSIZE, 2 * CELLSIZE,
            2 * CELLSIZE, 2 * CELLSIZE, 2 * CELLSIZE - delta,
            2 * CELLSIZE, 2 * CELLSIZE, 2 * CELLSIZE});
  }
/**
* Test dividing a byte range into aligned stripes and verify the aligned
* ranges can be translated back to the byte range.
*/
@Test
public void testDivideByteRangeIntoStripes() {
byte[] assembled = new byte[BLK_GROUP_STRIPE_NUM * FULL_STRIPE_SIZE];
for (int bgSize : blockGroupSizes) {
LocatedStripedBlock blockGroup = createDummyLocatedBlock(bgSize);
byte[][] internalBlkBufs = createInternalBlkBuffers(bgSize);
for (int brStart : byteRangeStartOffsets) {
for (int brSize : byteRangeSizes) {
if (brStart + brSize > bgSize) {
continue;
}
AlignedStripe[] stripes = divideByteRangeIntoStripes(SCEHMA,
blockGroup, brStart, brStart + brSize - 1, assembled, 0);
for (AlignedStripe stripe : stripes) {
for (int i = 0; i < DATA_BLK_NUM; i++) {
StripingChunk chunk = stripe.chunks[i];
if (chunk == null || chunk.state != StripingChunk.REQUESTED) {
continue;
}
int done = 0;
for (int j = 0; j < chunk.getLengths().length; j++) {
System.arraycopy(internalBlkBufs[i],
(int) stripe.getOffsetInBlock() + done, assembled,
chunk.getOffsets()[j], chunk.getLengths()[j]);
done += chunk.getLengths()[j];
}
}
}
for (int i = 0; i < brSize; i++) {
if (hashIntToByte(brStart + i) != assembled[i]) {
System.out.println("Oops");
}
assertEquals("Byte at " + (brStart + i) + " should be the same",
hashIntToByte(brStart + i), assembled[i]);
}
}
}
}
}
}