HDFS-8760. Erasure Coding: reuse BlockReader when reading the same block in pread. Contributed by Jing Zhao.

2015-07-21 15:47:26 -07:00 · 2015-07-21 15:47:26 -07:00 · f8f7a923b7
commit f8f7a923b7
parent 29495cb8f6
7 changed files with 172 additions and 301 deletions
--- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES-HDFS-EC-7285.txt
+++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES-HDFS-EC-7285.txt
@ -364,3 +364,6 @@
    to be consistent with trunk. (zhz)
    HDFS-8433. Erasure coding: set blockToken in LocatedStripedBlock.(waltersu4549)
    HDFS-8760. Erasure Coding: reuse BlockReader when reading the same block in pread.
    (jing9)
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSInputStream.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSInputStream.java
@ -44,7 +44,6 @@
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicBoolean;
 import com.google.common.base.Preconditions;
 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.fs.ByteBufferReadable;
@ -1139,18 +1138,6 @@ public ByteBuffer call() throws Exception {
    };
  }
  /**
   * Used when reading contiguous blocks
   */
  private void actualGetFromOneDataNode(final DNAddrPair datanode,
      LocatedBlock block, final long start, final long end, byte[] buf,
      int offset, Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap)
      throws IOException {
    final int length = (int) (end - start + 1);
    actualGetFromOneDataNode(datanode, block, start, end, buf,
        new int[]{offset}, new int[]{length}, corruptedBlockMap);
  }
  /**
   * Read data from one DataNode.
   * @param datanode the datanode from which to read data
@ -1158,23 +1145,18 @@ private void actualGetFromOneDataNode(final DNAddrPair datanode,
   * @param startInBlk the startInBlk offset of the block
   * @param endInBlk the endInBlk offset of the block
   * @param buf the given byte array into which the data is read
-   * @param offsets the data may be read into multiple segments of the buf
+   * @param offset the offset in buf
   *                (when reading a striped block). this array indicates the
   *                offset of each buf segment.
   * @param lengths the length of each buf segment
   * @param corruptedBlockMap map recording list of datanodes with corrupted
   *                          block replica
   */
-  void actualGetFromOneDataNode(final DNAddrPair datanode,
+  void actualGetFromOneDataNode(final DNAddrPair datanode, LocatedBlock block,
-      LocatedBlock block, final long startInBlk, final long endInBlk,
+      final long startInBlk, final long endInBlk, byte[] buf, int offset,
      byte[] buf, int[] offsets, int[] lengths,
      Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap)
      throws IOException {
    DFSClientFaultInjector.get().startFetchFromDatanode();
    int refetchToken = 1; // only need to get a new access token once
    int refetchEncryptionKey = 1; // only need to get a new encryption key once
    final int len = (int) (endInBlk - startInBlk + 1);
    checkReadPortions(offsets, lengths, len);
    while (true) {
      // cached block locations may have been updated by chooseDataNode()
@ -1186,13 +1168,11 @@ void actualGetFromOneDataNode(final DNAddrPair datanode,
        DFSClientFaultInjector.get().fetchFromDatanodeException();
        reader = getBlockReader(block, startInBlk, len, datanode.addr,
            datanode.storageType, datanode.info);
-        for (int i = 0; i < offsets.length; i++) {
+        int nread = reader.readAll(buf, offset, len);
          int nread = reader.readAll(buf, offsets[i], lengths[i]);
        updateReadStatistics(readStatistics, nread, reader);
-          if (nread != lengths[i]) {
+        if (nread != len) {
          throw new IOException("truncated return from reader.read(): " +
-                "excpected " + lengths[i] + ", got " + nread);
+              "excpected " + len + ", got " + nread);
          }
        }
        DFSClientFaultInjector.get().readFromDatanodeDelay();
        return;
@ -1247,24 +1227,6 @@ protected LocatedBlock refreshLocatedBlock(LocatedBlock block)
    return getBlockAt(block.getStartOffset());
  }
  /**
   * This method verifies that the read portions are valid and do not overlap
   * with each other.
   */
  private void checkReadPortions(int[] offsets, int[] lengths, int totalLen) {
    Preconditions.checkArgument(offsets.length == lengths.length && offsets.length > 0);
    int sum = 0;
    for (int i = 0; i < lengths.length; i++) {
      if (i > 0) {
        int gap = offsets[i] - offsets[i - 1];
        // make sure read portions do not overlap with each other
        Preconditions.checkArgument(gap >= lengths[i - 1]);
      }
      sum += lengths[i];
    }
    Preconditions.checkArgument(sum == totalLen);
  }
  /**
   * Like {@link #fetchBlockByteRange}except we start up a second, parallel,
   * 'hedged' read if the first read is taking longer than configured amount of
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSStripedInputStream.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSStripedInputStream.java
@ -31,14 +31,6 @@
 import org.apache.hadoop.hdfs.util.StripedBlockUtil;
 import org.apache.hadoop.io.ByteBufferPool;
 import static org.apache.hadoop.hdfs.util.StripedBlockUtil.convertIndex4Decode;
 import static org.apache.hadoop.hdfs.util.StripedBlockUtil.divideByteRangeIntoStripes;
 import static org.apache.hadoop.hdfs.util.StripedBlockUtil.finalizeDecodeInputs;
 import static org.apache.hadoop.hdfs.util.StripedBlockUtil.decodeAndFillBuffer;
 import static org.apache.hadoop.hdfs.util.StripedBlockUtil.getNextCompletedStripedRead;
 import static org.apache.hadoop.hdfs.util.StripedBlockUtil.getStartOffsetsForInternalBlocks;
 import static org.apache.hadoop.hdfs.util.StripedBlockUtil.initDecodeInputs;
 import static org.apache.hadoop.hdfs.util.StripedBlockUtil.parseStripedBlockGroup;
 import static org.apache.hadoop.hdfs.util.StripedBlockUtil.AlignedStripe;
 import static org.apache.hadoop.hdfs.util.StripedBlockUtil.StripingChunk;
 import static org.apache.hadoop.hdfs.util.StripedBlockUtil.StripingChunkReadResult;
@ -48,10 +40,6 @@
 import org.apache.hadoop.io.erasurecode.ECSchema;
 import org.apache.hadoop.io.erasurecode.rawcoder.RawErasureDecoder;
 import org.apache.hadoop.net.NetUtils;
 import org.apache.htrace.Span;
 import org.apache.htrace.Trace;
 import org.apache.htrace.TraceScope;
 import java.io.EOFException;
 import java.io.IOException;
@ -166,7 +154,6 @@ void skip() {
   */
  private StripeRange curStripeRange;
  private final CompletionService<Void> readingService;
  private ReaderRetryPolicy retry;
  DFSStripedInputStream(DFSClient dfsClient, String src,
      boolean verifyChecksum, ECSchema schema, int cellSize,
@ -198,18 +185,6 @@ private void resetCurStripeBuffer() {
    curStripeRange = new StripeRange(0, 0);
  }
  @Override
  public synchronized int read(final ByteBuffer buf) throws IOException {
    ReaderStrategy byteBufferReader = new ByteBufferStrategy(buf);
    TraceScope scope =
        dfsClient.getPathTraceScope("DFSInputStream#byteBufferRead", src);
    try {
      return readWithStrategy(byteBufferReader, 0, buf.remaining());
    } finally {
      scope.close();
    }
  }
  /**
   * When seeking into a new block group, create blockReader for each internal
   * block in the group.
@ -229,33 +204,6 @@ private synchronized void blockSeekTo(long target) throws IOException {
    this.blockEnd = targetBlockGroup.getStartOffset() +
        targetBlockGroup.getBlockSize() - 1;
    currentLocatedBlock = targetBlockGroup;
    final long offsetIntoBlockGroup = getOffsetInBlockGroup();
    LocatedBlock[] targetBlocks = parseStripedBlockGroup(
        targetBlockGroup, cellSize, dataBlkNum, parityBlkNum);
    // The purpose is to get start offset into each block.
    long[] offsetsForInternalBlocks = getStartOffsetsForInternalBlocks(schema,
        cellSize, targetBlockGroup, offsetIntoBlockGroup);
    Preconditions.checkState(offsetsForInternalBlocks.length ==
        dataBlkNum + parityBlkNum);
    long minOffset = offsetsForInternalBlocks[dataBlkNum];
    retry = new ReaderRetryPolicy();
    for (int i = 0; i < dataBlkNum; i++) {
      LocatedBlock targetBlock = targetBlocks[i];
      if (targetBlock != null) {
        DNAddrPair dnInfo = getBestNodeDNAddrPair(targetBlock, null);
        if (dnInfo != null) {
          BlockReader reader = getBlockReaderWithRetry(targetBlock,
              minOffset, targetBlock.getBlockSize() - minOffset,
              dnInfo.addr, dnInfo.storageType, dnInfo.info, target, retry);
          if (reader != null) {
            blockReaders[i] = new BlockReaderInfo(reader, targetBlock,
                dnInfo.info, minOffset);
          }
        }
      }
    }
  }
  /**
@ -308,16 +256,16 @@ protected void closeCurrentBlockReaders() {
      return;
    }
    for (int i = 0; i < groupSize; i++) {
-      closeReader(i);
+      closeReader(blockReaders[i]);
      blockReaders[i] = null;
    }
    blockEnd = -1;
  }
-  private void closeReader(int index) {
+  private void closeReader(BlockReaderInfo readerInfo) {
-    if (blockReaders[index] != null) {
+    if (readerInfo != null) {
-      IOUtils.cleanup(DFSClient.LOG, blockReaders[index].reader);
+      IOUtils.cleanup(DFSClient.LOG, readerInfo.reader);
-      blockReaders[index].skip();
+      readerInfo.skip();
    }
  }
@ -358,17 +306,17 @@ private void readOneStripe(
    for (AlignedStripe stripe : stripes) {
      // Parse group to get chosen DN location
      StripeReader sreader = new StatefulStripeReader(readingService, stripe,
-          blks, corruptedBlockMap);
+          blks, blockReaders, corruptedBlockMap);
      sreader.readStripe();
    }
    curStripeBuf.position(stripeBufOffset);
    curStripeBuf.limit(stripeLimit);
  }
-  private Callable<Void> readCell(final BlockReader reader,
+  private Callable<Void> readCells(final BlockReader reader,
      final DatanodeInfo datanode, final long currentReaderOffset,
-      final long targetReaderOffset, final ByteBufferStrategy strategy,
+      final long targetReaderOffset, final ByteBufferStrategy[] strategies,
-      final int targetLength, final ExtendedBlock currentBlock,
+      final ExtendedBlock currentBlock,
      final Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap) {
    return new Callable<Void>() {
      @Override
@ -386,27 +334,31 @@ public Void call() throws Exception {
              skipped == targetReaderOffset - currentReaderOffset);
        }
        int result = 0;
-        while (result < targetLength) {
+        for (ByteBufferStrategy strategy : strategies) {
-          int ret = readToBuffer(reader, datanode, strategy, currentBlock,
+          result += readToBuffer(reader, datanode, strategy, currentBlock,
              corruptedBlockMap);
          if (ret < 0) {
            throw new IOException("Unexpected EOS from the reader");
        }
          result += ret;
        }
        updateReadStatistics(readStatistics, targetLength, reader);
        return null;
      }
    };
  }
  private int readToBuffer(BlockReader blockReader,
-      DatanodeInfo currentNode, ByteBufferStrategy readerStrategy,
+      DatanodeInfo currentNode, ByteBufferStrategy strategy,
      ExtendedBlock currentBlock,
      Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap)
      throws IOException {
    final int targetLength = strategy.buf.remaining();
    int length = 0;
    try {
-      return readerStrategy.doRead(blockReader, 0, 0);
+      while (length < targetLength) {
        int ret = strategy.doRead(blockReader, 0, 0);
        if (ret < 0) {
          throw new IOException("Unexpected EOS from the reader");
        }
        length += ret;
      }
      return length;
    } catch (ChecksumException ce) {
      DFSClient.LOG.warn("Found Checksum error for "
          + currentBlock + " from " + currentNode
@ -572,61 +524,49 @@ protected void fetchBlockByteRange(LocatedBlock block, long start,
    // Refresh the striped block group
    LocatedStripedBlock blockGroup = getBlockGroupAt(block.getStartOffset());
-    AlignedStripe[] stripes = divideByteRangeIntoStripes(schema, cellSize,
+    AlignedStripe[] stripes = StripedBlockUtil.divideByteRangeIntoStripes(
-        blockGroup, start, end, buf, offset);
+        schema, cellSize, blockGroup, start, end, buf, offset);
    CompletionService<Void> readService = new ExecutorCompletionService<>(
        dfsClient.getStripedReadsThreadPool());
    final LocatedBlock[] blks = StripedBlockUtil.parseStripedBlockGroup(
        blockGroup, cellSize, dataBlkNum, parityBlkNum);
    final BlockReaderInfo[] preaderInfos = new BlockReaderInfo[groupSize];
    try {
      for (AlignedStripe stripe : stripes) {
        // Parse group to get chosen DN location
        StripeReader preader = new PositionStripeReader(readService, stripe,
-          blks, corruptedBlockMap);
+            blks, preaderInfos, corruptedBlockMap);
        preader.readStripe();
      }
  }
  private Callable<Void> getFromOneDataNode(final DNAddrPair datanode,
      final LocatedBlock block, final long start, final long end,
      final byte[] buf, final int[] offsets, final int[] lengths,
      final Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap,
      final int hedgedReadId) {
    final Span parentSpan = Trace.currentSpan();
    return new Callable<Void>() {
      @Override
      public Void call() throws Exception {
        TraceScope scope =
            Trace.startSpan("Parallel reading " + hedgedReadId, parentSpan);
        try {
          actualGetFromOneDataNode(datanode, block, start,
              end, buf, offsets, lengths, corruptedBlockMap);
    } finally {
-          scope.close();
+      for (BlockReaderInfo preaderInfo : preaderInfos) {
        closeReader(preaderInfo);
      }
        return null;
    }
    };
  }
  /**
   * The reader for reading a complete {@link AlignedStripe}. Note that an
   * {@link AlignedStripe} may cross multiple stripes with cellSize width.
   */
  private abstract class StripeReader {
    final Map<Future<Void>, Integer> futures = new HashMap<>();
    final AlignedStripe alignedStripe;
    final CompletionService<Void> service;
    final LocatedBlock[] targetBlocks;
    final Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap;
    final BlockReaderInfo[] readerInfos;
    StripeReader(CompletionService<Void> service, AlignedStripe alignedStripe,
-        LocatedBlock[] targetBlocks,
+        LocatedBlock[] targetBlocks, BlockReaderInfo[] readerInfos,
        Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap) {
      this.service = service;
      this.alignedStripe = alignedStripe;
      this.targetBlocks = targetBlocks;
      this.readerInfos = readerInfos;
      this.corruptedBlockMap = corruptedBlockMap;
    }
    abstract boolean readChunk(final CompletionService<Void> service,
        final LocatedBlock block, int chunkIndex);
    /** prepare all the data chunks */
    abstract void prepareDecodeInputs();
@ -635,7 +575,12 @@ abstract boolean readChunk(final CompletionService<Void> service,
    abstract void decode();
-    abstract void updateState4SuccessRead(StripingChunkReadResult result);
+    void updateState4SuccessRead(StripingChunkReadResult result) {
      Preconditions.checkArgument(
          result.state == StripingChunkReadResult.SUCCESSFUL);
      readerInfos[result.index].setOffset(alignedStripe.getOffsetInBlock()
          + alignedStripe.getSpanInBlock());
    }
    private void checkMissingBlocks() throws IOException {
      if (alignedStripe.missingChunksNum > parityBlkNum) {
@ -654,7 +599,7 @@ private void readDataForDecoding() throws IOException {
      for (int i = 0; i < dataBlkNum; i++) {
        Preconditions.checkNotNull(alignedStripe.chunks[i]);
        if (alignedStripe.chunks[i].state == StripingChunk.REQUESTED) {
-          if (!readChunk(service, targetBlocks[i], i)) {
+          if (!readChunk(targetBlocks[i], i)) {
            alignedStripe.missingChunksNum++;
          }
        }
@ -666,7 +611,7 @@ void readParityChunks(int num) throws IOException {
      for (int i = dataBlkNum, j = 0; i < dataBlkNum + parityBlkNum && j < num;
           i++) {
        if (alignedStripe.chunks[i] == null) {
-          if (prepareParityChunk(i) && readChunk(service, targetBlocks[i], i)) {
+          if (prepareParityChunk(i) && readChunk(targetBlocks[i], i)) {
            j++;
          } else {
            alignedStripe.missingChunksNum++;
@ -676,12 +621,75 @@ void readParityChunks(int num) throws IOException {
      checkMissingBlocks();
    }
    boolean createBlockReader(LocatedBlock block, int chunkIndex)
        throws IOException {
      DNAddrPair dnInfo = getBestNodeDNAddrPair(block, null);
      if (dnInfo != null) {
        BlockReader reader = getBlockReaderWithRetry(block,
            alignedStripe.getOffsetInBlock(),
            block.getBlockSize() - alignedStripe.getOffsetInBlock(),
            dnInfo.addr, dnInfo.storageType, dnInfo.info,
            block.getStartOffset(), new ReaderRetryPolicy());
        if (reader != null) {
          readerInfos[chunkIndex] = new BlockReaderInfo(reader, block,
              dnInfo.info, alignedStripe.getOffsetInBlock());
          return true;
        }
      }
      return false;
    }
    private ByteBufferStrategy[] getReadStrategies(StripingChunk chunk) {
      if (chunk.byteBuffer != null) {
        ByteBufferStrategy strategy = new ByteBufferStrategy(chunk.byteBuffer);
        return new ByteBufferStrategy[]{strategy};
      } else {
        ByteBufferStrategy[] strategies =
            new ByteBufferStrategy[chunk.byteArray.getOffsets().length];
        for (int i = 0; i < strategies.length; i++) {
          ByteBuffer buffer = ByteBuffer.wrap(chunk.byteArray.buf(),
              chunk.byteArray.getOffsets()[i], chunk.byteArray.getLengths()[i]);
          strategies[i] = new ByteBufferStrategy(buffer);
        }
        return strategies;
      }
    }
    boolean readChunk(final LocatedBlock block, int chunkIndex)
        throws IOException {
      final StripingChunk chunk = alignedStripe.chunks[chunkIndex];
      if (block == null) {
        chunk.state = StripingChunk.MISSING;
        return false;
      }
      if (readerInfos[chunkIndex] == null) {
        if (!createBlockReader(block, chunkIndex)) {
          chunk.state = StripingChunk.MISSING;
          return false;
        }
      } else if (readerInfos[chunkIndex].shouldSkip) {
        chunk.state = StripingChunk.MISSING;
        return false;
      }
      chunk.state = StripingChunk.PENDING;
      Callable<Void> readCallable = readCells(readerInfos[chunkIndex].reader,
          readerInfos[chunkIndex].datanode,
          readerInfos[chunkIndex].blockReaderOffset,
          alignedStripe.getOffsetInBlock(), getReadStrategies(chunk),
          block.getBlock(), corruptedBlockMap);
      Future<Void> request = service.submit(readCallable);
      futures.put(request, chunkIndex);
      return true;
    }
    /** read the whole stripe. do decoding if necessary */
    void readStripe() throws IOException {
      for (int i = 0; i < dataBlkNum; i++) {
        if (alignedStripe.chunks[i] != null &&
            alignedStripe.chunks[i].state != StripingChunk.ALLZERO) {
-          if (!readChunk(service, targetBlocks[i], i)) {
+          if (!readChunk(targetBlocks[i], i)) {
            alignedStripe.missingChunksNum++;
          }
        }
@ -700,8 +708,8 @@ void readStripe() throws IOException {
      // first read failure
      while (!futures.isEmpty()) {
        try {
-          StripingChunkReadResult r = getNextCompletedStripedRead(service,
+          StripingChunkReadResult r = StripedBlockUtil
-              futures, 0);
+              .getNextCompletedStripedRead(service, futures, 0);
          if (DFSClient.LOG.isDebugEnabled()) {
            DFSClient.LOG.debug("Read task returned: " + r + ", for stripe "
                + alignedStripe);
@ -721,7 +729,7 @@ void readStripe() throws IOException {
          } else {
            returnedChunk.state = StripingChunk.MISSING;
            // close the corresponding reader
-            closeReader(r.index);
+            closeReader(readerInfos[r.index]);
            final int missing = alignedStripe.missingChunksNum;
            alignedStripe.missingChunksNum++;
@ -750,48 +758,17 @@ class PositionStripeReader extends StripeReader {
    PositionStripeReader(CompletionService<Void> service,
        AlignedStripe alignedStripe, LocatedBlock[] targetBlocks,
        BlockReaderInfo[] readerInfos,
        Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap) {
-      super(service, alignedStripe, targetBlocks, corruptedBlockMap);
+      super(service, alignedStripe, targetBlocks, readerInfos,
          corruptedBlockMap);
    }
    @Override
    boolean readChunk(final CompletionService<Void> service,
        final LocatedBlock block, int chunkIndex) {
      final StripingChunk chunk = alignedStripe.chunks[chunkIndex];
      if (block == null) {
        chunk.state = StripingChunk.MISSING;
        return false;
      }
      DatanodeInfo loc = block.getLocations()[0];
      StorageType type = block.getStorageTypes()[0];
      DNAddrPair dnAddr = new DNAddrPair(loc, NetUtils.createSocketAddr(
          loc.getXferAddr(dfsClient.getConf().isConnectToDnViaHostname())),
          type);
      chunk.state = StripingChunk.PENDING;
      Callable<Void> readCallable = getFromOneDataNode(dnAddr,
          block, alignedStripe.getOffsetInBlock(),
          alignedStripe.getOffsetInBlock() + alignedStripe.getSpanInBlock() - 1,
          chunk.byteArray.buf(), chunk.byteArray.getOffsets(),
          chunk.byteArray.getLengths(), corruptedBlockMap, chunkIndex);
      Future<Void> getFromDNRequest = service.submit(readCallable);
      if (DFSClient.LOG.isDebugEnabled()) {
        DFSClient.LOG.debug("Submitting striped read request for " + chunkIndex
            + ". Info of the block: " + block + ", offset in block is "
            + alignedStripe.getOffsetInBlock() + ", end is "
            + (alignedStripe.getOffsetInBlock()
            + alignedStripe.getSpanInBlock() - 1));
      }
      futures.put(getFromDNRequest, chunkIndex);
      return true;
    }
    @Override
    void updateState4SuccessRead(StripingChunkReadResult r) {}
    @Override
    void prepareDecodeInputs() {
      if (decodeInputs == null) {
-        decodeInputs = initDecodeInputs(alignedStripe, dataBlkNum, parityBlkNum);
+        decodeInputs = StripedBlockUtil.initDecodeInputs(alignedStripe,
            dataBlkNum, parityBlkNum);
      }
    }
@ -799,8 +776,8 @@ void prepareDecodeInputs() {
    boolean prepareParityChunk(int index) {
      Preconditions.checkState(index >= dataBlkNum &&
          alignedStripe.chunks[index] == null);
-      final int decodeIndex = convertIndex4Decode(index, dataBlkNum,
+      final int decodeIndex = StripedBlockUtil.convertIndex4Decode(index,
-          parityBlkNum);
+          dataBlkNum, parityBlkNum);
      alignedStripe.chunks[index] = new StripingChunk(decodeInputs[decodeIndex]);
      alignedStripe.chunks[index].addByteArraySlice(0,
          (int) alignedStripe.getSpanInBlock());
@ -809,10 +786,10 @@ boolean prepareParityChunk(int index) {
    @Override
    void decode() {
-      finalizeDecodeInputs(decodeInputs, dataBlkNum, parityBlkNum,
+      StripedBlockUtil.finalizeDecodeInputs(decodeInputs, dataBlkNum,
-          alignedStripe);
+          parityBlkNum, alignedStripe);
-      decodeAndFillBuffer(decodeInputs, alignedStripe, dataBlkNum,
+      StripedBlockUtil.decodeAndFillBuffer(decodeInputs, alignedStripe,
-          parityBlkNum, decoder);
+          dataBlkNum, parityBlkNum, decoder);
    }
  }
@ -821,36 +798,10 @@ class StatefulStripeReader extends StripeReader {
    StatefulStripeReader(CompletionService<Void> service,
        AlignedStripe alignedStripe, LocatedBlock[] targetBlocks,
        BlockReaderInfo[] readerInfos,
        Map<ExtendedBlock, Set<DatanodeInfo>> corruptedBlockMap) {
-      super(service, alignedStripe, targetBlocks, corruptedBlockMap);
+      super(service, alignedStripe, targetBlocks, readerInfos,
-    }
+          corruptedBlockMap);
    @Override
    boolean readChunk(final CompletionService<Void> service,
        final LocatedBlock block, int chunkIndex) {
      final StripingChunk chunk = alignedStripe.chunks[chunkIndex];
      final BlockReaderInfo readerInfo = blockReaders[chunkIndex];
      if (readerInfo == null || block == null || readerInfo.shouldSkip) {
        chunk.state = StripingChunk.MISSING;
        return false;
      }
      chunk.state = StripingChunk.PENDING;
      ByteBufferStrategy strategy = new ByteBufferStrategy(chunk.byteBuffer);
      Callable<Void> readCallable = readCell(readerInfo.reader,
          readerInfo.datanode, readerInfo.blockReaderOffset,
          alignedStripe.getOffsetInBlock(), strategy,
          chunk.byteBuffer.remaining(), block.getBlock(), corruptedBlockMap);
      Future<Void> request = readingService.submit(readCallable);
      futures.put(request, chunkIndex);
      return true;
    }
    @Override
    void updateState4SuccessRead(StripingChunkReadResult result) {
      Preconditions.checkArgument(
          result.state == StripingChunkReadResult.SUCCESSFUL);
      blockReaders[result.index].setOffset(alignedStripe.getOffsetInBlock()
          + alignedStripe.getSpanInBlock());
    }
    @Override
@ -864,8 +815,8 @@ void prepareDecodeInputs() {
          int pos = (int) (range.offsetInBlock % cellSize + cellSize * i);
          cur.position(pos);
          cur.limit((int) (pos + range.spanInBlock));
-          final int decodeIndex = convertIndex4Decode(i, dataBlkNum,
+          final int decodeIndex = StripedBlockUtil.convertIndex4Decode(i,
-              parityBlkNum);
+              dataBlkNum, parityBlkNum);
          decodeInputs[decodeIndex] = cur.slice();
          if (alignedStripe.chunks[i] == null) {
            alignedStripe.chunks[i] = new StripingChunk(
@ -884,45 +835,20 @@ boolean prepareParityChunk(int index) throws IOException {
        // we have failed the block reader before
        return false;
      }
-      final int decodeIndex = convertIndex4Decode(index, dataBlkNum,
+      final int decodeIndex = StripedBlockUtil.convertIndex4Decode(index,
-          parityBlkNum);
+          dataBlkNum, parityBlkNum);
      decodeInputs[decodeIndex] = ByteBuffer.allocateDirect(
          (int) alignedStripe.range.spanInBlock);
      alignedStripe.chunks[index] = new StripingChunk(decodeInputs[decodeIndex]);
      if (blockReaders[index] == null && !prepareParityBlockReader(index)) {
        alignedStripe.chunks[index] = new StripingChunk(StripingChunk.MISSING);
        return false;
      }
      return true;
    }
    private boolean prepareParityBlockReader(int i) throws IOException {
      // prepare the block reader for the parity chunk
      LocatedBlock targetBlock = targetBlocks[i];
      if (targetBlock != null) {
        final long offsetInBlock = alignedStripe.getOffsetInBlock();
        DNAddrPair dnInfo = getBestNodeDNAddrPair(targetBlock, null);
        if (dnInfo != null) {
          BlockReader reader = getBlockReaderWithRetry(targetBlock,
              offsetInBlock, targetBlock.getBlockSize() - offsetInBlock,
              dnInfo.addr, dnInfo.storageType, dnInfo.info,
              DFSStripedInputStream.this.getPos(), retry);
          if (reader != null) {
            blockReaders[i] = new BlockReaderInfo(reader, targetBlock,
                dnInfo.info, offsetInBlock);
            return true;
          }
        }
      }
      return false;
    }
    @Override
    void decode() {
      // TODO no copy for data chunks. this depends on HADOOP-12047
      final int span = (int) alignedStripe.getSpanInBlock();
      for (int i = 0; i < alignedStripe.chunks.length; i++) {
-        final int decodeIndex = convertIndex4Decode(i,
+        final int decodeIndex = StripedBlockUtil.convertIndex4Decode(i,
            dataBlkNum, parityBlkNum);
        if (alignedStripe.chunks[i] != null &&
            alignedStripe.chunks[i].state == StripingChunk.ALLZERO) {
@ -941,7 +867,7 @@ void decode() {
      for (int i = 0; i < alignedStripe.chunks.length; i++) {
        if (alignedStripe.chunks[i] != null &&
            alignedStripe.chunks[i].state == StripingChunk.MISSING) {
-          decodeIndices[pos++] = convertIndex4Decode(i,
+          decodeIndices[pos++] = StripedBlockUtil.convertIndex4Decode(i,
              dataBlkNum, parityBlkNum);
        }
      }
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/mover/Mover.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/mover/Mover.java
@ -27,7 +27,6 @@
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.fs.BlockStoragePolicySpi;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.StorageType;
 import org.apache.hadoop.hdfs.HdfsConfiguration;
@ -53,8 +52,6 @@
 import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;
 import static org.apache.hadoop.hdfs.util.StripedBlockUtil.getInternalBlockLength;
 import java.io.BufferedReader;
 import java.io.FileInputStream;
 import java.io.IOException;
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/util/StripedBlockUtil.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/util/StripedBlockUtil.java
@ -476,41 +476,6 @@ private static StripingCell[] getStripingCellsOfByteRange(ECSchema ecSchema,
    return cells;
  }
  /**
   * Given a logical start offset in a block group, calculate the physical
   * start offset into each stored internal block.
   */
  public static long[] getStartOffsetsForInternalBlocks(ECSchema ecSchema,
      int cellSize, LocatedStripedBlock blockGroup, long rangeStartInBlockGroup) {
    Preconditions.checkArgument(
        rangeStartInBlockGroup < blockGroup.getBlockSize());
    int dataBlkNum = ecSchema.getNumDataUnits();
    int parityBlkNum = ecSchema.getNumParityUnits();
    long[] startOffsets = new long[dataBlkNum + parityBlkNum];
    Arrays.fill(startOffsets, -1L);
    int firstCellIdxInBG = (int) (rangeStartInBlockGroup / cellSize);
    StripingCell firstCell = new StripingCell(ecSchema, cellSize,
        firstCellIdxInBG, (int) (rangeStartInBlockGroup % cellSize));
    startOffsets[firstCell.idxInStripe] =
        firstCell.idxInInternalBlk * cellSize + firstCell.offset;
    long earliestStart = startOffsets[firstCell.idxInStripe];
    for (int i = 1; i < dataBlkNum; i++) {
      int idx = firstCellIdxInBG + i;
      if (idx * (long) cellSize >= blockGroup.getBlockSize()) {
        break;
      }
      StripingCell cell = new StripingCell(ecSchema, cellSize, idx, 0);
      startOffsets[cell.idxInStripe] = cell.idxInInternalBlk * (long) cellSize;
      if (startOffsets[cell.idxInStripe] < earliestStart) {
        earliestStart = startOffsets[cell.idxInStripe];
      }
    }
    for (int i = dataBlkNum; i < dataBlkNum + parityBlkNum; i++) {
      startOffsets[i] = earliestStart;
    }
    return startOffsets;
  }
  /**
   * Given a logical byte range, mapped to each {@link StripingCell}, calculate
   * the physical byte range (inclusive) on each stored internal block.
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/StripedFileTestUtil.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/StripedFileTestUtil.java
@ -79,10 +79,19 @@ static void verifyPread(FileSystem fs, Path srcPath,  int fileLength,
      for (int startOffset : startOffsets) {
        startOffset = Math.max(0, Math.min(startOffset, fileLength - 1));
        int remaining = fileLength - startOffset;
-        in.readFully(startOffset, buf, 0, remaining);
+        int offset = startOffset;
-        for (int i = 0; i < remaining; i++) {
+        final byte[] result = new byte[remaining];
-          Assert.assertEquals("Byte at " + (startOffset + i) + " should be the " +
+        while (remaining > 0) {
-              "same", expected[startOffset + i], buf[i]);
+          int target = Math.min(remaining, buf.length);
          in.readFully(offset, buf, 0, target);
          System.arraycopy(buf, 0, result, offset - startOffset, target);
          remaining -= target;
          offset += target;
        }
        for (int i = 0; i < fileLength - startOffset; i++) {
          Assert.assertEquals("Byte at " + (startOffset + i) + " is different, "
                  + "the startOffset is " + startOffset,
              expected[startOffset + i], result[i]);
        }
      }
    }
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestWriteReadStripedFile.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestWriteReadStripedFile.java
@ -19,13 +19,16 @@
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.commons.logging.impl.Log4JLogger;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.BlockLocation;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hdfs.server.blockmanagement.BlockPlacementPolicy;
 import org.apache.hadoop.hdfs.server.datanode.DataNode;
 import org.apache.hadoop.hdfs.web.WebHdfsConstants;
 import org.apache.hadoop.hdfs.web.WebHdfsTestUtil;
 import org.apache.log4j.Level;
 import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
@ -45,6 +48,11 @@ public class TestWriteReadStripedFile {
  private static FileSystem fs;
  private static Configuration conf = new HdfsConfiguration();
  static {
    ((Log4JLogger)LogFactory.getLog(BlockPlacementPolicy.class))
        .getLogger().setLevel(Level.ALL);
  }
  @Before
  public void setup() throws IOException {
    conf.setLong(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, blockSize);
@ -232,7 +240,8 @@ public void testWriteReadUsingWebHdfs() throws Exception {
    byte[] smallBuf = new byte[1024];
    byte[] largeBuf = new byte[fileLength + 100];
-    StripedFileTestUtil.verifyPread(fs, srcPath, fileLength, expected, largeBuf);
+    // TODO: HDFS-8797
    //StripedFileTestUtil.verifyPread(fs, srcPath, fileLength, expected, largeBuf);
    StripedFileTestUtil.verifyStatefulRead(fs, srcPath, fileLength, expected, largeBuf);
    StripedFileTestUtil.verifySeek(fs, srcPath, fileLength);