diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockPoolSliceStorage.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockPoolSliceStorage.java index fd90ae921a..fd89611ff2 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockPoolSliceStorage.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockPoolSliceStorage.java @@ -20,6 +20,7 @@ import java.io.File; import java.io.IOException; +import java.net.URI; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; @@ -741,7 +742,20 @@ public boolean isTrashAllowed(File blockFile) { * * @return the trash directory for a given block file that is being deleted. */ - public String getTrashDirectory(File blockFile) { + public String getTrashDirectory(ReplicaInfo info) { + + URI blockURI = info.getBlockURI(); + try{ + File blockFile = new File(blockURI); + return getTrashDirectory(blockFile); + } catch (IllegalArgumentException e) { + LOG.warn("Failed to get block file for replica " + info, e); + } + + return null; + } + + private String getTrashDirectory(File blockFile) { if (isTrashAllowed(blockFile)) { Matcher matcher = BLOCK_POOL_CURRENT_PATH_PATTERN.matcher(blockFile.getParent()); String trashDirectory = matcher.replaceFirst("$1$2" + TRASH_ROOT_DIR + "$4"); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockReceiver.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockReceiver.java index 522d577fd7..39419c1e01 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockReceiver.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockReceiver.java @@ -121,7 +121,7 @@ class BlockReceiver implements Closeable { /** the block to receive */ private final ExtendedBlock block; /** the replica to write */ - private ReplicaInPipelineInterface replicaInfo; + private ReplicaInPipeline replicaInfo; /** pipeline stage */ private final BlockConstructionStage stage; private final boolean isTransfer; diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockSender.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockSender.java index 9d9502bc2c..c3ba2eb397 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockSender.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockSender.java @@ -38,6 +38,7 @@ import org.apache.hadoop.hdfs.HdfsConfiguration; import org.apache.hadoop.hdfs.protocol.ExtendedBlock; import org.apache.hadoop.hdfs.protocol.datatransfer.PacketHeader; +import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.ReplicaState; import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsVolumeReference; import org.apache.hadoop.hdfs.server.datanode.fsdataset.LengthInputStream; import org.apache.hadoop.hdfs.util.DataTransferThrottler; @@ -248,8 +249,8 @@ class BlockSender implements java.io.Closeable { } // if there is a write in progress ChunkChecksum chunkChecksum = null; - if (replica instanceof ReplicaBeingWritten) { - final ReplicaBeingWritten rbw = (ReplicaBeingWritten)replica; + if (replica.getState() == ReplicaState.RBW) { + final ReplicaInPipeline rbw = 
(ReplicaInPipeline) replica; waitForMinLength(rbw, startOffset + length); chunkChecksum = rbw.getLastChecksumAndDataLen(); } @@ -473,7 +474,7 @@ private static Replica getReplica(ExtendedBlock block, DataNode datanode) * @param len minimum length to reach * @throws IOException on failing to reach the len in given wait time */ - private static void waitForMinLength(ReplicaBeingWritten rbw, long len) + private static void waitForMinLength(ReplicaInPipeline rbw, long len) throws IOException { // Wait for 3 seconds for rbw replica to reach the minimum length for (int i = 0; i < 30 && rbw.getBytesOnDisk() < len; i++) { diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java index 0025040123..09ecac1dce 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java @@ -3474,4 +3474,4 @@ public String getDiskBalancerSetting(String key) throws IOException { void setBlockScanner(BlockScanner blockScanner) { this.blockScanner = blockScanner; } -} \ No newline at end of file +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNodeFaultInjector.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNodeFaultInjector.java index 931c1241f5..aa06aa1d0d 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNodeFaultInjector.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNodeFaultInjector.java @@ -56,6 +56,6 @@ public void noRegistration() throws IOException { } public void failMirrorConnection() throws IOException { } - public void failPipeline(ReplicaInPipelineInterface replicaInfo, + public void failPipeline(ReplicaInPipeline replicaInfo, String mirrorAddr) throws IOException { } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataStorage.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataStorage.java index 0e6b339db0..7e620c2cea 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataStorage.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataStorage.java @@ -204,9 +204,9 @@ public void clearRollingUpgradeMarker(String bpid) throws IOException { * @return trash directory if rolling upgrade is in progress, null * otherwise. 
*/ - public String getTrashDirectoryForBlockFile(String bpid, File blockFile) { + public String getTrashDirectoryForReplica(String bpid, ReplicaInfo info) { if (trashEnabledBpids.contains(bpid)) { - return getBPStorage(bpid).getTrashDirectory(blockFile); + return getBPStorage(bpid).getTrashDirectory(info); } return null; } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DirectoryScanner.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DirectoryScanner.java index f9ebab906c..c50bfafd2d 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DirectoryScanner.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DirectoryScanner.java @@ -597,14 +597,14 @@ private void scan() { diffs.put(bpid, diffRecord); statsRecord.totalBlocks = blockpoolReport.length; - List bl = dataset.getFinalizedBlocks(bpid); - FinalizedReplica[] memReport = bl.toArray(new FinalizedReplica[bl.size()]); + List bl = dataset.getFinalizedBlocks(bpid); + ReplicaInfo[] memReport = bl.toArray(new ReplicaInfo[bl.size()]); Arrays.sort(memReport); // Sort based on blockId int d = 0; // index for blockpoolReport int m = 0; // index for memReprot while (m < memReport.length && d < blockpoolReport.length) { - FinalizedReplica memBlock = memReport[m]; + ReplicaInfo memBlock = memReport[m]; ScanInfo info = blockpoolReport[d]; if (info.getBlockId() < memBlock.getBlockId()) { if (!dataset.isDeletingBlock(bpid, info.getBlockId())) { @@ -633,7 +633,7 @@ private void scan() { // or block file length is different than expected statsRecord.mismatchBlocks++; addDifference(diffRecord, statsRecord, info); - } else if (info.getBlockFile().compareTo(memBlock.getBlockFile()) != 0) { + } else if (memBlock.compareWith(info) != 0) { // volumeMap record and on-disk files don't match. statsRecord.duplicateBlocks++; addDifference(diffRecord, statsRecord, info); @@ -652,7 +652,7 @@ private void scan() { } } while (m < memReport.length) { - FinalizedReplica current = memReport[m++]; + ReplicaInfo current = memReport[m++]; addDifference(diffRecord, statsRecord, current.getBlockId(), current.getVolume()); } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/FinalizedReplica.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/FinalizedReplica.java index 8daeb51e0d..81a4ab4a4a 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/FinalizedReplica.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/FinalizedReplica.java @@ -22,11 +22,12 @@ import org.apache.hadoop.hdfs.protocol.Block; import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.ReplicaState; import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsVolumeSpi; +import org.apache.hadoop.hdfs.server.protocol.ReplicaRecoveryInfo; /** * This class describes a replica that has been finalized. 
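
For the trash-path change above, callers no longer resolve a java.io.File themselves; they hand the ReplicaInfo to DataStorage, and BlockPoolSliceStorage derives the file from the replica's block URI. A minimal sketch of the new call path (storage, bpid and replicaInfo are illustrative placeholders, not code from this patch):

    String trashDir = storage.getTrashDirectoryForReplica(bpid, replicaInfo);
    if (trashDir != null) {
      // non-null only while a rolling upgrade is in progress:
      // the block file is moved into trash instead of being deleted outright
    }
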
*/ -public class FinalizedReplica extends ReplicaInfo { +public class FinalizedReplica extends LocalReplica { /** * Constructor @@ -88,4 +89,28 @@ public int hashCode() { public String toString() { return super.toString(); } + + @Override + public ReplicaInfo getOriginalReplica() { + throw new UnsupportedOperationException("Replica of type " + getState() + + " does not support getOriginalReplica"); + } + + @Override + public long getRecoveryID() { + throw new UnsupportedOperationException("Replica of type " + getState() + + " does not support getRecoveryID"); + } + + @Override + public void setRecoveryID(long recoveryId) { + throw new UnsupportedOperationException("Replica of type " + getState() + + " does not support setRecoveryID"); + } + + @Override + public ReplicaRecoveryInfo createInfo() { + throw new UnsupportedOperationException("Replica of type " + getState() + + " does not support createInfo"); + } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/LocalReplica.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/LocalReplica.java new file mode 100644 index 0000000000..cbfc9a5357 --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/LocalReplica.java @@ -0,0 +1,479 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
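
The UnsupportedOperationException stubs above exist because the recovery accessors are now declared on the common replica type while remaining invalid for finalized replicas. A hedged sketch of the guard a caller would use (illustrative only; replicas under recovery are the ones that actually implement these methods):

    if (replica.getState() == ReplicaState.RUR) {
      long recoveryId = replica.getRecoveryID(); // safe: replica under recovery
    }
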
+ */ +package org.apache.hadoop.hdfs.server.datanode; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.io.RandomAccessFile; +import java.net.URI; +import java.util.HashMap; +import java.util.Map; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileUtil; +import org.apache.hadoop.fs.HardLink; +import org.apache.hadoop.fs.LocalFileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.permission.FsPermission; +import org.apache.hadoop.hdfs.protocol.Block; +import org.apache.hadoop.hdfs.server.common.Storage; +import org.apache.hadoop.hdfs.server.datanode.DirectoryScanner.ScanInfo; +import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsVolumeSpi; +import org.apache.hadoop.hdfs.server.datanode.fsdataset.LengthInputStream; +import org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.FsDatasetUtil; +import org.apache.hadoop.io.IOUtils; +import org.apache.hadoop.io.nativeio.NativeIO; +import org.apache.hadoop.util.DataChecksum; + +import com.google.common.annotations.VisibleForTesting; + +/** + * This class is used for all replicas which are on local storage media + * and hence, are backed by files. + */ +abstract public class LocalReplica extends ReplicaInfo { + + /** + * Base directory containing numerically-identified sub directories and + * possibly blocks. + */ + private File baseDir; + + /** + * Whether or not this replica's parent directory includes subdirs, in which + * case we can generate them based on the replica's block ID + */ + private boolean hasSubdirs; + + private static final Map internedBaseDirs = new HashMap(); + + static final Log LOG = LogFactory.getLog(LocalReplica.class); + private final static boolean IS_NATIVE_IO_AVAIL; + static { + IS_NATIVE_IO_AVAIL = NativeIO.isAvailable(); + if (Path.WINDOWS && !IS_NATIVE_IO_AVAIL) { + LOG.warn("Data node cannot fully support concurrent reading" + + " and writing without native code extensions on Windows."); + } + } + + /** + * Constructor + * @param block a block + * @param vol volume where replica is located + * @param dir directory path where block and meta files are located + */ + LocalReplica(Block block, FsVolumeSpi vol, File dir) { + this(block.getBlockId(), block.getNumBytes(), + block.getGenerationStamp(), vol, dir); + } + + /** + * Constructor + * @param blockId block id + * @param len replica length + * @param genStamp replica generation stamp + * @param vol volume where replica is located + * @param dir directory path where block and meta files are located + */ + LocalReplica(long blockId, long len, long genStamp, + FsVolumeSpi vol, File dir) { + super(vol, blockId, len, genStamp); + setDirInternal(dir); + } + + /** + * Copy constructor. + * @param from the source replica + */ + LocalReplica(LocalReplica from) { + this(from, from.getVolume(), from.getDir()); + } + + /** + * Get the full path of this replica's data file. + * @return the full path of this replica's data file + */ + @VisibleForTesting + public File getBlockFile() { + return new File(getDir(), getBlockName()); + } + + /** + * Get the full path of this replica's meta file. 
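
As a concrete illustration of the directory handling above (the path is invented for the example; "subdir" is the DataStorage.BLOCK_SUBDIR_PREFIX used in the finalized tree):

    // a finalized replica stored two subdir levels deep
    LocalReplica.ReplicaDirInfo info = LocalReplica.parseBaseDir(
        new File("/data/dfs/dn/current/BP-1/current/finalized/subdir0/subdir13"));
    // info.baseDirPath -> ".../current/BP-1/current/finalized"
    // info.hasSubidrs  -> true, so getDir() re-derives the subdir path from the block id
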
+ * @return the full path of this replica's meta file + */ + @VisibleForTesting + public File getMetaFile() { + return new File(getDir(), + DatanodeUtil.getMetaName(getBlockName(), getGenerationStamp())); + } + + /** + * Return the parent directory path where this replica is located. + * @return the parent directory path where this replica is located + */ + protected File getDir() { + return hasSubdirs ? DatanodeUtil.idToBlockDir(baseDir, + getBlockId()) : baseDir; + } + + /** + * Set the parent directory where this replica is located. + * @param dir the parent directory where the replica is located + */ + private void setDirInternal(File dir) { + if (dir == null) { + baseDir = null; + return; + } + + ReplicaDirInfo dirInfo = parseBaseDir(dir); + this.hasSubdirs = dirInfo.hasSubidrs; + + synchronized (internedBaseDirs) { + if (!internedBaseDirs.containsKey(dirInfo.baseDirPath)) { + // Create a new String path of this file and make a brand new File object + // to guarantee we drop the reference to the underlying char[] storage. + File baseDir = new File(dirInfo.baseDirPath); + internedBaseDirs.put(dirInfo.baseDirPath, baseDir); + } + this.baseDir = internedBaseDirs.get(dirInfo.baseDirPath); + } + } + + @VisibleForTesting + public static class ReplicaDirInfo { + public String baseDirPath; + public boolean hasSubidrs; + + public ReplicaDirInfo (String baseDirPath, boolean hasSubidrs) { + this.baseDirPath = baseDirPath; + this.hasSubidrs = hasSubidrs; + } + } + + @VisibleForTesting + public static ReplicaDirInfo parseBaseDir(File dir) { + + File currentDir = dir; + boolean hasSubdirs = false; + while (currentDir.getName().startsWith(DataStorage.BLOCK_SUBDIR_PREFIX)) { + hasSubdirs = true; + currentDir = currentDir.getParentFile(); + } + + return new ReplicaDirInfo(currentDir.getAbsolutePath(), hasSubdirs); + } + + /** + * Copy specified file into a temporary file. Then rename the + * temporary file to the original name. This will cause any + * hardlinks to the original file to be removed. The temporary + * files are created in the same directory. The temporary files will + * be recovered (especially on Windows) on datanode restart. + */ + private void breakHardlinks(File file, Block b) throws IOException { + File tmpFile = DatanodeUtil.createTmpFile(b, DatanodeUtil.getUnlinkTmpFile(file)); + try (FileInputStream in = new FileInputStream(file)) { + try (FileOutputStream out = new FileOutputStream(tmpFile)){ + IOUtils.copyBytes(in, out, 16 * 1024); + } + if (file.length() != tmpFile.length()) { + throw new IOException("Copy of file " + file + " size " + file.length()+ + " into file " + tmpFile + + " resulted in a size of " + tmpFile.length()); + } + FileUtil.replaceFile(tmpFile, file); + } catch (IOException e) { + boolean done = tmpFile.delete(); + if (!done) { + DataNode.LOG.info("detachFile failed to delete temporary file " + + tmpFile); + } + throw e; + } + } + + /** + * This function "breaks hardlinks" to the current replica file. + * + * When doing a DataNode upgrade, we create a bunch of hardlinks to each block + * file. This cleverly ensures that both the old and the new storage + * directories can contain the same block file, without using additional space + * for the data. + * + * However, when we want to append to the replica file, we need to "break" the + * hardlink to ensure that the old snapshot continues to contain the old data + * length. 
If we failed to do that, we could roll back to the previous/ + * directory during a downgrade, and find that the block contents were longer + * than they were at the time of upgrade. + * + * @return true only if data was copied. + * @throws IOException + */ + public boolean breakHardLinksIfNeeded() throws IOException { + File file = getBlockFile(); + if (file == null || getVolume() == null) { + throw new IOException("detachBlock:Block not found. " + this); + } + File meta = getMetaFile(); + + int linkCount = HardLink.getLinkCount(file); + if (linkCount > 1) { + DataNode.LOG.info("Breaking hardlink for " + linkCount + "x-linked " + + "block " + this); + breakHardlinks(file, this); + } + if (HardLink.getLinkCount(meta) > 1) { + breakHardlinks(meta, this); + } + return true; + } + + @Override + public URI getBlockURI() { + return getBlockFile().toURI(); + } + + @Override + public InputStream getDataInputStream(long seekOffset) throws IOException { + + File blockFile = getBlockFile(); + if (IS_NATIVE_IO_AVAIL) { + return NativeIO.getShareDeleteFileInputStream(blockFile, seekOffset); + } else { + try { + return FsDatasetUtil.openAndSeek(blockFile, seekOffset); + } catch (FileNotFoundException fnfe) { + throw new IOException("Block " + this + " is not valid. " + + "Expected block file at " + blockFile + " does not exist."); + } + } + } + + @Override + public OutputStream getDataOutputStream(boolean append) throws IOException { + return new FileOutputStream(getBlockFile(), append); + } + + @Override + public boolean blockDataExists() { + return getBlockFile().exists(); + } + + @Override + public boolean deleteBlockData() { + return getBlockFile().delete(); + } + + @Override + public long getBlockDataLength() { + return getBlockFile().length(); + } + + @Override + public URI getMetadataURI() { + return getMetaFile().toURI(); + } + + @Override + public LengthInputStream getMetadataInputStream(long offset) + throws IOException { + File meta = getMetaFile(); + return new LengthInputStream( + FsDatasetUtil.openAndSeek(meta, offset), meta.length()); + } + + @Override + public OutputStream getMetadataOutputStream(boolean append) + throws IOException { + return new FileOutputStream(getMetaFile(), append); + } + + @Override + public boolean metadataExists() { + return getMetaFile().exists(); + } + + @Override + public boolean deleteMetadata() { + return getMetaFile().delete(); + } + + @Override + public long getMetadataLength() { + return getMetaFile().length(); + } + + @Override + public boolean renameMeta(URI destURI) throws IOException { + return renameFile(getMetaFile(), new File(destURI)); + } + + @Override + public boolean renameData(URI destURI) throws IOException { + return renameFile(getBlockFile(), new File(destURI)); + } + + private boolean renameFile(File srcfile, File destfile) throws IOException { + try { + NativeIO.renameTo(srcfile, destfile); + return true; + } catch (IOException e) { + throw new IOException("Failed to move block file for " + this + + " from " + srcfile + " to " + destfile.getAbsolutePath(), e); + } + } + + @Override + public void updateWithReplica(StorageLocation replicaLocation) { + // for local replicas, the replica location is assumed to be a file. 
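
breakHardLinksIfNeeded() above is aimed at the append path described in its javadoc: before a finalized, upgrade-hardlinked replica is reopened for writing, the block and meta files are copied so the snapshot under previous/ keeps its original contents. A simplified sketch of a call site (not the exact dataset code):

    LocalReplica replica = (LocalReplica) replicaInfo;  // file-backed replica
    replica.breakHardLinksIfNeeded();  // copies data only when the link count is > 1
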
+ File diskFile = replicaLocation.getFile(); + if (null == diskFile) { + setDirInternal(null); + } else { + setDirInternal(diskFile.getParentFile()); + } + } + + @Override + public boolean getPinning(LocalFileSystem localFS) throws IOException { + FileStatus fss = + localFS.getFileStatus(new Path(getBlockFile().getAbsolutePath())); + return fss.getPermission().getStickyBit(); + } + + @Override + public void setPinning(LocalFileSystem localFS) throws IOException { + File f = getBlockFile(); + Path p = new Path(f.getAbsolutePath()); + + FsPermission oldPermission = localFS.getFileStatus( + new Path(f.getAbsolutePath())).getPermission(); + //sticky bit is used for pinning purpose + FsPermission permission = new FsPermission(oldPermission.getUserAction(), + oldPermission.getGroupAction(), oldPermission.getOtherAction(), true); + localFS.setPermission(p, permission); + } + + @Override + public void bumpReplicaGS(long newGS) throws IOException { + long oldGS = getGenerationStamp(); + File oldmeta = getMetaFile(); + setGenerationStamp(newGS); + File newmeta = getMetaFile(); + + // rename meta file to new GS + if (LOG.isDebugEnabled()) { + LOG.debug("Renaming " + oldmeta + " to " + newmeta); + } + try { + // calling renameMeta on the ReplicaInfo doesn't work here + NativeIO.renameTo(oldmeta, newmeta); + } catch (IOException e) { + setGenerationStamp(oldGS); // restore old GS + throw new IOException("Block " + this + " reopen failed. " + + " Unable to move meta file " + oldmeta + + " to " + newmeta, e); + } + } + + @Override + public void truncateBlock(long newLength) throws IOException { + truncateBlock(getBlockFile(), getMetaFile(), getNumBytes(), newLength); + } + + @Override + public int compareWith(ScanInfo info) { + return info.getBlockFile().compareTo(getBlockFile()); + } + + static public void truncateBlock(File blockFile, File metaFile, + long oldlen, long newlen) throws IOException { + LOG.info("truncateBlock: blockFile=" + blockFile + + ", metaFile=" + metaFile + + ", oldlen=" + oldlen + + ", newlen=" + newlen); + + if (newlen == oldlen) { + return; + } + if (newlen > oldlen) { + throw new IOException("Cannot truncate block to from oldlen (=" + oldlen + + ") to newlen (=" + newlen + ")"); + } + + DataChecksum dcs = BlockMetadataHeader.readHeader(metaFile).getChecksum(); + int checksumsize = dcs.getChecksumSize(); + int bpc = dcs.getBytesPerChecksum(); + long n = (newlen - 1)/bpc + 1; + long newmetalen = BlockMetadataHeader.getHeaderSize() + n*checksumsize; + long lastchunkoffset = (n - 1)*bpc; + int lastchunksize = (int)(newlen - lastchunkoffset); + byte[] b = new byte[Math.max(lastchunksize, checksumsize)]; + + RandomAccessFile blockRAF = new RandomAccessFile(blockFile, "rw"); + try { + //truncate blockFile + blockRAF.setLength(newlen); + + //read last chunk + blockRAF.seek(lastchunkoffset); + blockRAF.readFully(b, 0, lastchunksize); + } finally { + blockRAF.close(); + } + + //compute checksum + dcs.update(b, 0, lastchunksize); + dcs.writeValue(b, 0, false); + + //update metaFile + RandomAccessFile metaRAF = new RandomAccessFile(metaFile, "rw"); + try { + metaRAF.setLength(newmetalen); + metaRAF.seek(newmetalen - checksumsize); + metaRAF.write(b, 0, checksumsize); + } finally { + metaRAF.close(); + } + } + + @Override + public void copyMetadata(URI destination) throws IOException { + //for local replicas, we assume the destination URI is file + Storage.nativeCopyFileUnbuffered(getMetaFile(), + new File(destination), true); + } + + @Override + public void copyBlockdata(URI destination) 
throws IOException { + //for local replicas, we assume the destination URI is file + Storage.nativeCopyFileUnbuffered(getBlockFile(), + new File(destination), true); + } + +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/LocalReplicaInPipeline.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/LocalReplicaInPipeline.java new file mode 100644 index 0000000000..bc7bc6dde3 --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/LocalReplicaInPipeline.java @@ -0,0 +1,417 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdfs.server.datanode; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.io.RandomAccessFile; +import java.util.concurrent.atomic.AtomicReference; + +import org.apache.hadoop.hdfs.protocol.Block; +import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.ReplicaState; +import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsVolumeSpi; +import org.apache.hadoop.hdfs.server.datanode.fsdataset.ReplicaOutputStreams; +import org.apache.hadoop.hdfs.server.protocol.ReplicaRecoveryInfo; +import org.apache.hadoop.io.IOUtils; +import org.apache.hadoop.io.nativeio.NativeIO; +import org.apache.hadoop.util.DataChecksum; +import org.apache.hadoop.util.StringUtils; + +/** + * This class defines a replica in a pipeline, which + * includes a persistent replica being written to by a dfs client or + * a temporary replica being replicated by a source datanode or + * being copied for the balancing purpose. + * + * The base class implements a temporary replica + */ +public class LocalReplicaInPipeline extends LocalReplica + implements ReplicaInPipeline { + private long bytesAcked; + private long bytesOnDisk; + private byte[] lastChecksum; + private AtomicReference writer = new AtomicReference(); + + /** + * Bytes reserved for this replica on the containing volume. + * Based off difference between the estimated maximum block length and + * the bytes already written to this block. + */ + private long bytesReserved; + private final long originalBytesReserved; + + /** + * Constructor for a zero length replica. + * @param blockId block id + * @param genStamp replica generation stamp + * @param vol volume where replica is located + * @param dir directory path where block and meta files are located + * @param bytesToReserve disk space to reserve for this replica, based on + * the estimated maximum block length. 
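
A worked example of the truncateBlock arithmetic shown earlier in LocalReplica, using illustrative numbers (512-byte chunks, 4-byte CRCs):

    // newlen = 1300, bpc = 512, checksumsize = 4
    // n               = (1300 - 1) / 512 + 1 = 3     chunks remain
    // lastchunkoffset = (3 - 1) * 512        = 1024
    // lastchunksize   = 1300 - 1024          = 276   bytes re-read and re-checksummed
    // newmetalen      = header size + 3 * 4          bytes kept in the meta file
    // the block file is truncated to 1300 bytes and only the final CRC is rewritten
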
+ */ + public LocalReplicaInPipeline(long blockId, long genStamp, + FsVolumeSpi vol, File dir, long bytesToReserve) { + this(blockId, 0L, genStamp, vol, dir, Thread.currentThread(), + bytesToReserve); + } + + /** + * Constructor + * @param block a block + * @param vol volume where replica is located + * @param dir directory path where block and meta files are located + * @param writer a thread that is writing to this replica + */ + LocalReplicaInPipeline(Block block, + FsVolumeSpi vol, File dir, Thread writer) { + this(block.getBlockId(), block.getNumBytes(), block.getGenerationStamp(), + vol, dir, writer, 0L); + } + + /** + * Constructor + * @param blockId block id + * @param len replica length + * @param genStamp replica generation stamp + * @param vol volume where replica is located + * @param dir directory path where block and meta files are located + * @param writer a thread that is writing to this replica + * @param bytesToReserve disk space to reserve for this replica, based on + * the estimated maximum block length. + */ + LocalReplicaInPipeline(long blockId, long len, long genStamp, + FsVolumeSpi vol, File dir, Thread writer, long bytesToReserve) { + super(blockId, len, genStamp, vol, dir); + this.bytesAcked = len; + this.bytesOnDisk = len; + this.writer.set(writer); + this.bytesReserved = bytesToReserve; + this.originalBytesReserved = bytesToReserve; + } + + /** + * Copy constructor. + * @param from where to copy from + */ + public LocalReplicaInPipeline(LocalReplicaInPipeline from) { + super(from); + this.bytesAcked = from.getBytesAcked(); + this.bytesOnDisk = from.getBytesOnDisk(); + this.writer.set(from.writer.get()); + this.bytesReserved = from.bytesReserved; + this.originalBytesReserved = from.originalBytesReserved; + } + + @Override + public long getVisibleLength() { + return -1; + } + + @Override //ReplicaInfo + public ReplicaState getState() { + return ReplicaState.TEMPORARY; + } + + @Override // ReplicaInPipeline + public long getBytesAcked() { + return bytesAcked; + } + + @Override // ReplicaInPipeline + public void setBytesAcked(long bytesAcked) { + long newBytesAcked = bytesAcked - this.bytesAcked; + this.bytesAcked = bytesAcked; + + // Once bytes are ACK'ed we can release equivalent space from the + // volume's reservedForRbw count. We could have released it as soon + // as the write-to-disk completed but that would be inefficient. 
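
The accounting above releases reserved space as acks arrive rather than as bytes reach disk. With illustrative numbers (not from the patch), for a pipeline replica created with bytesToReserve of 128 MB:

    rbw.setBytesAcked(64L * 1024 * 1024); // returns 64 MB to the volume's reservedForRbw
    rbw.releaseAllBytesReserved();        // later frees whatever reservation remains
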
+ getVolume().releaseReservedSpace(newBytesAcked); + bytesReserved -= newBytesAcked; + } + + @Override // ReplicaInPipeline + public long getBytesOnDisk() { + return bytesOnDisk; + } + + @Override + public long getBytesReserved() { + return bytesReserved; + } + + @Override + public long getOriginalBytesReserved() { + return originalBytesReserved; + } + + @Override // ReplicaInPipeline + public void releaseAllBytesReserved() { + getVolume().releaseReservedSpace(bytesReserved); + getVolume().releaseLockedMemory(bytesReserved); + bytesReserved = 0; + } + + @Override // ReplicaInPipeline + public synchronized void setLastChecksumAndDataLen(long dataLength, + byte[] checksum) { + this.bytesOnDisk = dataLength; + this.lastChecksum = checksum; + } + + @Override // ReplicaInPipeline + public synchronized ChunkChecksum getLastChecksumAndDataLen() { + return new ChunkChecksum(getBytesOnDisk(), lastChecksum); + } + + @Override // ReplicaInPipeline + public void setWriter(Thread writer) { + this.writer.set(writer); + } + + @Override + public void interruptThread() { + Thread thread = writer.get(); + if (thread != null && thread != Thread.currentThread() + && thread.isAlive()) { + thread.interrupt(); + } + } + + @Override // Object + public boolean equals(Object o) { + return super.equals(o); + } + + /** + * Attempt to set the writer to a new value. + */ + @Override // ReplicaInPipeline + public boolean attemptToSetWriter(Thread prevWriter, Thread newWriter) { + return writer.compareAndSet(prevWriter, newWriter); + } + + /** + * Interrupt the writing thread and wait until it dies. + * @throws IOException the waiting is interrupted + */ + @Override // ReplicaInPipeline + public void stopWriter(long xceiverStopTimeout) throws IOException { + while (true) { + Thread thread = writer.get(); + if ((thread == null) || (thread == Thread.currentThread()) || + (!thread.isAlive())) { + if (writer.compareAndSet(thread, null)) { + return; // Done + } + // The writer changed. Go back to the start of the loop and attempt to + // stop the new writer. + continue; + } + thread.interrupt(); + try { + thread.join(xceiverStopTimeout); + if (thread.isAlive()) { + // Our thread join timed out. + final String msg = "Join on writer thread " + thread + " timed out"; + DataNode.LOG.warn(msg + "\n" + StringUtils.getStackTrace(thread)); + throw new IOException(msg); + } + } catch (InterruptedException e) { + throw new IOException("Waiting for writer thread is interrupted."); + } + } + } + + @Override // Object + public int hashCode() { + return super.hashCode(); + } + + @Override // ReplicaInPipeline + public ReplicaOutputStreams createStreams(boolean isCreate, + DataChecksum requestedChecksum) throws IOException { + File blockFile = getBlockFile(); + File metaFile = getMetaFile(); + if (DataNode.LOG.isDebugEnabled()) { + DataNode.LOG.debug("writeTo blockfile is " + blockFile + + " of size " + blockFile.length()); + DataNode.LOG.debug("writeTo metafile is " + metaFile + + " of size " + metaFile.length()); + } + long blockDiskSize = 0L; + long crcDiskSize = 0L; + + // the checksum that should actually be used -- this + // may differ from requestedChecksum for appends. + final DataChecksum checksum; + + RandomAccessFile metaRAF = new RandomAccessFile(metaFile, "rw"); + + if (!isCreate) { + // For append or recovery, we must enforce the existing checksum. + // Also, verify that the file has correct lengths, etc. 
+ boolean checkedMeta = false; + try { + BlockMetadataHeader header = BlockMetadataHeader.readHeader(metaRAF); + checksum = header.getChecksum(); + + if (checksum.getBytesPerChecksum() != + requestedChecksum.getBytesPerChecksum()) { + throw new IOException("Client requested checksum " + + requestedChecksum + " when appending to an existing block " + + "with different chunk size: " + checksum); + } + + int bytesPerChunk = checksum.getBytesPerChecksum(); + int checksumSize = checksum.getChecksumSize(); + + blockDiskSize = bytesOnDisk; + crcDiskSize = BlockMetadataHeader.getHeaderSize() + + (blockDiskSize+bytesPerChunk-1)/bytesPerChunk*checksumSize; + if (blockDiskSize > 0 && + (blockDiskSize > blockFile.length() || + crcDiskSize>metaFile.length())) { + throw new IOException("Corrupted block: " + this); + } + checkedMeta = true; + } finally { + if (!checkedMeta) { + // clean up in case of exceptions. + IOUtils.closeStream(metaRAF); + } + } + } else { + // for create, we can use the requested checksum + checksum = requestedChecksum; + } + + FileOutputStream blockOut = null; + FileOutputStream crcOut = null; + try { + blockOut = new FileOutputStream( + new RandomAccessFile(blockFile, "rw").getFD()); + crcOut = new FileOutputStream(metaRAF.getFD()); + if (!isCreate) { + blockOut.getChannel().position(blockDiskSize); + crcOut.getChannel().position(crcDiskSize); + } + return new ReplicaOutputStreams(blockOut, crcOut, checksum, + getVolume().isTransientStorage()); + } catch (IOException e) { + IOUtils.closeStream(blockOut); + IOUtils.closeStream(metaRAF); + throw e; + } + } + + @Override + public OutputStream createRestartMetaStream() throws IOException { + File blockFile = getBlockFile(); + File restartMeta = new File(blockFile.getParent() + + File.pathSeparator + "." + blockFile.getName() + ".restart"); + if (restartMeta.exists() && !restartMeta.delete()) { + DataNode.LOG.warn("Failed to delete restart meta file: " + + restartMeta.getPath()); + } + return new FileOutputStream(restartMeta); + } + + @Override + public String toString() { + return super.toString() + + "\n bytesAcked=" + bytesAcked + + "\n bytesOnDisk=" + bytesOnDisk; + } + + @Override + public ReplicaInfo getOriginalReplica() { + throw new UnsupportedOperationException("Replica of type " + getState() + + " does not support getOriginalReplica"); + } + + @Override + public long getRecoveryID() { + throw new UnsupportedOperationException("Replica of type " + getState() + + " does not support getRecoveryID"); + } + + @Override + public void setRecoveryID(long recoveryId) { + throw new UnsupportedOperationException("Replica of type " + getState() + + " does not support setRecoveryID"); + } + + @Override + public ReplicaRecoveryInfo createInfo(){ + throw new UnsupportedOperationException("Replica of type " + getState() + + " does not support createInfo"); + } + + public void moveReplicaFrom(ReplicaInfo oldReplicaInfo, File newBlkFile) + throws IOException { + + if (!(oldReplicaInfo instanceof LocalReplica)) { + throw new IOException("The source replica with blk id " + + oldReplicaInfo.getBlockId() + + " should be derived from LocalReplica"); + } + + LocalReplica localReplica = (LocalReplica) oldReplicaInfo; + + File oldmeta = localReplica.getMetaFile(); + File newmeta = getMetaFile(); + + if (LOG.isDebugEnabled()) { + LOG.debug("Renaming " + oldmeta + " to " + newmeta); + } + try { + NativeIO.renameTo(oldmeta, newmeta); + } catch (IOException e) { + throw new IOException("Block " + oldReplicaInfo + " reopen failed. 
" + + " Unable to move meta file " + oldmeta + + " to rbw dir " + newmeta, e); + } + + File blkfile = localReplica.getBlockFile(); + + if (LOG.isDebugEnabled()) { + LOG.debug("Renaming " + blkfile + " to " + newBlkFile + + ", file length=" + blkfile.length()); + } + try { + NativeIO.renameTo(blkfile, newBlkFile); + } catch (IOException e) { + try { + NativeIO.renameTo(newmeta, oldmeta); + } catch (IOException ex) { + LOG.warn("Cannot move meta file " + newmeta + + "back to the finalized directory " + oldmeta, ex); + } + throw new IOException("Block " + oldReplicaInfo + " reopen failed. " + + " Unable to move block file " + blkfile + + " to rbw dir " + newBlkFile, e); + } + } + + @Override // ReplicaInPipeline + public ReplicaInfo getReplicaInfo() { + return this; + } +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/ReplicaBeingWritten.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/ReplicaBeingWritten.java index 4a89493f03..262533e881 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/ReplicaBeingWritten.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/ReplicaBeingWritten.java @@ -27,9 +27,9 @@ * Those are the replicas that * are created in a pipeline initiated by a dfs client. */ -public class ReplicaBeingWritten extends ReplicaInPipeline { +public class ReplicaBeingWritten extends LocalReplicaInPipeline { /** - * Constructor for a zero length replica + * Constructor for a zero length replica. * @param blockId block id * @param genStamp replica generation stamp * @param vol volume where replica is located @@ -37,25 +37,25 @@ public class ReplicaBeingWritten extends ReplicaInPipeline { * @param bytesToReserve disk space to reserve for this replica, based on * the estimated maximum block length. */ - public ReplicaBeingWritten(long blockId, long genStamp, + public ReplicaBeingWritten(long blockId, long genStamp, FsVolumeSpi vol, File dir, long bytesToReserve) { super(blockId, genStamp, vol, dir, bytesToReserve); } - + /** - * Constructor + * Constructor. * @param block a block * @param vol volume where replica is located * @param dir directory path where block and meta files are located * @param writer a thread that is writing to this replica */ - public ReplicaBeingWritten(Block block, + public ReplicaBeingWritten(Block block, FsVolumeSpi vol, File dir, Thread writer) { - super( block, vol, dir, writer); + super(block, vol, dir, writer); } /** - * Constructor + * Constructor. * @param blockId block id * @param len replica length * @param genStamp replica generation stamp diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/ReplicaBuilder.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/ReplicaBuilder.java new file mode 100644 index 0000000000..280aaa0ee5 --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/ReplicaBuilder.java @@ -0,0 +1,252 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdfs.server.datanode; + +import java.io.File; + +import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsVolumeSpi; +import org.apache.hadoop.hdfs.protocol.Block; +import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.ReplicaState; + +/** + * This class is to be used as a builder for {@link ReplicaInfo} objects. + * The state of the replica is used to determine which object is instantiated. + */ +public class ReplicaBuilder { + + private ReplicaState state; + private long blockId; + private long genStamp; + private long length; + private FsVolumeSpi volume; + private File directoryUsed; + private long bytesToReserve; + private Thread writer; + private long recoveryId; + private Block block; + + private ReplicaInfo fromReplica; + + public ReplicaBuilder(ReplicaState state) { + volume = null; + writer = null; + block = null; + length = -1; + this.state = state; + } + + public ReplicaBuilder setState(ReplicaState state) { + this.state = state; + return this; + } + + public ReplicaBuilder setBlockId(long blockId) { + this.blockId = blockId; + return this; + } + + public ReplicaBuilder setGenerationStamp(long genStamp) { + this.genStamp = genStamp; + return this; + } + + public ReplicaBuilder setLength(long length) { + this.length = length; + return this; + } + + public ReplicaBuilder setFsVolume(FsVolumeSpi volume) { + this.volume = volume; + return this; + } + + public ReplicaBuilder setDirectoryToUse(File dir) { + this.directoryUsed = dir; + return this; + } + + public ReplicaBuilder setBytesToReserve(long bytesToReserve) { + this.bytesToReserve = bytesToReserve; + return this; + } + + public ReplicaBuilder setWriterThread(Thread writer) { + this.writer = writer; + return this; + } + + public ReplicaBuilder from(ReplicaInfo fromReplica) { + this.fromReplica = fromReplica; + return this; + } + + public ReplicaBuilder setRecoveryId(long recoveryId) { + this.recoveryId = recoveryId; + return this; + } + + public ReplicaBuilder setBlock(Block block) { + this.block = block; + return this; + } + + public LocalReplicaInPipeline buildLocalReplicaInPipeline() + throws IllegalArgumentException { + LocalReplicaInPipeline info = null; + switch(state) { + case RBW: + info = buildRBW(); + break; + case TEMPORARY: + info = buildTemporaryReplica(); + break; + default: + throw new IllegalArgumentException("Unknown replica state " + state); + } + return info; + } + + private LocalReplicaInPipeline buildRBW() throws IllegalArgumentException { + if (null != fromReplica && fromReplica.getState() == ReplicaState.RBW) { + return new ReplicaBeingWritten((ReplicaBeingWritten) fromReplica); + } else if (null != fromReplica) { + throw new IllegalArgumentException("Incompatible fromReplica " + + "state: " + fromReplica.getState()); + } else { + if (null != block) { + if (null == writer) { + throw new IllegalArgumentException("A valid writer is " + + "required for constructing a RBW from block " + + block.getBlockId()); + } + return new ReplicaBeingWritten(block, volume, directoryUsed, writer); + } else { + if (length != -1) { + return new 
ReplicaBeingWritten(blockId, length, genStamp, + volume, directoryUsed, writer, bytesToReserve); + } else { + return new ReplicaBeingWritten(blockId, genStamp, volume, + directoryUsed, bytesToReserve); + } + } + } + } + + private LocalReplicaInPipeline buildTemporaryReplica() + throws IllegalArgumentException { + if (null != fromReplica && + fromReplica.getState() == ReplicaState.TEMPORARY) { + return new LocalReplicaInPipeline((LocalReplicaInPipeline) fromReplica); + } else if (null != fromReplica) { + throw new IllegalArgumentException("Incompatible fromReplica " + + "state: " + fromReplica.getState()); + } else { + if (null != block) { + if (null == writer) { + throw new IllegalArgumentException("A valid writer is " + + "required for constructing a Replica from block " + + block.getBlockId()); + } + return new LocalReplicaInPipeline(block, volume, directoryUsed, + writer); + } else { + if (length != -1) { + return new LocalReplicaInPipeline(blockId, length, genStamp, + volume, directoryUsed, writer, bytesToReserve); + } else { + return new LocalReplicaInPipeline(blockId, genStamp, volume, + directoryUsed, bytesToReserve); + } + } + } + } + + private ReplicaInfo buildFinalizedReplica() throws IllegalArgumentException { + if (null != fromReplica && + fromReplica.getState() == ReplicaState.FINALIZED) { + return new FinalizedReplica((FinalizedReplica)fromReplica); + } else if (null != this.fromReplica) { + throw new IllegalArgumentException("Incompatible fromReplica " + + "state: " + fromReplica.getState()); + } else { + if (null != block) { + return new FinalizedReplica(block, volume, directoryUsed); + } else { + return new FinalizedReplica(blockId, length, genStamp, volume, + directoryUsed); + } + } + } + + private ReplicaInfo buildRWR() throws IllegalArgumentException { + + if (null != fromReplica && fromReplica.getState() == ReplicaState.RWR) { + return new ReplicaWaitingToBeRecovered( + (ReplicaWaitingToBeRecovered) fromReplica); + } else if (null != fromReplica){ + throw new IllegalArgumentException("Incompatible fromReplica " + + "state: " + fromReplica.getState()); + } else { + if (null != block) { + return new ReplicaWaitingToBeRecovered(block, volume, directoryUsed); + } else { + return new ReplicaWaitingToBeRecovered(blockId, length, genStamp, + volume, directoryUsed); + } + } + } + + private ReplicaInfo buildRUR() throws IllegalArgumentException { + if (null == fromReplica) { + throw new IllegalArgumentException( + "Missing a valid replica to recover from"); + } + if (null != writer || null != block) { + throw new IllegalArgumentException("Invalid state for " + + "recovering from replica with blk id " + + fromReplica.getBlockId()); + } + if (fromReplica.getState() == ReplicaState.RUR) { + return new ReplicaUnderRecovery((ReplicaUnderRecovery) fromReplica); + } else { + return new ReplicaUnderRecovery(fromReplica, recoveryId); + } + } + + public ReplicaInfo build() throws IllegalArgumentException { + ReplicaInfo info = null; + switch(this.state) { + case FINALIZED: + info = buildFinalizedReplica(); + break; + case RWR: + info = buildRWR(); + break; + case RUR: + info = buildRUR(); + break; + case RBW: + case TEMPORARY: + info = buildLocalReplicaInPipeline(); + break; + default: + throw new IllegalArgumentException("Unknown replica state " + state); + } + return info; + } +} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/ReplicaHandler.java 
b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/ReplicaHandler.java index b563d7f9e9..ddc9f9f2c0 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/ReplicaHandler.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/ReplicaHandler.java @@ -27,11 +27,11 @@ * the fs volume where this replica is located. */ public class ReplicaHandler implements Closeable { - private final ReplicaInPipelineInterface replica; + private final ReplicaInPipeline replica; private final FsVolumeReference volumeReference; public ReplicaHandler( - ReplicaInPipelineInterface replica, FsVolumeReference reference) { + ReplicaInPipeline replica, FsVolumeReference reference) { this.replica = replica; this.volumeReference = reference; } @@ -43,7 +43,7 @@ public void close() throws IOException { } } - public ReplicaInPipelineInterface getReplica() { + public ReplicaInPipeline getReplica() { return replica; } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/ReplicaInPipeline.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/ReplicaInPipeline.java index 732684692f..efa6ea686f 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/ReplicaInPipeline.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/ReplicaInPipeline.java @@ -17,313 +17,91 @@ */ package org.apache.hadoop.hdfs.server.datanode; -import java.io.File; -import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; -import java.io.RandomAccessFile; -import java.util.concurrent.atomic.AtomicReference; -import org.apache.hadoop.hdfs.protocol.Block; -import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.ReplicaState; -import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsVolumeSpi; import org.apache.hadoop.hdfs.server.datanode.fsdataset.ReplicaOutputStreams; -import org.apache.hadoop.io.IOUtils; import org.apache.hadoop.util.DataChecksum; -import org.apache.hadoop.util.StringUtils; /** - * This class defines a replica in a pipeline, which - * includes a persistent replica being written to by a dfs client or - * a temporary replica being replicated by a source datanode or - * being copied for the balancing purpose. - * - * The base class implements a temporary replica + * This defines the interface of a replica in Pipeline that's being written to */ -public class ReplicaInPipeline extends ReplicaInfo - implements ReplicaInPipelineInterface { - private long bytesAcked; - private long bytesOnDisk; - private byte[] lastChecksum; - private AtomicReference writer = new AtomicReference(); +public interface ReplicaInPipeline extends Replica { + /** + * Set the number of bytes received + * @param bytesReceived number of bytes received + */ + void setNumBytes(long bytesReceived); /** - * Bytes reserved for this replica on the containing volume. - * Based off difference between the estimated maximum block length and - * the bytes already written to this block. 
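
The ReplicaBuilder introduced above is how dataset code is meant to construct replicas instead of instantiating the concrete classes directly. A hedged sketch of building an RBW replica (volume, directory and length variables are placeholders):

    ReplicaInPipeline rbw = new ReplicaBuilder(ReplicaState.RBW)
        .setBlockId(blockId)
        .setGenerationStamp(genStamp)
        .setFsVolume(volume)                  // FsVolumeSpi
        .setDirectoryToUse(rbwDir)            // File under the volume's rbw directory
        .setBytesToReserve(estimatedBlockLen)
        .buildLocalReplicaInPipeline();       // yields a ReplicaBeingWritten
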
+ * Get the number of bytes acked + * @return the number of bytes acked */ - private long bytesReserved; - private final long originalBytesReserved; + long getBytesAcked(); /** - * Constructor for a zero length replica - * @param blockId block id - * @param genStamp replica generation stamp - * @param vol volume where replica is located - * @param dir directory path where block and meta files are located - * @param bytesToReserve disk space to reserve for this replica, based on - * the estimated maximum block length. + * Set the number bytes that have acked + * @param bytesAcked number bytes acked */ - public ReplicaInPipeline(long blockId, long genStamp, - FsVolumeSpi vol, File dir, long bytesToReserve) { - this(blockId, 0L, genStamp, vol, dir, Thread.currentThread(), bytesToReserve); - } + void setBytesAcked(long bytesAcked); /** - * Constructor - * @param block a block - * @param vol volume where replica is located - * @param dir directory path where block and meta files are located - * @param writer a thread that is writing to this replica + * Release any disk space reserved for this replica. */ - ReplicaInPipeline(Block block, - FsVolumeSpi vol, File dir, Thread writer) { - this( block.getBlockId(), block.getNumBytes(), block.getGenerationStamp(), - vol, dir, writer, 0L); - } + public void releaseAllBytesReserved(); /** - * Constructor - * @param blockId block id - * @param len replica length - * @param genStamp replica generation stamp - * @param vol volume where replica is located - * @param dir directory path where block and meta files are located - * @param writer a thread that is writing to this replica - * @param bytesToReserve disk space to reserve for this replica, based on - * the estimated maximum block length. + * store the checksum for the last chunk along with the data length + * @param dataLength number of bytes on disk + * @param lastChecksum - checksum bytes for the last chunk */ - ReplicaInPipeline(long blockId, long len, long genStamp, - FsVolumeSpi vol, File dir, Thread writer, long bytesToReserve) { - super( blockId, len, genStamp, vol, dir); - this.bytesAcked = len; - this.bytesOnDisk = len; - this.writer.set(writer); - this.bytesReserved = bytesToReserve; - this.originalBytesReserved = bytesToReserve; - } + public void setLastChecksumAndDataLen(long dataLength, byte[] lastChecksum); + + /** + * gets the last chunk checksum and the length of the block corresponding + * to that checksum + */ + public ChunkChecksum getLastChecksumAndDataLen(); + + /** + * Create output streams for writing to this replica, + * one for block file and one for CRC file + * + * @param isCreate if it is for creation + * @param requestedChecksum the checksum the writer would prefer to use + * @return output streams for writing + * @throws IOException if any error occurs + */ + public ReplicaOutputStreams createStreams(boolean isCreate, + DataChecksum requestedChecksum) throws IOException; /** - * Copy constructor. - * @param from where to copy from + * Create an output stream to write restart metadata in case of datanode + * shutting down for quick restart. + * + * @return output stream for writing. 
+ * @throws IOException if any error occurs */ - public ReplicaInPipeline(ReplicaInPipeline from) { - super(from); - this.bytesAcked = from.getBytesAcked(); - this.bytesOnDisk = from.getBytesOnDisk(); - this.writer.set(from.writer.get()); - this.bytesReserved = from.bytesReserved; - this.originalBytesReserved = from.originalBytesReserved; - } - - @Override - public long getVisibleLength() { - return -1; - } + public OutputStream createRestartMetaStream() throws IOException; - @Override //ReplicaInfo - public ReplicaState getState() { - return ReplicaState.TEMPORARY; - } + ReplicaInfo getReplicaInfo(); - @Override // ReplicaInPipelineInterface - public long getBytesAcked() { - return bytesAcked; - } + /** + * Set the thread that is writing to this replica + * @param writer a thread writing to this replica + */ + void setWriter(Thread writer); - @Override // ReplicaInPipelineInterface - public void setBytesAcked(long bytesAcked) { - long newBytesAcked = bytesAcked - this.bytesAcked; - this.bytesAcked = bytesAcked; - - // Once bytes are ACK'ed we can release equivalent space from the - // volume's reservedForRbw count. We could have released it as soon - // as the write-to-disk completed but that would be inefficient. - getVolume().releaseReservedSpace(newBytesAcked); - bytesReserved -= newBytesAcked; - } - - @Override // ReplicaInPipelineInterface - public long getBytesOnDisk() { - return bytesOnDisk; - } - - @Override - public long getBytesReserved() { - return bytesReserved; - } - - @Override - public long getOriginalBytesReserved() { - return originalBytesReserved; - } - - @Override - public void releaseAllBytesReserved() { // ReplicaInPipelineInterface - getVolume().releaseReservedSpace(bytesReserved); - getVolume().releaseLockedMemory(bytesReserved); - bytesReserved = 0; - } - - @Override // ReplicaInPipelineInterface - public synchronized void setLastChecksumAndDataLen(long dataLength, byte[] lastChecksum) { - this.bytesOnDisk = dataLength; - this.lastChecksum = lastChecksum; - } - - @Override // ReplicaInPipelineInterface - public synchronized ChunkChecksum getLastChecksumAndDataLen() { - return new ChunkChecksum(getBytesOnDisk(), lastChecksum); - } - - public void interruptThread() { - Thread thread = writer.get(); - if (thread != null && thread != Thread.currentThread() - && thread.isAlive()) { - thread.interrupt(); - } - } - - @Override // Object - public boolean equals(Object o) { - return super.equals(o); - } + void interruptThread(); /** * Attempt to set the writer to a new value. */ - public boolean attemptToSetWriter(Thread prevWriter, Thread newWriter) { - return writer.compareAndSet(prevWriter, newWriter); - } + boolean attemptToSetWriter(Thread prevWriter, Thread newWriter); /** - * Interrupt the writing thread and wait until it dies + * Interrupt the writing thread and wait until it dies. * @throws IOException the waiting is interrupted */ - public void stopWriter(long xceiverStopTimeout) throws IOException { - while (true) { - Thread thread = writer.get(); - if ((thread == null) || (thread == Thread.currentThread()) || - (!thread.isAlive())) { - if (writer.compareAndSet(thread, null) == true) { - return; // Done - } - // The writer changed. Go back to the start of the loop and attempt to - // stop the new writer. - continue; - } - thread.interrupt(); - try { - thread.join(xceiverStopTimeout); - if (thread.isAlive()) { - // Our thread join timed out. 
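
With ReplicaInPipeline reduced to an interface, write-side code touches the pipeline replica only through these methods. A small sketch of the calls a writer makes (variable names are placeholders; the methods are the ones declared above):

    ReplicaInPipeline pipeline = replicaHandler.getReplica();
    ReplicaOutputStreams streams =
        pipeline.createStreams(false, requestedChecksum);   // append/recovery path
    // ... write a packet ...
    pipeline.setLastChecksumAndDataLen(bytesOnDisk, lastChunkChecksum);
    pipeline.setBytesAcked(ackedLength);
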
- final String msg = "Join on writer thread " + thread + " timed out"; - DataNode.LOG.warn(msg + "\n" + StringUtils.getStackTrace(thread)); - throw new IOException(msg); - } - } catch (InterruptedException e) { - throw new IOException("Waiting for writer thread is interrupted."); - } - } - } - - @Override // Object - public int hashCode() { - return super.hashCode(); - } - - @Override // ReplicaInPipelineInterface - public ReplicaOutputStreams createStreams(boolean isCreate, - DataChecksum requestedChecksum) throws IOException { - File blockFile = getBlockFile(); - File metaFile = getMetaFile(); - if (DataNode.LOG.isDebugEnabled()) { - DataNode.LOG.debug("writeTo blockfile is " + blockFile + - " of size " + blockFile.length()); - DataNode.LOG.debug("writeTo metafile is " + metaFile + - " of size " + metaFile.length()); - } - long blockDiskSize = 0L; - long crcDiskSize = 0L; - - // the checksum that should actually be used -- this - // may differ from requestedChecksum for appends. - final DataChecksum checksum; - - RandomAccessFile metaRAF = new RandomAccessFile(metaFile, "rw"); - - if (!isCreate) { - // For append or recovery, we must enforce the existing checksum. - // Also, verify that the file has correct lengths, etc. - boolean checkedMeta = false; - try { - BlockMetadataHeader header = BlockMetadataHeader.readHeader(metaRAF); - checksum = header.getChecksum(); - - if (checksum.getBytesPerChecksum() != - requestedChecksum.getBytesPerChecksum()) { - throw new IOException("Client requested checksum " + - requestedChecksum + " when appending to an existing block " + - "with different chunk size: " + checksum); - } - - int bytesPerChunk = checksum.getBytesPerChecksum(); - int checksumSize = checksum.getChecksumSize(); - - blockDiskSize = bytesOnDisk; - crcDiskSize = BlockMetadataHeader.getHeaderSize() + - (blockDiskSize+bytesPerChunk-1)/bytesPerChunk*checksumSize; - if (blockDiskSize>0 && - (blockDiskSize>blockFile.length() || crcDiskSize>metaFile.length())) { - throw new IOException("Corrupted block: " + this); - } - checkedMeta = true; - } finally { - if (!checkedMeta) { - // clean up in case of exceptions. - IOUtils.closeStream(metaRAF); - } - } - } else { - // for create, we can use the requested checksum - checksum = requestedChecksum; - } - - FileOutputStream blockOut = null; - FileOutputStream crcOut = null; - try { - blockOut = new FileOutputStream( - new RandomAccessFile( blockFile, "rw" ).getFD() ); - crcOut = new FileOutputStream(metaRAF.getFD() ); - if (!isCreate) { - blockOut.getChannel().position(blockDiskSize); - crcOut.getChannel().position(crcDiskSize); - } - return new ReplicaOutputStreams(blockOut, crcOut, checksum, - getVolume().isTransientStorage()); - } catch (IOException e) { - IOUtils.closeStream(blockOut); - IOUtils.closeStream(metaRAF); - throw e; - } - } - - @Override - public OutputStream createRestartMetaStream() throws IOException { - File blockFile = getBlockFile(); - File restartMeta = new File(blockFile.getParent() + - File.pathSeparator + "." 
+ blockFile.getName() + ".restart"); - if (restartMeta.exists() && !restartMeta.delete()) { - DataNode.LOG.warn("Failed to delete restart meta file: " + - restartMeta.getPath()); - } - return new FileOutputStream(restartMeta); - } - - @Override - public String toString() { - return super.toString() - + "\n bytesAcked=" + bytesAcked - + "\n bytesOnDisk=" + bytesOnDisk; - } + void stopWriter(long xceiverStopTimeout) throws IOException; } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/ReplicaInPipelineInterface.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/ReplicaInPipelineInterface.java deleted file mode 100644 index ef9f3e2e2c..0000000000 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/ReplicaInPipelineInterface.java +++ /dev/null @@ -1,86 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.hadoop.hdfs.server.datanode; - -import java.io.IOException; -import java.io.OutputStream; - -import org.apache.hadoop.hdfs.server.datanode.fsdataset.ReplicaOutputStreams; -import org.apache.hadoop.util.DataChecksum; - -/** - * This defines the interface of a replica in Pipeline that's being written to - */ -public interface ReplicaInPipelineInterface extends Replica { - /** - * Set the number of bytes received - * @param bytesReceived number of bytes received - */ - void setNumBytes(long bytesReceived); - - /** - * Get the number of bytes acked - * @return the number of bytes acked - */ - long getBytesAcked(); - - /** - * Set the number bytes that have acked - * @param bytesAcked number bytes acked - */ - void setBytesAcked(long bytesAcked); - - /** - * Release any disk space reserved for this replica. 
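With ReplicaInPipeline reduced to the interface above, pipeline code no longer needs to down-cast to a concrete replica class. The sketch below is not part of the patch: the class and method names are invented here, it assumes the patched hadoop-hdfs classes on the classpath, and it uses only methods declared in the interface.

```java
import java.io.IOException;

import org.apache.hadoop.hdfs.server.datanode.ChunkChecksum;
import org.apache.hadoop.hdfs.server.datanode.ReplicaInPipeline;

/**
 * Hypothetical helper (not part of the patch) showing pipeline bookkeeping
 * written purely against the ReplicaInPipeline interface.
 */
final class PipelineAckSketch {

  private PipelineAckSketch() {
  }

  /**
   * Record that bytes up to {@code ackedLength} are on disk and acknowledged.
   * Acking lets the implementation release the matching reserved space.
   */
  static void recordAck(ReplicaInPipeline replica, long ackedLength,
      byte[] lastChunkChecksum) {
    replica.setLastChecksumAndDataLen(ackedLength, lastChunkChecksum);
    replica.setBytesAcked(ackedLength);
  }

  /**
   * Stop a competing writer and claim the replica for the current thread,
   * returning the point from which writing can resume.
   */
  static ChunkChecksum takeOver(ReplicaInPipeline replica,
      long xceiverStopTimeout) throws IOException {
    replica.stopWriter(xceiverStopTimeout);
    if (!replica.attemptToSetWriter(null, Thread.currentThread())) {
      throw new IOException("Another writer claimed " + replica);
    }
    return replica.getLastChecksumAndDataLen();
  }
}
```

The setBytesAcked call is what allows an implementation to release the reserved volume space held for unacknowledged bytes, as the removed concrete ReplicaInPipeline implementation above did.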
- */ - public void releaseAllBytesReserved(); - - /** - * store the checksum for the last chunk along with the data length - * @param dataLength number of bytes on disk - * @param lastChecksum - checksum bytes for the last chunk - */ - public void setLastChecksumAndDataLen(long dataLength, byte[] lastChecksum); - - /** - * gets the last chunk checksum and the length of the block corresponding - * to that checksum - */ - public ChunkChecksum getLastChecksumAndDataLen(); - - /** - * Create output streams for writing to this replica, - * one for block file and one for CRC file - * - * @param isCreate if it is for creation - * @param requestedChecksum the checksum the writer would prefer to use - * @return output streams for writing - * @throws IOException if any error occurs - */ - public ReplicaOutputStreams createStreams(boolean isCreate, - DataChecksum requestedChecksum) throws IOException; - - /** - * Create an output stream to write restart metadata in case of datanode - * shutting down for quick restart. - * - * @return output stream for writing. - * @throws IOException if any error occurs - */ - public OutputStream createRestartMetaStream() throws IOException; -} diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/ReplicaInfo.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/ReplicaInfo.java index 3ef639033c..cbbafc37a1 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/ReplicaInfo.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/ReplicaInfo.java @@ -17,23 +17,20 @@ */ package org.apache.hadoop.hdfs.server.datanode; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileOutputStream; import java.io.IOException; -import java.util.HashMap; -import java.util.Map; +import java.io.InputStream; +import java.io.OutputStream; +import java.net.URI; import org.apache.hadoop.classification.InterfaceAudience; -import org.apache.hadoop.fs.FileUtil; -import org.apache.hadoop.fs.HardLink; +import org.apache.hadoop.fs.LocalFileSystem; import org.apache.hadoop.hdfs.protocol.Block; +import org.apache.hadoop.hdfs.server.datanode.DirectoryScanner.ScanInfo; import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsVolumeSpi; -import org.apache.hadoop.io.IOUtils; +import org.apache.hadoop.hdfs.server.datanode.fsdataset.LengthInputStream; +import org.apache.hadoop.hdfs.server.protocol.ReplicaRecoveryInfo; import org.apache.hadoop.util.LightWeightResizableGSet; -import com.google.common.annotations.VisibleForTesting; - /** * This class is used by datanodes to maintain meta data of its replicas. * It provides a general interface for meta information of a replica. @@ -42,81 +39,26 @@ abstract public class ReplicaInfo extends Block implements Replica, LightWeightResizableGSet.LinkedElement { - /** For implementing {@link LightWeightResizableGSet.LinkedElement} interface */ + /** For implementing {@link LightWeightResizableGSet.LinkedElement}. */ private LightWeightResizableGSet.LinkedElement next; - /** volume where the replica belongs */ + /** volume where the replica belongs. */ private FsVolumeSpi volume; - - /** directory where block & meta files belong */ - - /** - * Base directory containing numerically-identified sub directories and - * possibly blocks. 
- */ - private File baseDir; - - /** - * Whether or not this replica's parent directory includes subdirs, in which - * case we can generate them based on the replica's block ID - */ - private boolean hasSubdirs; - - private static final Map internedBaseDirs = new HashMap(); /** - * Constructor - * @param block a block - * @param vol volume where replica is located - * @param dir directory path where block and meta files are located - */ - ReplicaInfo(Block block, FsVolumeSpi vol, File dir) { - this(block.getBlockId(), block.getNumBytes(), - block.getGenerationStamp(), vol, dir); - } - - /** - * Constructor - * @param blockId block id - * @param len replica length - * @param genStamp replica generation stamp - * @param vol volume where replica is located - * @param dir directory path where block and meta files are located - */ - ReplicaInfo(long blockId, long len, long genStamp, - FsVolumeSpi vol, File dir) { + * Constructor + * @param vol volume where replica is located + * @param blockId block id + * @param len replica length + * @param genStamp replica generation stamp + */ + ReplicaInfo(FsVolumeSpi vol, long blockId, long len, long genStamp) { super(blockId, len, genStamp); this.volume = vol; - setDirInternal(dir); - } - - /** - * Copy constructor. - * @param from where to copy from - */ - ReplicaInfo(ReplicaInfo from) { - this(from, from.getVolume(), from.getDir()); } /** - * Get the full path of this replica's data file - * @return the full path of this replica's data file - */ - public File getBlockFile() { - return new File(getDir(), getBlockName()); - } - - /** - * Get the full path of this replica's meta file - * @return the full path of this replica's meta file - */ - public File getMetaFile() { - return new File(getDir(), - DatanodeUtil.getMetaName(getBlockName(), getGenerationStamp())); - } - - /** - * Get the volume where this replica is located on disk + * Get the volume where this replica is located on disk. * @return the volume where this replica is located on disk */ public FsVolumeSpi getVolume() { @@ -124,7 +66,7 @@ public FsVolumeSpi getVolume() { } /** - * Set the volume where this replica is located on disk + * Set the volume where this replica is located on disk. */ void setVolume(FsVolumeSpi vol) { this.volume = vol; @@ -137,67 +79,6 @@ void setVolume(FsVolumeSpi vol) { public String getStorageUuid() { return volume.getStorageID(); } - - /** - * Return the parent directory path where this replica is located - * @return the parent directory path where this replica is located - */ - File getDir() { - return hasSubdirs ? DatanodeUtil.idToBlockDir(baseDir, - getBlockId()) : baseDir; - } - - /** - * Set the parent directory where this replica is located - * @param dir the parent directory where the replica is located - */ - public void setDir(File dir) { - setDirInternal(dir); - } - - private void setDirInternal(File dir) { - if (dir == null) { - baseDir = null; - return; - } - - ReplicaDirInfo dirInfo = parseBaseDir(dir); - this.hasSubdirs = dirInfo.hasSubidrs; - - synchronized (internedBaseDirs) { - if (!internedBaseDirs.containsKey(dirInfo.baseDirPath)) { - // Create a new String path of this file and make a brand new File object - // to guarantee we drop the reference to the underlying char[] storage. 
- File baseDir = new File(dirInfo.baseDirPath); - internedBaseDirs.put(dirInfo.baseDirPath, baseDir); - } - this.baseDir = internedBaseDirs.get(dirInfo.baseDirPath); - } - } - - @VisibleForTesting - public static class ReplicaDirInfo { - public String baseDirPath; - public boolean hasSubidrs; - - public ReplicaDirInfo (String baseDirPath, boolean hasSubidrs) { - this.baseDirPath = baseDirPath; - this.hasSubidrs = hasSubidrs; - } - } - - @VisibleForTesting - public static ReplicaDirInfo parseBaseDir(File dir) { - - File currentDir = dir; - boolean hasSubdirs = false; - while (currentDir.getName().startsWith(DataStorage.BLOCK_SUBDIR_PREFIX)) { - hasSubdirs = true; - currentDir = currentDir.getParentFile(); - } - - return new ReplicaDirInfo(currentDir.getAbsolutePath(), hasSubdirs); - } /** * Number of bytes reserved for this replica on disk. @@ -206,6 +87,166 @@ public long getBytesReserved() { return 0; } + /** + * Get the {@code URI} for where the data of this replica is stored. + * @return {@code URI} for the location of replica data. + */ + abstract public URI getBlockURI(); + + /** + * Returns an {@link InputStream} to the replica's data. + * @param seekOffset the offset at which the read is started from. + * @return the {@link InputStream} to read the replica data. + * @throws IOException if an error occurs in opening a stream to the data. + */ + abstract public InputStream getDataInputStream(long seekOffset) + throws IOException; + + /** + * Returns an {@link OutputStream} to the replica's data. + * @param append indicates if the block should be opened for append. + * @return the {@link OutputStream} to write to the replica. + * @throws IOException if an error occurs in creating an {@link OutputStream}. + */ + abstract public OutputStream getDataOutputStream(boolean append) + throws IOException; + + /** + * @return true if the replica's data exists. + */ + abstract public boolean blockDataExists(); + + /** + * Used to deletes the replica's block data. + * + * @return true if the replica's data is successfully deleted. + */ + abstract public boolean deleteBlockData(); + + /** + * @return the length of the block on storage. + */ + abstract public long getBlockDataLength(); + + /** + * Get the {@code URI} for where the metadata of this replica is stored. + * + * @return {@code URI} for the location of replica metadata. + */ + abstract public URI getMetadataURI(); + + /** + * Returns an {@link InputStream} to the replica's metadata. + * @param offset the offset at which the read is started from. + * @return the {@link LengthInputStream} to read the replica metadata. + * @throws IOException + */ + abstract public LengthInputStream getMetadataInputStream(long offset) + throws IOException; + + /** + * Returns an {@link OutputStream} to the replica's metadata. + * @param append indicates if the block metadata should be opened for append. + * @return the {@link OutputStream} to write to the replica's metadata. + * @throws IOException if an error occurs in creating an {@link OutputStream}. + */ + abstract public OutputStream getMetadataOutputStream(boolean append) + throws IOException; + + /** + * @return true if the replica's metadata exists. + */ + abstract public boolean metadataExists(); + + /** + * Used to deletes the replica's metadata. + * + * @return true if the replica's metadata is successfully deleted. + */ + abstract public boolean deleteMetadata(); + + /** + * @return the length of the metadata on storage. 
+ */ + abstract public long getMetadataLength(); + + /** + * Rename the metadata {@link URI} to that referenced by {@code destURI}. + * + * @param destURI the target {@link URI}. + * @return true if the rename is successful. + * @throws IOException if an exception occurs in the rename. + */ + abstract public boolean renameMeta(URI destURI) throws IOException; + + /** + * Rename the data {@link URI} to that referenced by {@code destURI}. + * + * @param destURI the target {@link URI}. + * @return true if the rename is successful. + * @throws IOException if an exception occurs in the rename. + */ + abstract public boolean renameData(URI destURI) throws IOException; + + /** + * Update this replica with the {@link StorageLocation} found. + * @param replicaLocation the {@link StorageLocation} found for this replica. + */ + abstract public void updateWithReplica(StorageLocation replicaLocation); + + /** + * Check whether the block was pinned. + * @param localFS the local filesystem to use. + * @return true if the block is pinned. + * @throws IOException + */ + abstract public boolean getPinning(LocalFileSystem localFS) + throws IOException; + + /** + * Set a block to be pinned on this datanode so that it cannot be moved + * by Balancer/Mover. + * + * @param localFS the local filesystem to use. + * @throws IOException if there is an exception in the pinning. + */ + abstract public void setPinning(LocalFileSystem localFS) throws IOException; + + /** + * Bump a replica's generation stamp to a new one. + * Its on-disk meta file name is renamed to be the new one too. + * + * @param newGS new generation stamp + * @throws IOException if the change fails + */ + abstract public void bumpReplicaGS(long newGS) throws IOException; + + abstract public ReplicaInfo getOriginalReplica(); + + /** + * Get the recovery id. + * @return the generation stamp that the replica will be bumped to + */ + abstract public long getRecoveryID(); + + /** + * Set the recovery id. + * @param recoveryId the new recoveryId + */ + abstract public void setRecoveryID(long recoveryId); + + abstract public boolean breakHardLinksIfNeeded() throws IOException; + + abstract public ReplicaRecoveryInfo createInfo(); + + abstract public int compareWith(ScanInfo info); + + abstract public void truncateBlock(long newLength) throws IOException; + + abstract public void copyMetadata(URI destination) throws IOException; + + abstract public void copyBlockdata(URI destination) throws IOException; + /** * Number of bytes originally reserved for this replica. The actual * reservation is adjusted as data is written to disk. @@ -216,79 +257,6 @@ public long getOriginalBytesReserved() { return 0; } - /** - * Copy specified file into a temporary file. Then rename the - * temporary file to the original name. This will cause any - * hardlinks to the original file to be removed. The temporary - * files are created in the same directory. The temporary files will - * be recovered (especially on Windows) on datanode restart. 
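The abstract methods above replace the old File-based accessors: callers now reach a replica's data and metadata only through URIs and streams that the replica itself hands out. A minimal sketch under that assumption follows; it is not part of the patch, and ReplicaCopySketch and its method are invented names for illustration.

```java
import java.io.File;
import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.hdfs.server.datanode.DatanodeUtil;
import org.apache.hadoop.hdfs.server.datanode.ReplicaInfo;

/**
 * Hypothetical sketch (not part of the patch): copy a replica's block and
 * meta files to another directory using only the URI-based API.
 */
final class ReplicaCopySketch {

  private ReplicaCopySketch() {
  }

  static void copyTo(ReplicaInfo replica, File destDir) throws IOException {
    if (!replica.blockDataExists() || !replica.metadataExists()) {
      throw new IOException("Missing block or meta data for " + replica);
    }
    // Callers only name target URIs; the replica implementation decides how
    // its own storage is reached.
    URI dstBlock = new File(destDir, replica.getBlockName()).toURI();
    URI dstMeta = new File(destDir, DatanodeUtil.getMetaName(
        replica.getBlockName(), replica.getGenerationStamp())).toURI();
    replica.copyBlockdata(dstBlock);
    replica.copyMetadata(dstMeta);
  }
}
```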
- */ - private void breakHardlinks(File file, Block b) throws IOException { - File tmpFile = DatanodeUtil.createTmpFile(b, DatanodeUtil.getUnlinkTmpFile(file)); - try { - FileInputStream in = new FileInputStream(file); - try { - FileOutputStream out = new FileOutputStream(tmpFile); - try { - IOUtils.copyBytes(in, out, 16 * 1024); - } finally { - out.close(); - } - } finally { - in.close(); - } - if (file.length() != tmpFile.length()) { - throw new IOException("Copy of file " + file + " size " + file.length()+ - " into file " + tmpFile + - " resulted in a size of " + tmpFile.length()); - } - FileUtil.replaceFile(tmpFile, file); - } catch (IOException e) { - boolean done = tmpFile.delete(); - if (!done) { - DataNode.LOG.info("detachFile failed to delete temporary file " + - tmpFile); - } - throw e; - } - } - - /** - * This function "breaks hardlinks" to the current replica file. - * - * When doing a DataNode upgrade, we create a bunch of hardlinks to each block - * file. This cleverly ensures that both the old and the new storage - * directories can contain the same block file, without using additional space - * for the data. - * - * However, when we want to append to the replica file, we need to "break" the - * hardlink to ensure that the old snapshot continues to contain the old data - * length. If we failed to do that, we could roll back to the previous/ - * directory during a downgrade, and find that the block contents were longer - * than they were at the time of upgrade. - * - * @return true only if data was copied. - * @throws IOException - */ - public boolean breakHardLinksIfNeeded() throws IOException { - File file = getBlockFile(); - if (file == null || getVolume() == null) { - throw new IOException("detachBlock:Block not found. " + this); - } - File meta = getMetaFile(); - - int linkCount = HardLink.getLinkCount(file); - if (linkCount > 1) { - DataNode.LOG.info("Breaking hardlink for " + linkCount + "x-linked " + - "block " + this); - breakHardlinks(file, this); - } - if (HardLink.getLinkCount(meta) > 1) { - breakHardlinks(meta, this); - } - return true; - } - @Override //Object public String toString() { return getClass().getSimpleName() @@ -298,7 +266,7 @@ public String toString() { + "\n getBytesOnDisk() = " + getBytesOnDisk() + "\n getVisibleLength()= " + getVisibleLength() + "\n getVolume() = " + getVolume() - + "\n getBlockFile() = " + getBlockFile(); + + "\n getBlockURI() = " + getBlockURI(); } @Override diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/ReplicaUnderRecovery.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/ReplicaUnderRecovery.java index 558ee21753..09140e7b84 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/ReplicaUnderRecovery.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/ReplicaUnderRecovery.java @@ -17,8 +17,6 @@ */ package org.apache.hadoop.hdfs.server.datanode; -import java.io.File; - import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.ReplicaState; import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsVolumeSpi; import org.apache.hadoop.hdfs.server.protocol.ReplicaRecoveryInfo; @@ -31,19 +29,19 @@ * A recovery with higher recovery id preempts recoveries with a lower id. 
* */ -public class ReplicaUnderRecovery extends ReplicaInfo { - private ReplicaInfo original; // the original replica that needs to be recovered +public class ReplicaUnderRecovery extends LocalReplica { + private LocalReplica original; // original replica to be recovered private long recoveryId; // recovery id; it is also the generation stamp // that the replica will be bumped to after recovery public ReplicaUnderRecovery(ReplicaInfo replica, long recoveryId) { - super(replica, replica.getVolume(), replica.getDir()); + super(replica, replica.getVolume(), ((LocalReplica)replica).getDir()); if ( replica.getState() != ReplicaState.FINALIZED && replica.getState() != ReplicaState.RBW && replica.getState() != ReplicaState.RWR ) { throw new IllegalArgumentException("Cannot recover replica: " + replica); } - this.original = replica; + this.original = (LocalReplica) replica; this.recoveryId = recoveryId; } @@ -53,22 +51,16 @@ public ReplicaUnderRecovery(ReplicaInfo replica, long recoveryId) { */ public ReplicaUnderRecovery(ReplicaUnderRecovery from) { super(from); - this.original = from.getOriginalReplica(); + this.original = (LocalReplica) from.getOriginalReplica(); this.recoveryId = from.getRecoveryID(); } - /** - * Get the recovery id - * @return the generation stamp that the replica will be bumped to - */ + @Override public long getRecoveryID() { return recoveryId; } - /** - * Set the recovery id - * @param recoveryId the new recoveryId - */ + @Override public void setRecoveryID(long recoveryId) { if (recoveryId > this.recoveryId) { this.recoveryId = recoveryId; @@ -82,6 +74,7 @@ public void setRecoveryID(long recoveryId) { * Get the original replica that's under recovery * @return the original replica under recovery */ + @Override public ReplicaInfo getOriginalReplica() { return original; } @@ -120,9 +113,9 @@ public void setNumBytes(long numBytes) { } @Override //ReplicaInfo - public void setDir(File dir) { - super.setDir(dir); - original.setDir(dir); + public void updateWithReplica(StorageLocation replicaLocation) { + super.updateWithReplica(replicaLocation); + original.updateWithReplica(replicaLocation); } @Override //ReplicaInfo @@ -148,6 +141,7 @@ public String toString() { + "\n original=" + original; } + @Override public ReplicaRecoveryInfo createInfo() { return new ReplicaRecoveryInfo(original.getBlockId(), original.getBytesOnDisk(), original.getGenerationStamp(), diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/ReplicaWaitingToBeRecovered.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/ReplicaWaitingToBeRecovered.java index 220649d1eb..38ef286d18 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/ReplicaWaitingToBeRecovered.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/ReplicaWaitingToBeRecovered.java @@ -22,6 +22,7 @@ import org.apache.hadoop.hdfs.protocol.Block; import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.ReplicaState; import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsVolumeSpi; +import org.apache.hadoop.hdfs.server.protocol.ReplicaRecoveryInfo; /** * This class represents a replica that is waiting to be recovered. @@ -32,7 +33,7 @@ * client continues to write or be recovered as a result of * lease recovery. 
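The recovery accessors (getRecoveryID, setRecoveryID, getOriginalReplica, createInfo) now live on ReplicaInfo and are overridden by ReplicaUnderRecovery above, while ReplicaWaitingToBeRecovered in the hunk below answers them with UnsupportedOperationException. A hypothetical guard, not part of the patch, that only consults these accessors for a replica under recovery:

```java
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.ReplicaState;
import org.apache.hadoop.hdfs.server.datanode.ReplicaInfo;
import org.apache.hadoop.hdfs.server.protocol.ReplicaRecoveryInfo;

/**
 * Hypothetical guard (not part of the patch): consult the recovery accessors
 * only for replicas that are actually under recovery.
 */
final class RecoveryInfoSketch {

  private RecoveryInfoSketch() {
  }

  static ReplicaRecoveryInfo recoveryInfoOrNull(ReplicaInfo replica) {
    if (replica.getState() != ReplicaState.RUR) {
      // States that do not participate in recovery may refuse these calls.
      return null;
    }
    // ReplicaUnderRecovery overrides createInfo() and getRecoveryID().
    return replica.createInfo();
  }
}
```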
*/ -public class ReplicaWaitingToBeRecovered extends ReplicaInfo { +public class ReplicaWaitingToBeRecovered extends LocalReplica { /** * Constructor @@ -94,4 +95,28 @@ public int hashCode() { public String toString() { return super.toString(); } + + @Override + public ReplicaInfo getOriginalReplica() { + throw new UnsupportedOperationException("Replica of type " + getState() + + " does not support getOriginalReplica"); + } + + @Override + public long getRecoveryID() { + throw new UnsupportedOperationException("Replica of type " + getState() + + " does not support getRecoveryID"); + } + + @Override + public void setRecoveryID(long recoveryId) { + throw new UnsupportedOperationException("Replica of type " + getState() + + " does not support getRecoveryID"); + } + + @Override + public ReplicaRecoveryInfo createInfo() { + throw new UnsupportedOperationException("Replica of type " + getState() + + " does not support createInfo"); + } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/FsDatasetSpi.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/FsDatasetSpi.java index acc269af87..b75ed5bea3 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/FsDatasetSpi.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/FsDatasetSpi.java @@ -44,9 +44,8 @@ import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.ReplicaState; import org.apache.hadoop.hdfs.server.datanode.DataNode; import org.apache.hadoop.hdfs.server.datanode.DataStorage; -import org.apache.hadoop.hdfs.server.datanode.FinalizedReplica; import org.apache.hadoop.hdfs.server.datanode.Replica; -import org.apache.hadoop.hdfs.server.datanode.ReplicaInPipelineInterface; +import org.apache.hadoop.hdfs.server.datanode.ReplicaInPipeline; import org.apache.hadoop.hdfs.server.datanode.ReplicaHandler; import org.apache.hadoop.hdfs.server.datanode.ReplicaInfo; import org.apache.hadoop.hdfs.server.datanode.ReplicaNotFoundException; @@ -230,10 +229,10 @@ StorageReport[] getStorageReports(String bpid) VolumeFailureSummary getVolumeFailureSummary(); /** @return a list of finalized blocks for the given block pool. */ - List getFinalizedBlocks(String bpid); + List getFinalizedBlocks(String bpid); /** @return a list of finalized blocks for the given block pool. 
*/ - List getFinalizedBlocksOnPersistentStorage(String bpid); + List getFinalizedBlocksOnPersistentStorage(String bpid); /** * Check whether the in-memory block record matches the block on the disk, @@ -337,7 +336,7 @@ ReplicaHandler recoverRbw(ExtendedBlock b, * @param temporary the temporary replica being converted * @return the result RBW */ - ReplicaInPipelineInterface convertTemporaryToRbw( + ReplicaInPipeline convertTemporaryToRbw( ExtendedBlock temporary) throws IOException; /** diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/BlockPoolSlice.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/BlockPoolSlice.java index 1e4e37a4f9..b4384b3b12 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/BlockPoolSlice.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/BlockPoolSlice.java @@ -45,13 +45,13 @@ import org.apache.hadoop.hdfs.protocol.Block; import org.apache.hadoop.hdfs.protocol.BlockListAsLongs; import org.apache.hadoop.hdfs.protocol.BlockListAsLongs.BlockReportReplica; +import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.ReplicaState; import org.apache.hadoop.hdfs.server.datanode.BlockMetadataHeader; import org.apache.hadoop.hdfs.server.datanode.DataStorage; import org.apache.hadoop.hdfs.server.datanode.DatanodeUtil; -import org.apache.hadoop.hdfs.server.datanode.FinalizedReplica; import org.apache.hadoop.hdfs.server.datanode.ReplicaInfo; -import org.apache.hadoop.hdfs.server.datanode.ReplicaBeingWritten; -import org.apache.hadoop.hdfs.server.datanode.ReplicaWaitingToBeRecovered; +import org.apache.hadoop.hdfs.server.datanode.ReplicaBuilder; +import org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.RamDiskReplicaTracker.RamDiskReplica; import org.apache.hadoop.io.IOUtils; import org.apache.hadoop.io.nativeio.NativeIO; import org.apache.hadoop.util.DataChecksum; @@ -309,14 +309,14 @@ File createRbwFile(Block b) throws IOException { return rbwFile; } - File addFinalizedBlock(Block b, File f) throws IOException { + File addFinalizedBlock(Block b, ReplicaInfo replicaInfo) throws IOException { File blockDir = DatanodeUtil.idToBlockDir(finalizedDir, b.getBlockId()); if (!blockDir.exists()) { if (!blockDir.mkdirs()) { throw new IOException("Failed to mkdirs " + blockDir); } } - File blockFile = FsDatasetImpl.moveBlockFiles(b, f, blockDir); + File blockFile = FsDatasetImpl.moveBlockFiles(b, replicaInfo, blockDir); File metaFile = FsDatasetUtil.getMetaFile(blockFile, b.getGenerationStamp()); if (dfsUsage instanceof CachingGetSpaceUsed) { ((CachingGetSpaceUsed) dfsUsage).incDfsUsed( @@ -329,16 +329,28 @@ File addFinalizedBlock(Block b, File f) throws IOException { * Move a persisted replica from lazypersist directory to a subdirectory * under finalized. 
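The BlockPoolSlice hunks below swap direct replica constructors for ReplicaBuilder. A minimal sketch of that builder pattern, assuming the patched ReplicaBuilder API; the wrapper class and its throws clause are illustrative only, not part of the patch:

```java
import java.io.File;
import java.io.IOException;

import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.ReplicaState;
import org.apache.hadoop.hdfs.server.datanode.ReplicaBuilder;
import org.apache.hadoop.hdfs.server.datanode.ReplicaInfo;
import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsVolumeSpi;

/**
 * Hypothetical sketch (not part of the patch) of the ReplicaBuilder pattern
 * used in place of per-state constructors.
 */
final class ReplicaBuilderSketch {

  private ReplicaBuilderSketch() {
  }

  static ReplicaInfo newFinalized(long blockId, long length, long genStamp,
      FsVolumeSpi volume, File dir) throws IOException {
    // The ReplicaState passed to the builder selects the concrete replica
    // class that build() returns.
    return new ReplicaBuilder(ReplicaState.FINALIZED)
        .setBlockId(blockId)
        .setLength(length)
        .setGenerationStamp(genStamp)
        .setFsVolume(volume)
        .setDirectoryToUse(dir)
        .build();
  }
}
```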
*/ - File activateSavedReplica(Block b, File metaFile, File blockFile) - throws IOException { - final File blockDir = DatanodeUtil.idToBlockDir(finalizedDir, b.getBlockId()); + ReplicaInfo activateSavedReplica(ReplicaInfo replicaInfo, + RamDiskReplica replicaState) throws IOException { + File metaFile = replicaState.getSavedMetaFile(); + File blockFile = replicaState.getSavedBlockFile(); + final long blockId = replicaInfo.getBlockId(); + final File blockDir = DatanodeUtil.idToBlockDir(finalizedDir, blockId); final File targetBlockFile = new File(blockDir, blockFile.getName()); final File targetMetaFile = new File(blockDir, metaFile.getName()); FileUtils.moveFile(blockFile, targetBlockFile); FsDatasetImpl.LOG.info("Moved " + blockFile + " to " + targetBlockFile); FileUtils.moveFile(metaFile, targetMetaFile); FsDatasetImpl.LOG.info("Moved " + metaFile + " to " + targetMetaFile); - return targetBlockFile; + + ReplicaInfo newReplicaInfo = + new ReplicaBuilder(ReplicaState.FINALIZED) + .setBlockId(blockId) + .setLength(replicaInfo.getBytesOnDisk()) + .setGenerationStamp(replicaInfo.getGenerationStamp()) + .setFsVolume(replicaState.getLazyPersistVolume()) + .setDirectoryToUse(targetBlockFile.getParentFile()) + .build(); + return newReplicaInfo; } void checkDirs() throws DiskErrorException { @@ -461,9 +473,13 @@ private void addReplicaToReplicasMap(Block block, ReplicaMap volumeMap, long blockId = block.getBlockId(); long genStamp = block.getGenerationStamp(); if (isFinalized) { - newReplica = new FinalizedReplica(blockId, - block.getNumBytes(), genStamp, volume, DatanodeUtil - .idToBlockDir(finalizedDir, blockId)); + newReplica = new ReplicaBuilder(ReplicaState.FINALIZED) + .setBlockId(blockId) + .setLength(block.getNumBytes()) + .setGenerationStamp(genStamp) + .setFsVolume(volume) + .setDirectoryToUse(DatanodeUtil.idToBlockDir(finalizedDir, blockId)) + .build(); } else { File file = new File(rbwDir, block.getBlockName()); boolean loadRwr = true; @@ -477,9 +493,15 @@ private void addReplicaToReplicasMap(Block block, ReplicaMap volumeMap, // It didn't expire. Load the replica as a RBW. // We don't know the expected block length, so just use 0 // and don't reserve any more space for writes. - newReplica = new ReplicaBeingWritten(blockId, - validateIntegrityAndSetLength(file, genStamp), - genStamp, volume, file.getParentFile(), null, 0); + newReplica = new ReplicaBuilder(ReplicaState.RBW) + .setBlockId(blockId) + .setLength(validateIntegrityAndSetLength(file, genStamp)) + .setGenerationStamp(genStamp) + .setFsVolume(volume) + .setDirectoryToUse(file.getParentFile()) + .setWriterThread(null) + .setBytesToReserve(0) + .build(); loadRwr = false; } sc.close(); @@ -496,9 +518,13 @@ private void addReplicaToReplicasMap(Block block, ReplicaMap volumeMap, } // Restart meta doesn't exist or expired. if (loadRwr) { - newReplica = new ReplicaWaitingToBeRecovered(blockId, - validateIntegrityAndSetLength(file, genStamp), - genStamp, volume, file.getParentFile()); + ReplicaBuilder builder = new ReplicaBuilder(ReplicaState.RWR) + .setBlockId(blockId) + .setLength(validateIntegrityAndSetLength(file, genStamp)) + .setGenerationStamp(genStamp) + .setFsVolume(volume) + .setDirectoryToUse(file.getParentFile()); + newReplica = builder.build(); } } @@ -614,7 +640,7 @@ static ReplicaInfo selectReplicaToDelete(final ReplicaInfo replica1, // it's the same block so don't ever delete it, even if GS or size // differs. 
caller should keep the one it just discovered on disk - if (replica1.getBlockFile().equals(replica2.getBlockFile())) { + if (replica1.getBlockURI().equals(replica2.getBlockURI())) { return null; } if (replica1.getGenerationStamp() != replica2.getGenerationStamp()) { @@ -641,13 +667,11 @@ static ReplicaInfo selectReplicaToDelete(final ReplicaInfo replica1, private void deleteReplica(final ReplicaInfo replicaToDelete) { // Delete the files on disk. Failure here is okay. - final File blockFile = replicaToDelete.getBlockFile(); - if (!blockFile.delete()) { - LOG.warn("Failed to delete block file " + blockFile); + if (!replicaToDelete.deleteBlockData()) { + LOG.warn("Failed to delete block file for replica " + replicaToDelete); } - final File metaFile = replicaToDelete.getMetaFile(); - if (!metaFile.delete()) { - LOG.warn("Failed to delete meta file " + metaFile); + if (!replicaToDelete.deleteMetadata()) { + LOG.warn("Failed to delete meta file for replica " + replicaToDelete); } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetAsyncDiskService.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetAsyncDiskService.java index fdc9f83d52..c9160cd65a 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetAsyncDiskService.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetAsyncDiskService.java @@ -20,6 +20,7 @@ import java.io.File; import java.io.FileDescriptor; +import java.io.IOException; import java.util.HashMap; import java.util.HashSet; import java.util.Map; @@ -34,6 +35,8 @@ import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hdfs.protocol.ExtendedBlock; import org.apache.hadoop.hdfs.server.datanode.DataNode; +import org.apache.hadoop.hdfs.server.datanode.DatanodeUtil; +import org.apache.hadoop.hdfs.server.datanode.ReplicaInfo; import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsVolumeReference; import org.apache.hadoop.hdfs.server.protocol.BlockCommand; import org.apache.hadoop.io.IOUtils; @@ -211,12 +214,12 @@ public void run() { * Delete the block file and meta file from the disk asynchronously, adjust * dfsUsed statistics accordingly. */ - void deleteAsync(FsVolumeReference volumeRef, File blockFile, File metaFile, + void deleteAsync(FsVolumeReference volumeRef, ReplicaInfo replicaToDelete, ExtendedBlock block, String trashDirectory) { LOG.info("Scheduling " + block.getLocalBlock() - + " file " + blockFile + " for deletion"); + + " replica " + replicaToDelete + " for deletion"); ReplicaFileDeleteTask deletionTask = new ReplicaFileDeleteTask( - volumeRef, blockFile, metaFile, block, trashDirectory); + volumeRef, replicaToDelete, block, trashDirectory); execute(((FsVolumeImpl) volumeRef.getVolume()).getCurrentDir(), deletionTask); } @@ -227,19 +230,18 @@ void deleteAsync(FsVolumeReference volumeRef, File blockFile, File metaFile, * files are deleted immediately. 
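The reworked ReplicaFileDeleteTask below drives both plain deletion and the move-to-trash path entirely through ReplicaInfo. The following helper is a hypothetical condensation of that decision for illustration; it is not part of the patch and intentionally drops the logging and dfsUsed accounting of the real task:

```java
import java.io.File;
import java.io.IOException;

import org.apache.hadoop.hdfs.server.datanode.DatanodeUtil;
import org.apache.hadoop.hdfs.server.datanode.ReplicaInfo;

/**
 * Hypothetical condensation (not part of the patch) of the delete-or-move
 * decision made through the ReplicaInfo API.
 */
final class ReplicaDisposalSketch {

  private ReplicaDisposalSketch() {
  }

  /** @return true if the files were deleted or parked in the trash dir. */
  static boolean dispose(ReplicaInfo replica, String trashDirectory) {
    if (trashDirectory == null) {
      // Plain delete; a missing meta file is not treated as a failure.
      return replica.deleteBlockData()
          && (replica.deleteMetadata() || !replica.metadataExists());
    }
    File trashDir = new File(trashDirectory);
    if (!trashDir.exists() && !trashDir.mkdirs()) {
      return false;
    }
    String blockName = replica.getBlockName();
    long genStamp = replica.getGenerationStamp();
    try {
      // Renames go through the replica so the implementation can choose the
      // mechanism appropriate to its storage.
      return replica.renameData(new File(trashDir, blockName).toURI())
          && replica.renameMeta(new File(trashDir,
              DatanodeUtil.getMetaName(blockName, genStamp)).toURI());
    } catch (IOException e) {
      return false;
    }
  }
}
```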
*/ class ReplicaFileDeleteTask implements Runnable { - final FsVolumeReference volumeRef; - final FsVolumeImpl volume; - final File blockFile; - final File metaFile; - final ExtendedBlock block; - final String trashDirectory; - - ReplicaFileDeleteTask(FsVolumeReference volumeRef, File blockFile, - File metaFile, ExtendedBlock block, String trashDirectory) { + private final FsVolumeReference volumeRef; + private final FsVolumeImpl volume; + private final ReplicaInfo replicaToDelete; + private final ExtendedBlock block; + private final String trashDirectory; + + ReplicaFileDeleteTask(FsVolumeReference volumeRef, + ReplicaInfo replicaToDelete, ExtendedBlock block, + String trashDirectory) { this.volumeRef = volumeRef; this.volume = (FsVolumeImpl) volumeRef.getVolume(); - this.blockFile = blockFile; - this.metaFile = metaFile; + this.replicaToDelete = replicaToDelete; this.block = block; this.trashDirectory = trashDirectory; } @@ -248,15 +250,22 @@ class ReplicaFileDeleteTask implements Runnable { public String toString() { // Called in AsyncDiskService.execute for displaying error messages. return "deletion of block " + block.getBlockPoolId() + " " - + block.getLocalBlock() + " with block file " + blockFile - + " and meta file " + metaFile + " from volume " + volume; + + block.getLocalBlock() + " with block file " + + replicaToDelete.getBlockURI() + " and meta file " + + replicaToDelete.getMetadataURI() + " from volume " + volume; } private boolean deleteFiles() { - return blockFile.delete() && (metaFile.delete() || !metaFile.exists()); + return replicaToDelete.deleteBlockData() && + (replicaToDelete.deleteMetadata() || !replicaToDelete.metadataExists()); } private boolean moveFiles() { + if (trashDirectory == null) { + LOG.error("Trash dir for replica " + replicaToDelete + " is null"); + return false; + } + File trashDirFile = new File(trashDirectory); if (!trashDirFile.exists() && !trashDirFile.mkdirs()) { LOG.error("Failed to create trash directory " + trashDirectory); @@ -264,20 +273,28 @@ private boolean moveFiles() { } if (LOG.isDebugEnabled()) { - LOG.debug("Moving files " + blockFile.getName() + " and " + - metaFile.getName() + " to trash."); + LOG.debug("Moving files " + replicaToDelete.getBlockURI() + " and " + + replicaToDelete.getMetadataURI() + " to trash."); } - File newBlockFile = new File(trashDirectory, blockFile.getName()); - File newMetaFile = new File(trashDirectory, metaFile.getName()); - return (blockFile.renameTo(newBlockFile) && - metaFile.renameTo(newMetaFile)); + final String blockName = replicaToDelete.getBlockName(); + final long genstamp = replicaToDelete.getGenerationStamp(); + File newBlockFile = new File(trashDirectory, blockName); + File newMetaFile = new File(trashDirectory, + DatanodeUtil.getMetaName(blockName, genstamp)); + try { + return (replicaToDelete.renameData(newBlockFile.toURI()) && + replicaToDelete.renameMeta(newMetaFile.toURI())); + } catch (IOException e) { + LOG.error("Error moving files to trash: " + replicaToDelete, e); + } + return false; } @Override public void run() { - final long blockLength = blockFile.length(); - final long metaLength = metaFile.length(); + final long blockLength = replicaToDelete.getBlockDataLength(); + final long metaLength = replicaToDelete.getMetadataLength(); boolean result; result = (trashDirectory == null) ? deleteFiles() : moveFiles(); @@ -286,7 +303,7 @@ public void run() { LOG.warn("Unexpected error trying to " + (trashDirectory == null ? 
"delete" : "move") + " block " + block.getBlockPoolId() + " " + block.getLocalBlock() - + " at file " + blockFile + ". Ignored."); + + " at file " + replicaToDelete.getBlockURI() + ". Ignored."); } else { if(block.getLocalBlock().getNumBytes() != BlockCommand.NO_ACK){ datanode.notifyNamenodeDeletedBlock(block, volume.getStorageID()); @@ -294,7 +311,7 @@ public void run() { volume.onBlockFileDeletion(block.getBlockPoolId(), blockLength); volume.onMetaFileDeletion(block.getBlockPoolId(), metaLength); LOG.info("Deleted " + block.getBlockPoolId() + " " - + block.getLocalBlock() + " file " + blockFile); + + block.getLocalBlock() + " URI " + replicaToDelete.getBlockURI()); } updateDeletedBlockId(block); IOUtils.cleanup(null, volumeRef); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImpl.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImpl.java index e9f1dc13a8..54b2ce89c0 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImpl.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImpl.java @@ -22,12 +22,10 @@ import java.io.EOFException; import java.io.File; import java.io.FileDescriptor; -import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; -import java.io.RandomAccessFile; import java.nio.channels.ClosedChannelException; import java.nio.channels.FileChannel; import java.util.ArrayList; @@ -53,10 +51,8 @@ import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.CommonConfigurationKeys; -import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.LocalFileSystem; -import org.apache.hadoop.fs.permission.FsPermission; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.StorageType; import org.apache.hadoop.hdfs.DFSConfigKeys; @@ -77,17 +73,13 @@ import org.apache.hadoop.hdfs.server.datanode.metrics.DataNodeMetricHelper; import org.apache.hadoop.hdfs.server.datanode.DataStorage; import org.apache.hadoop.hdfs.server.datanode.DatanodeUtil; -import org.apache.hadoop.hdfs.server.datanode.FinalizedReplica; import org.apache.hadoop.hdfs.server.datanode.Replica; import org.apache.hadoop.hdfs.server.datanode.ReplicaAlreadyExistsException; -import org.apache.hadoop.hdfs.server.datanode.ReplicaBeingWritten; +import org.apache.hadoop.hdfs.server.datanode.ReplicaBuilder; import org.apache.hadoop.hdfs.server.datanode.ReplicaHandler; import org.apache.hadoop.hdfs.server.datanode.ReplicaInPipeline; -import org.apache.hadoop.hdfs.server.datanode.ReplicaInPipelineInterface; import org.apache.hadoop.hdfs.server.datanode.ReplicaInfo; import org.apache.hadoop.hdfs.server.datanode.ReplicaNotFoundException; -import org.apache.hadoop.hdfs.server.datanode.ReplicaUnderRecovery; -import org.apache.hadoop.hdfs.server.datanode.ReplicaWaitingToBeRecovered; import org.apache.hadoop.hdfs.server.datanode.StorageLocation; import org.apache.hadoop.hdfs.server.datanode.UnexpectedReplicaStateException; import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsDatasetSpi; @@ -192,13 +184,11 @@ public FsVolumeImpl getVolume(final ExtendedBlock b) { public Block getStoredBlock(String bpid, long blkid) throws IOException { try 
(AutoCloseableLock lock = datasetLock.acquire()) { - File blockfile = getFile(bpid, blkid, false); - if (blockfile == null) { + ReplicaInfo r = volumeMap.get(bpid, blkid); + if (r == null) { return null; } - final File metafile = FsDatasetUtil.findMetaFile(blockfile); - final long gs = FsDatasetUtil.parseGenerationStamp(blockfile, metafile); - return new Block(blkid, blockfile.length(), gs); + return new Block(blkid, r.getBytesOnDisk(), r.getGenerationStamp()); } } @@ -209,19 +199,16 @@ public Block getStoredBlock(String bpid, long blkid) */ ReplicaInfo fetchReplicaInfo(String bpid, long blockId) { ReplicaInfo r = volumeMap.get(bpid, blockId); - if(r == null) + if (r == null) { return null; + } switch(r.getState()) { case FINALIZED: - return new FinalizedReplica((FinalizedReplica)r); case RBW: - return new ReplicaBeingWritten((ReplicaBeingWritten)r); case RWR: - return new ReplicaWaitingToBeRecovered((ReplicaWaitingToBeRecovered)r); case RUR: - return new ReplicaUnderRecovery((ReplicaUnderRecovery)r); case TEMPORARY: - return new ReplicaInPipeline((ReplicaInPipeline)r); + return new ReplicaBuilder(r.getState()).from(r).build(); } return null; } @@ -229,16 +216,11 @@ ReplicaInfo fetchReplicaInfo(String bpid, long blockId) { @Override // FsDatasetSpi public LengthInputStream getMetaDataInputStream(ExtendedBlock b) throws IOException { - File meta = FsDatasetUtil.getMetaFile(getBlockFile(b), b.getGenerationStamp()); - if (meta == null || !meta.exists()) { + ReplicaInfo info = getBlockReplica(b); + if (info == null || !info.metadataExists()) { return null; } - if (isNativeIOAvailable) { - return new LengthInputStream( - NativeIO.getShareDeleteFileInputStream(meta), - meta.length()); - } - return new LengthInputStream(new FileInputStream(meta), meta.length()); + return info.getMetadataInputStream(0); } final DataNode datanode; @@ -738,62 +720,45 @@ public long getNumBlocksCached() { */ @Override // FsDatasetSpi public long getLength(ExtendedBlock b) throws IOException { - return getBlockFile(b).length(); + return getBlockReplica(b).getBlockDataLength(); } /** * Get File name for a given block. */ - private File getBlockFile(ExtendedBlock b) throws IOException { - return getBlockFile(b.getBlockPoolId(), b.getBlockId()); + private ReplicaInfo getBlockReplica(ExtendedBlock b) throws IOException { + return getBlockReplica(b.getBlockPoolId(), b.getBlockId()); } /** * Get File name for a given block. */ - File getBlockFile(String bpid, long blockId) throws IOException { - File f = validateBlockFile(bpid, blockId); - if(f == null) { + ReplicaInfo getBlockReplica(String bpid, long blockId) throws IOException { + ReplicaInfo r = validateBlockFile(bpid, blockId); + if (r == null) { throw new IOException("BlockId " + blockId + " is not valid."); } - return f; - } - - /** - * Return the File associated with a block, without first - * checking that it exists. This should be used when the - * next operation is going to open the file for read anyway, - * and thus the exists check is redundant. - * - * @param touch if true then update the last access timestamp of the - * block. Currently used for blocks on transient storage. 
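In the hunks around here, the dataset no longer resolves a java.io.File before reading; it asks the replica for an InputStream directly (see getBlockInputStream below). A hypothetical usage sketch, with invented names, assuming the patched ReplicaInfo API:

```java
import java.io.IOException;
import java.io.InputStream;

import org.apache.hadoop.hdfs.server.datanode.ReplicaInfo;

/**
 * Hypothetical sketch (not part of the patch): read replica data through the
 * stream accessors instead of resolving a block File first.
 */
final class ReplicaReadSketch {

  private ReplicaReadSketch() {
  }

  static int readInto(ReplicaInfo replica, long offset, byte[] buffer)
      throws IOException {
    if (!replica.blockDataExists()) {
      throw new IOException("No data exists for " + replica);
    }
    try (InputStream in = replica.getDataInputStream(offset)) {
      return in.read(buffer);
    }
  }
}
```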
- */ - private File getBlockFileNoExistsCheck(ExtendedBlock b, - boolean touch) - throws IOException { - final File f; - try (AutoCloseableLock lock = datasetLock.acquire()) { - f = getFile(b.getBlockPoolId(), b.getLocalBlock().getBlockId(), touch); - } - if (f == null) { - throw new IOException("Block " + b + " is not valid"); - } - return f; + return r; } @Override // FsDatasetSpi public InputStream getBlockInputStream(ExtendedBlock b, long seekOffset) throws IOException { - File blockFile = getBlockFileNoExistsCheck(b, true); - if (isNativeIOAvailable) { - return NativeIO.getShareDeleteFileInputStream(blockFile, seekOffset); + + ReplicaInfo info; + synchronized(this) { + info = volumeMap.get(b.getBlockPoolId(), b.getLocalBlock()); + } + + if (info != null && info.getVolume().isTransientStorage()) { + ramDiskReplicaTracker.touch(b.getBlockPoolId(), b.getBlockId()); + datanode.getMetrics().incrRamDiskBlocksReadHits(); + } + + if(info != null && info.blockDataExists()) { + return info.getDataInputStream(seekOffset); } else { - try { - return openAndSeek(blockFile, seekOffset); - } catch (FileNotFoundException fnfe) { - throw new IOException("Block " + b + " is not valid. " + - "Expected block file at " + blockFile + " does not exist."); - } + throw new IOException("No data exists for block " + b); } } @@ -814,7 +779,7 @@ ReplicaInfo getReplicaInfo(ExtendedBlock b) } return info; } - + /** * Get the meta info of a block stored in volumeMap. Block is looked up * without matching the generation stamp. @@ -824,7 +789,8 @@ ReplicaInfo getReplicaInfo(ExtendedBlock b) * @throws ReplicaNotFoundException if no entry is in the map or * there is a generation stamp mismatch */ - private ReplicaInfo getReplicaInfo(String bpid, long blkid) + @VisibleForTesting + ReplicaInfo getReplicaInfo(String bpid, long blkid) throws ReplicaNotFoundException { ReplicaInfo info = volumeMap.get(bpid, blkid); if (info == null) { @@ -833,7 +799,7 @@ private ReplicaInfo getReplicaInfo(String bpid, long blkid) } return info; } - + /** * Returns handles to the block file and its metadata file */ @@ -844,10 +810,9 @@ public ReplicaInputStreams getTmpInputStreams(ExtendedBlock b, ReplicaInfo info = getReplicaInfo(b); FsVolumeReference ref = info.getVolume().obtainReference(); try { - InputStream blockInStream = openAndSeek(info.getBlockFile(), blkOffset); + InputStream blockInStream = info.getDataInputStream(blkOffset); try { - InputStream metaInStream = - openAndSeek(info.getMetaFile(), metaOffset); + InputStream metaInStream = info.getMetadataInputStream(metaOffset); return new ReplicaInputStreams(blockInStream, metaInStream, ref); } catch (IOException e) { IOUtils.cleanup(null, blockInStream); @@ -860,41 +825,27 @@ public ReplicaInputStreams getTmpInputStreams(ExtendedBlock b, } } - private static FileInputStream openAndSeek(File file, long offset) - throws IOException { - RandomAccessFile raf = null; - try { - raf = new RandomAccessFile(file, "r"); - if (offset > 0) { - raf.seek(offset); - } - return new FileInputStream(raf.getFD()); - } catch(IOException ioe) { - IOUtils.cleanup(null, raf); - throw ioe; - } - } - - static File moveBlockFiles(Block b, File srcfile, File destdir) + static File moveBlockFiles(Block b, ReplicaInfo replicaInfo, File destdir) throws IOException { final File dstfile = new File(destdir, b.getBlockName()); - final File srcmeta = FsDatasetUtil.getMetaFile(srcfile, b.getGenerationStamp()); final File dstmeta = FsDatasetUtil.getMetaFile(dstfile, b.getGenerationStamp()); try { - 
NativeIO.renameTo(srcmeta, dstmeta); + replicaInfo.renameMeta(dstmeta.toURI()); } catch (IOException e) { throw new IOException("Failed to move meta file for " + b - + " from " + srcmeta + " to " + dstmeta, e); + + " from " + replicaInfo.getMetadataURI() + " to " + dstmeta, e); } try { - NativeIO.renameTo(srcfile, dstfile); + replicaInfo.renameData(dstfile.toURI()); } catch (IOException e) { throw new IOException("Failed to move block file for " + b - + " from " + srcfile + " to " + dstfile.getAbsolutePath(), e); + + " from " + replicaInfo.getBlockURI() + " to " + + dstfile.getAbsolutePath(), e); } if (LOG.isDebugEnabled()) { - LOG.debug("addFinalizedBlock: Moved " + srcmeta + " to " + dstmeta - + " and " + srcfile + " to " + dstfile); + LOG.debug("addFinalizedBlock: Moved " + replicaInfo.getMetadataURI() + + " to " + dstmeta + " and " + replicaInfo.getBlockURI() + + " to " + dstfile); } return dstfile; } @@ -904,41 +855,44 @@ static File moveBlockFiles(Block b, File srcfile, File destdir) * @return the new meta and block files. * @throws IOException */ - static File[] copyBlockFiles(long blockId, long genStamp, File srcMeta, - File srcFile, File destRoot, boolean calculateChecksum, + static File[] copyBlockFiles(long blockId, long genStamp, + ReplicaInfo srcReplica, File destRoot, boolean calculateChecksum, int smallBufferSize, final Configuration conf) throws IOException { final File destDir = DatanodeUtil.idToBlockDir(destRoot, blockId); - final File dstFile = new File(destDir, srcFile.getName()); + // blockName is same as the filename for the block + final File dstFile = new File(destDir, srcReplica.getBlockName()); final File dstMeta = FsDatasetUtil.getMetaFile(dstFile, genStamp); - return copyBlockFiles(srcMeta, srcFile, dstMeta, dstFile, calculateChecksum, + return copyBlockFiles(srcReplica, dstMeta, dstFile, calculateChecksum, smallBufferSize, conf); } - static File[] copyBlockFiles(File srcMeta, File srcFile, File dstMeta, + static File[] copyBlockFiles(ReplicaInfo srcReplica, File dstMeta, File dstFile, boolean calculateChecksum, int smallBufferSize, final Configuration conf) throws IOException { + if (calculateChecksum) { - computeChecksum(srcMeta, dstMeta, srcFile, smallBufferSize, conf); + computeChecksum(srcReplica, dstMeta, smallBufferSize, conf); } else { try { - Storage.nativeCopyFileUnbuffered(srcMeta, dstMeta, true); + srcReplica.copyMetadata(dstMeta.toURI()); } catch (IOException e) { - throw new IOException("Failed to copy " + srcMeta + " to " + dstMeta, e); + throw new IOException("Failed to copy " + srcReplica + " metadata to " + + dstMeta, e); } } - try { - Storage.nativeCopyFileUnbuffered(srcFile, dstFile, true); + srcReplica.copyBlockdata(dstFile.toURI()); } catch (IOException e) { - throw new IOException("Failed to copy " + srcFile + " to " + dstFile, e); + throw new IOException("Failed to copy " + srcReplica + " block file to " + + dstFile, e); } if (LOG.isDebugEnabled()) { if (calculateChecksum) { - LOG.debug("Copied " + srcMeta + " to " + dstMeta - + " and calculated checksum"); + LOG.debug("Copied " + srcReplica.getMetadataURI() + " meta to " + + dstMeta + " and calculated checksum"); } else { - LOG.debug("Copied " + srcFile + " to " + dstFile); + LOG.debug("Copied " + srcReplica.getBlockURI() + " to " + dstFile); } } return new File[] {dstMeta, dstFile}; @@ -1002,18 +956,21 @@ public ReplicaInfo moveBlockAcrossStorage(ExtendedBlock block, private ReplicaInfo moveBlock(ExtendedBlock block, ReplicaInfo replicaInfo, FsVolumeReference volumeRef) throws 
IOException { - File oldBlockFile = replicaInfo.getBlockFile(); - File oldMetaFile = replicaInfo.getMetaFile(); + FsVolumeImpl targetVolume = (FsVolumeImpl) volumeRef.getVolume(); // Copy files to temp dir first File[] blockFiles = copyBlockFiles(block.getBlockId(), - block.getGenerationStamp(), oldMetaFile, oldBlockFile, + block.getGenerationStamp(), replicaInfo, targetVolume.getTmpDir(block.getBlockPoolId()), replicaInfo.isOnTransientStorage(), smallBufferSize, conf); - ReplicaInfo newReplicaInfo = new ReplicaInPipeline( - replicaInfo.getBlockId(), replicaInfo.getGenerationStamp(), - targetVolume, blockFiles[0].getParentFile(), 0); + ReplicaInfo newReplicaInfo = new ReplicaBuilder(ReplicaState.TEMPORARY) + .setBlockId(replicaInfo.getBlockId()) + .setGenerationStamp(replicaInfo.getGenerationStamp()) + .setFsVolume(targetVolume) + .setDirectoryToUse(blockFiles[0].getParentFile()) + .setBytesToReserve(0) + .build(); newReplicaInfo.setNumBytes(blockFiles[1].length()); // Finalize the copied files newReplicaInfo = finalizeReplica(block.getBlockPoolId(), newReplicaInfo); @@ -1023,8 +980,7 @@ private ReplicaInfo moveBlock(ExtendedBlock block, ReplicaInfo replicaInfo, volume.getBlockPoolSlice(block.getBlockPoolId()).incrNumBlocks(); } - removeOldReplica(replicaInfo, newReplicaInfo, oldBlockFile, oldMetaFile, - oldBlockFile.length(), oldMetaFile.length(), block.getBlockPoolId()); + removeOldReplica(replicaInfo, newReplicaInfo, block.getBlockPoolId()); return newReplicaInfo; } @@ -1065,16 +1021,18 @@ public ReplicaInfo moveBlockAcrossVolumes(ExtendedBlock block, FsVolumeSpi * Compute and store the checksum for a block file that does not already have * its checksum computed. * - * @param srcMeta source meta file, containing only the checksum header, not a - * calculated checksum + * @param srcReplica source {@link ReplicaInfo}, containing only the checksum + * header, not a calculated checksum * @param dstMeta destination meta file, into which this method will write a * full computed checksum - * @param blockFile block file for which the checksum will be computed + * @param smallBufferSize buffer size to use + * @param conf the {@link Configuration} * @throws IOException */ - private static void computeChecksum(File srcMeta, File dstMeta, - File blockFile, int smallBufferSize, final Configuration conf) + private static void computeChecksum(ReplicaInfo srcReplica, File dstMeta, + int smallBufferSize, final Configuration conf) throws IOException { + File srcMeta = new File(srcReplica.getMetadataURI()); final DataChecksum checksum = BlockMetadataHeader.readDataChecksum(srcMeta, DFSUtilClient.getIoFileBufferSize(conf)); final byte[] data = new byte[1 << 16]; @@ -1094,9 +1052,7 @@ private static void computeChecksum(File srcMeta, File dstMeta, BlockMetadataHeader.writeHeader(metaOut, checksum); int offset = 0; - try (InputStream dataIn = isNativeIOAvailable ? 
- NativeIO.getShareDeleteFileInputStream(blockFile) : - new FileInputStream(blockFile)) { + try (InputStream dataIn = srcReplica.getDataInputStream(0)) { for (int n; (n = dataIn.read(data, offset, data.length - offset)) != -1; ) { if (n > 0) { @@ -1118,58 +1074,7 @@ private static void computeChecksum(File srcMeta, File dstMeta, checksum.calculateChunkedSums(data, 0, offset, crcs, 0); metaOut.write(crcs, 0, 4); } finally { - IOUtils.cleanup(LOG, metaOut); - } - } - - static private void truncateBlock(File blockFile, File metaFile, - long oldlen, long newlen) throws IOException { - LOG.info("truncateBlock: blockFile=" + blockFile - + ", metaFile=" + metaFile - + ", oldlen=" + oldlen - + ", newlen=" + newlen); - - if (newlen == oldlen) { - return; - } - if (newlen > oldlen) { - throw new IOException("Cannot truncate block to from oldlen (=" + oldlen - + ") to newlen (=" + newlen + ")"); - } - - DataChecksum dcs = BlockMetadataHeader.readHeader(metaFile).getChecksum(); - int checksumsize = dcs.getChecksumSize(); - int bpc = dcs.getBytesPerChecksum(); - long n = (newlen - 1)/bpc + 1; - long newmetalen = BlockMetadataHeader.getHeaderSize() + n*checksumsize; - long lastchunkoffset = (n - 1)*bpc; - int lastchunksize = (int)(newlen - lastchunkoffset); - byte[] b = new byte[Math.max(lastchunksize, checksumsize)]; - - RandomAccessFile blockRAF = new RandomAccessFile(blockFile, "rw"); - try { - //truncate blockFile - blockRAF.setLength(newlen); - - //read last chunk - blockRAF.seek(lastchunkoffset); - blockRAF.readFully(b, 0, lastchunksize); - } finally { - blockRAF.close(); - } - - //compute checksum - dcs.update(b, 0, lastchunksize); - dcs.writeValue(b, 0, false); - - //update metaFile - RandomAccessFile metaRAF = new RandomAccessFile(metaFile, "rw"); - try { - metaRAF.setLength(newmetalen); - metaRAF.seek(newmetalen - checksumsize); - metaRAF.write(b, 0, checksumsize); - } finally { - metaRAF.close(); + IOUtils.cleanup(null, metaOut); } } @@ -1202,10 +1107,9 @@ public ReplicaHandler append(ExtendedBlock b, } FsVolumeReference ref = replicaInfo.getVolume().obtainReference(); - ReplicaBeingWritten replica = null; + ReplicaInPipeline replica = null; try { - replica = append(b.getBlockPoolId(), - (FinalizedReplica) replicaInfo, newGS, + replica = append(b.getBlockPoolId(), replicaInfo, newGS, b.getNumBytes()); } catch (IOException e) { IOUtils.cleanup(null, ref); @@ -1227,70 +1131,38 @@ public ReplicaHandler append(ExtendedBlock b, * @throws IOException if moving the replica from finalized directory * to rbw directory fails */ - private ReplicaBeingWritten append(String bpid, - FinalizedReplica replicaInfo, long newGS, long estimateBlockLen) + private ReplicaInPipeline append(String bpid, + ReplicaInfo replicaInfo, long newGS, long estimateBlockLen) throws IOException { try (AutoCloseableLock lock = datasetLock.acquire()) { + // Appending is only allowed to a FINALIZED replica. + if (replicaInfo.getState() != ReplicaState.FINALIZED) { + throw new IOException("Only a Finalized replica can be appended to; " + + "Replica with blk id " + replicaInfo.getBlockId() + " has state " + + replicaInfo.getState()); + } // If the block is cached, start uncaching it. cacheManager.uncacheBlock(bpid, replicaInfo.getBlockId()); - // If there are any hardlinks to the block, break them. This ensures we - // are not appending to a file that is part of a previous/ directory. + // If there are any hardlinks to the block, break them. This ensures + // we are not appending to a file that is part of a previous/ directory. 
replicaInfo.breakHardLinksIfNeeded(); - // construct a RBW replica with the new GS - File blkfile = replicaInfo.getBlockFile(); - FsVolumeImpl v = (FsVolumeImpl) replicaInfo.getVolume(); - long bytesReserved = estimateBlockLen - replicaInfo.getNumBytes(); - if (v.getAvailable() < bytesReserved) { - throw new DiskOutOfSpaceException("Insufficient space for appending to " - + replicaInfo); + FsVolumeImpl v = (FsVolumeImpl)replicaInfo.getVolume(); + ReplicaInPipeline rip = v.append(bpid, replicaInfo, + newGS, estimateBlockLen); + if (rip.getReplicaInfo().getState() != ReplicaState.RBW) { + throw new IOException("Append on block " + replicaInfo.getBlockId() + + " returned a replica of state " + rip.getReplicaInfo().getState() + + "; expected RBW"); } - File newBlkFile = new File(v.getRbwDir(bpid), replicaInfo.getBlockName()); - File oldmeta = replicaInfo.getMetaFile(); - ReplicaBeingWritten newReplicaInfo = new ReplicaBeingWritten( - replicaInfo.getBlockId(), replicaInfo.getNumBytes(), newGS, - v, newBlkFile.getParentFile(), Thread.currentThread(), bytesReserved); - File newmeta = newReplicaInfo.getMetaFile(); - - // rename meta file to rbw directory - if (LOG.isDebugEnabled()) { - LOG.debug("Renaming " + oldmeta + " to " + newmeta); - } - try { - NativeIO.renameTo(oldmeta, newmeta); - } catch (IOException e) { - throw new IOException("Block " + replicaInfo + " reopen failed. " + - " Unable to move meta file " + oldmeta + - " to rbw dir " + newmeta, e); - } - - // rename block file to rbw directory - if (LOG.isDebugEnabled()) { - LOG.debug("Renaming " + blkfile + " to " + newBlkFile - + ", file length=" + blkfile.length()); - } - try { - NativeIO.renameTo(blkfile, newBlkFile); - } catch (IOException e) { - try { - NativeIO.renameTo(newmeta, oldmeta); - } catch (IOException ex) { - LOG.warn("Cannot move meta file " + newmeta + - "back to the finalized directory " + oldmeta, ex); - } - throw new IOException("Block " + replicaInfo + " reopen failed. " + - " Unable to move block file " + blkfile + - " to rbw dir " + newBlkFile, e); - } - // Replace finalized replica by a RBW replica in replicas map - volumeMap.add(bpid, newReplicaInfo); - v.reserveSpaceForReplica(bytesReserved); - return newReplicaInfo; + volumeMap.add(bpid, rip.getReplicaInfo()); + return rip; } } + @SuppressWarnings("serial") private static class MustStopExistingWriter extends Exception { private final ReplicaInPipeline rip; @@ -1298,7 +1170,7 @@ private static class MustStopExistingWriter extends Exception { this.rip = rip; } - ReplicaInPipeline getReplica() { + ReplicaInPipeline getReplicaInPipeline() { return rip; } } @@ -1327,7 +1199,7 @@ private ReplicaInfo recoverCheck(ExtendedBlock b, long newGS, // stop the previous writer before check a replica's length long replicaLen = replicaInfo.getNumBytes(); if (replicaInfo.getState() == ReplicaState.RBW) { - ReplicaBeingWritten rbw = (ReplicaBeingWritten)replicaInfo; + ReplicaInPipeline rbw = (ReplicaInPipeline) replicaInfo; if (!rbw.attemptToSetWriter(null, Thread.currentThread())) { throw new MustStopExistingWriter(rbw); } @@ -1360,17 +1232,16 @@ public ReplicaHandler recoverAppend( try { try (AutoCloseableLock lock = datasetLock.acquire()) { ReplicaInfo replicaInfo = recoverCheck(b, newGS, expectedBlockLen); - FsVolumeReference ref = replicaInfo.getVolume().obtainReference(); - ReplicaBeingWritten replica; + ReplicaInPipeline replica; try { // change the replica's state/gs etc. 
if (replicaInfo.getState() == ReplicaState.FINALIZED) { - replica = append(b.getBlockPoolId(), (FinalizedReplica) replicaInfo, + replica = append(b.getBlockPoolId(), replicaInfo, newGS, b.getNumBytes()); } else { //RBW - bumpReplicaGS(replicaInfo, newGS); - replica = (ReplicaBeingWritten) replicaInfo; + replicaInfo.bumpReplicaGS(newGS); + replica = (ReplicaInPipeline) replicaInfo; } } catch (IOException e) { IOUtils.cleanup(null, ref); @@ -1379,7 +1250,8 @@ public ReplicaHandler recoverAppend( return new ReplicaHandler(replica, ref); } } catch (MustStopExistingWriter e) { - e.getReplica().stopWriter(datanode.getDnConf().getXceiverStopTimeout()); + e.getReplicaInPipeline() + .stopWriter(datanode.getDnConf().getXceiverStopTimeout()); } } } @@ -1394,7 +1266,7 @@ public Replica recoverClose(ExtendedBlock b, long newGS, // check replica's state ReplicaInfo replicaInfo = recoverCheck(b, newGS, expectedBlockLen); // bump the replica's GS - bumpReplicaGS(replicaInfo, newGS); + replicaInfo.bumpReplicaGS(newGS); // finalize the replica if RBW if (replicaInfo.getState() == ReplicaState.RBW) { finalizeReplica(b.getBlockPoolId(), replicaInfo); @@ -1402,40 +1274,12 @@ public Replica recoverClose(ExtendedBlock b, long newGS, return replicaInfo; } } catch (MustStopExistingWriter e) { - e.getReplica().stopWriter(datanode.getDnConf().getXceiverStopTimeout()); + e.getReplicaInPipeline() + .stopWriter(datanode.getDnConf().getXceiverStopTimeout()); } } } - /** - * Bump a replica's generation stamp to a new one. - * Its on-disk meta file name is renamed to be the new one too. - * - * @param replicaInfo a replica - * @param newGS new generation stamp - * @throws IOException if rename fails - */ - private void bumpReplicaGS(ReplicaInfo replicaInfo, - long newGS) throws IOException { - long oldGS = replicaInfo.getGenerationStamp(); - File oldmeta = replicaInfo.getMetaFile(); - replicaInfo.setGenerationStamp(newGS); - File newmeta = replicaInfo.getMetaFile(); - - // rename meta file to new GS - if (LOG.isDebugEnabled()) { - LOG.debug("Renaming " + oldmeta + " to " + newmeta); - } - try { - NativeIO.renameTo(oldmeta, newmeta); - } catch (IOException e) { - replicaInfo.setGenerationStamp(oldGS); // restore old GS - throw new IOException("Block " + replicaInfo + " reopen failed. 
" + - " Unable to move meta file " + oldmeta + - " to " + newmeta, e); - } - } - @Override // FsDatasetSpi public ReplicaHandler createRbw( StorageType storageType, ExtendedBlock b, boolean allowLazyPersist) @@ -1482,18 +1326,20 @@ public ReplicaHandler createRbw( datanode.getMetrics().incrRamDiskBlocksWriteFallback(); } - File f; + ReplicaInPipeline newReplicaInfo; try { - f = v.createRbwFile(b.getBlockPoolId(), b.getLocalBlock()); + newReplicaInfo = v.createRbw(b); + if (newReplicaInfo.getReplicaInfo().getState() != ReplicaState.RBW) { + throw new IOException("CreateRBW returned a replica of state " + + newReplicaInfo.getReplicaInfo().getState() + + " for block " + b.getBlockId()); + } } catch (IOException e) { IOUtils.cleanup(null, ref); throw e; } - ReplicaBeingWritten newReplicaInfo = - new ReplicaBeingWritten(b.getBlockId(), - b.getGenerationStamp(), v, f.getParentFile(), b.getNumBytes()); - volumeMap.add(b.getBlockPoolId(), newReplicaInfo); + volumeMap.add(b.getBlockPoolId(), newReplicaInfo.getReplicaInfo()); return new ReplicaHandler(newReplicaInfo, ref); } } @@ -1507,14 +1353,14 @@ public ReplicaHandler recoverRbw( while (true) { try { try (AutoCloseableLock lock = datasetLock.acquire()) { - ReplicaInfo replicaInfo = getReplicaInfo(b.getBlockPoolId(), b.getBlockId()); - + ReplicaInfo replicaInfo = + getReplicaInfo(b.getBlockPoolId(), b.getBlockId()); // check the replica's state if (replicaInfo.getState() != ReplicaState.RBW) { throw new ReplicaNotFoundException( ReplicaNotFoundException.NON_RBW_REPLICA + replicaInfo); } - ReplicaBeingWritten rbw = (ReplicaBeingWritten)replicaInfo; + ReplicaInPipeline rbw = (ReplicaInPipeline)replicaInfo; if (!rbw.attemptToSetWriter(null, Thread.currentThread())) { throw new MustStopExistingWriter(rbw); } @@ -1522,12 +1368,13 @@ public ReplicaHandler recoverRbw( return recoverRbwImpl(rbw, b, newGS, minBytesRcvd, maxBytesRcvd); } } catch (MustStopExistingWriter e) { - e.getReplica().stopWriter(datanode.getDnConf().getXceiverStopTimeout()); + e.getReplicaInPipeline().stopWriter( + datanode.getDnConf().getXceiverStopTimeout()); } } } - private ReplicaHandler recoverRbwImpl(ReplicaBeingWritten rbw, + private ReplicaHandler recoverRbwImpl(ReplicaInPipeline rbw, ExtendedBlock b, long newGS, long minBytesRcvd, long maxBytesRcvd) throws IOException { try (AutoCloseableLock lock = datasetLock.acquire()) { @@ -1551,20 +1398,20 @@ private ReplicaHandler recoverRbwImpl(ReplicaBeingWritten rbw, minBytesRcvd + ", " + maxBytesRcvd + "]."); } - FsVolumeReference ref = rbw.getVolume().obtainReference(); + FsVolumeReference ref = rbw.getReplicaInfo() + .getVolume().obtainReference(); try { // Truncate the potentially corrupt portion. // If the source was client and the last node in the pipeline was lost, // any corrupt data written after the acked length can go unnoticed. 
if (numBytes > bytesAcked) { - final File replicafile = rbw.getBlockFile(); - truncateBlock(replicafile, rbw.getMetaFile(), numBytes, bytesAcked); + rbw.getReplicaInfo().truncateBlock(bytesAcked); rbw.setNumBytes(bytesAcked); rbw.setLastChecksumAndDataLen(bytesAcked, null); } // bump the replica's generation stamp to newGS - bumpReplicaGS(rbw, newGS); + rbw.getReplicaInfo().bumpReplicaGS(newGS); } catch (IOException e) { IOUtils.cleanup(null, ref); throw e; @@ -1576,6 +1423,7 @@ private ReplicaHandler recoverRbwImpl(ReplicaBeingWritten rbw, @Override // FsDatasetSpi public ReplicaInPipeline convertTemporaryToRbw( final ExtendedBlock b) throws IOException { + try (AutoCloseableLock lock = datasetLock.acquire()) { final long blockId = b.getBlockId(); final long expectedGs = b.getGenerationStamp(); @@ -1583,21 +1431,21 @@ public ReplicaInPipeline convertTemporaryToRbw( LOG.info("Convert " + b + " from Temporary to RBW, visible length=" + visible); - final ReplicaInPipeline temp; - - // get replica - final ReplicaInfo r = volumeMap.get(b.getBlockPoolId(), blockId); - if (r == null) { - throw new ReplicaNotFoundException( - ReplicaNotFoundException.NON_EXISTENT_REPLICA + b); + final ReplicaInfo temp; + { + // get replica + final ReplicaInfo r = volumeMap.get(b.getBlockPoolId(), blockId); + if (r == null) { + throw new ReplicaNotFoundException( + ReplicaNotFoundException.NON_EXISTENT_REPLICA + b); + } + // check the replica's state + if (r.getState() != ReplicaState.TEMPORARY) { + throw new ReplicaAlreadyExistsException( + "r.getState() != ReplicaState.TEMPORARY, r=" + r); + } + temp = r; } - // check the replica's state - if (r.getState() != ReplicaState.TEMPORARY) { - throw new ReplicaAlreadyExistsException( - "r.getState() != ReplicaState.TEMPORARY, r=" + r); - } - temp = (ReplicaInPipeline) r; - // check generation stamp if (temp.getGenerationStamp() != expectedGs) { throw new ReplicaAlreadyExistsException( @@ -1621,17 +1469,15 @@ public ReplicaInPipeline convertTemporaryToRbw( throw new IOException("r.getVolume() = null, temp=" + temp); } - // move block files to the rbw directory - BlockPoolSlice bpslice = v.getBlockPoolSlice(b.getBlockPoolId()); - final File dest = moveBlockFiles(b.getLocalBlock(), temp.getBlockFile(), - bpslice.getRbwDir()); - // create RBW - final ReplicaBeingWritten rbw = new ReplicaBeingWritten( - blockId, numBytes, expectedGs, - v, dest.getParentFile(), Thread.currentThread(), 0); - rbw.setBytesAcked(visible); + final ReplicaInPipeline rbw = v.convertTemporaryToRbw(b, temp); + + if(rbw.getState() != ReplicaState.RBW) { + throw new IOException("Expected replica state: " + ReplicaState.RBW + + " obtained " + rbw.getState() + " for converting block " + + b.getBlockId()); + } // overwrite the RBW in the volume map - volumeMap.add(b.getBlockPoolId(), rbw); + volumeMap.add(b.getBlockPoolId(), rbw.getReplicaInfo()); return rbw; } } @@ -1653,22 +1499,20 @@ public ReplicaHandler createTemporary( FsVolumeReference ref = volumes.getNextVolume(storageType, b.getNumBytes()); FsVolumeImpl v = (FsVolumeImpl) ref.getVolume(); - // create a temporary file to hold block in the designated volume - File f; + ReplicaInPipeline newReplicaInfo; try { - f = v.createTmpFile(b.getBlockPoolId(), b.getLocalBlock()); + newReplicaInfo = v.createTemporary(b); } catch (IOException e) { IOUtils.cleanup(null, ref); throw e; } - ReplicaInPipeline newReplicaInfo = - new ReplicaInPipeline(b.getBlockId(), b.getGenerationStamp(), v, - f.getParentFile(), b.getLocalBlock().getNumBytes()); - 
volumeMap.add(b.getBlockPoolId(), newReplicaInfo); + + volumeMap.add(b.getBlockPoolId(), newReplicaInfo.getReplicaInfo()); return new ReplicaHandler(newReplicaInfo, ref); } else { - if (!(currentReplicaInfo.getGenerationStamp() < b - .getGenerationStamp() && currentReplicaInfo instanceof ReplicaInPipeline)) { + if (!(currentReplicaInfo.getGenerationStamp() < b.getGenerationStamp() + && (currentReplicaInfo.getState() == ReplicaState.TEMPORARY + || currentReplicaInfo.getState() == ReplicaState.RBW))) { throw new ReplicaAlreadyExistsException("Block " + b + " already exists in state " + currentReplicaInfo.getState() + " and thus cannot be created."); @@ -1687,8 +1531,7 @@ public ReplicaHandler createTemporary( } // Stop the previous writer - ((ReplicaInPipeline) lastFoundReplicaInfo) - .stopWriter(writerStopTimeoutMs); + ((ReplicaInPipeline)lastFoundReplicaInfo).stopWriter(writerStopTimeoutMs); } while (true); } @@ -1737,29 +1580,23 @@ public void finalizeBlock(ExtendedBlock b) throws IOException { finalizeReplica(b.getBlockPoolId(), replicaInfo); } } - - private FinalizedReplica finalizeReplica(String bpid, + + private ReplicaInfo finalizeReplica(String bpid, ReplicaInfo replicaInfo) throws IOException { try (AutoCloseableLock lock = datasetLock.acquire()) { - FinalizedReplica newReplicaInfo = null; + ReplicaInfo newReplicaInfo = null; if (replicaInfo.getState() == ReplicaState.RUR && - ((ReplicaUnderRecovery) replicaInfo).getOriginalReplica().getState() - == ReplicaState.FINALIZED) { - newReplicaInfo = (FinalizedReplica) - ((ReplicaUnderRecovery) replicaInfo).getOriginalReplica(); + replicaInfo.getOriginalReplica().getState() + == ReplicaState.FINALIZED) { + newReplicaInfo = replicaInfo.getOriginalReplica(); } else { - FsVolumeImpl v = (FsVolumeImpl) replicaInfo.getVolume(); - File f = replicaInfo.getBlockFile(); + FsVolumeImpl v = (FsVolumeImpl)replicaInfo.getVolume(); if (v == null) { - throw new IOException("No volume for temporary file " + f + - " for block " + replicaInfo); + throw new IOException("No volume for block " + replicaInfo); } - File dest = v.addFinalizedBlock( - bpid, replicaInfo, f, replicaInfo.getBytesReserved()); - newReplicaInfo = - new FinalizedReplica(replicaInfo, v, dest.getParentFile()); - + newReplicaInfo = v.addFinalizedBlock( + bpid, replicaInfo, replicaInfo, replicaInfo.getBytesReserved()); if (v.isTransientStorage()) { releaseLockedMemory( replicaInfo.getOriginalBytesReserved() @@ -1770,8 +1607,9 @@ private FinalizedReplica finalizeReplica(String bpid, datanode.getMetrics().addRamDiskBytesWrite(replicaInfo.getNumBytes()); } } + assert newReplicaInfo.getState() == ReplicaState.FINALIZED + : "Replica should be finalized"; volumeMap.add(bpid, newReplicaInfo); - return newReplicaInfo; } } @@ -1784,14 +1622,13 @@ public void unfinalizeBlock(ExtendedBlock b) throws IOException { try (AutoCloseableLock lock = datasetLock.acquire()) { ReplicaInfo replicaInfo = volumeMap.get(b.getBlockPoolId(), b.getLocalBlock()); - if (replicaInfo != null - && replicaInfo.getState() == ReplicaState.TEMPORARY) { + if (replicaInfo != null && + replicaInfo.getState() == ReplicaState.TEMPORARY) { // remove from volumeMap volumeMap.remove(b.getBlockPoolId(), b.getLocalBlock()); // delete the on-disk temp file - if (delBlockFromDisk(replicaInfo.getBlockFile(), - replicaInfo.getMetaFile(), b.getLocalBlock())) { + if (delBlockFromDisk(replicaInfo)) { LOG.warn("Block " + b + " unfinalized and removed. 
"); } if (replicaInfo.getVolume().isTransientStorage()) { @@ -1804,23 +1641,17 @@ public void unfinalizeBlock(ExtendedBlock b) throws IOException { /** * Remove a block from disk - * @param blockFile block file - * @param metaFile block meta file - * @param b a block - * @return true if on-disk files are deleted; false otherwise + * @param info the replica that needs to be deleted + * @return true if data for the replica are deleted; false otherwise */ - private boolean delBlockFromDisk(File blockFile, File metaFile, Block b) { - if (blockFile == null) { - LOG.warn("No file exists for block: " + b); - return true; - } + private boolean delBlockFromDisk(ReplicaInfo info) { - if (!blockFile.delete()) { - LOG.warn("Not able to delete the block file: " + blockFile); + if (!info.deleteBlockData()) { + LOG.warn("Not able to delete the block data for replica " + info); return false; } else { // remove the meta file - if (metaFile != null && !metaFile.delete()) { - LOG.warn("Not able to delete the meta block file: " + metaFile); + if (!info.deleteMetadata()) { + LOG.warn("Not able to delete the meta data for replica " + info); return false; } } @@ -1859,20 +1690,19 @@ public Map getBlockReports(String bpid) { continue; } switch(b.getState()) { - case FINALIZED: - case RBW: - case RWR: - builders.get(b.getVolume().getStorageID()).add(b); - break; - case RUR: - ReplicaUnderRecovery rur = (ReplicaUnderRecovery)b; - builders.get(rur.getVolume().getStorageID()) - .add(rur.getOriginalReplica()); - break; - case TEMPORARY: - break; - default: - assert false : "Illegal ReplicaInfo state."; + case FINALIZED: + case RBW: + case RWR: + builders.get(b.getVolume().getStorageID()).add(b); + break; + case RUR: + ReplicaInfo orig = b.getOriginalReplica(); + builders.get(b.getVolume().getStorageID()).add(orig); + break; + case TEMPORARY: + break; + default: + assert false : "Illegal ReplicaInfo state."; } } } @@ -1889,13 +1719,14 @@ public Map getBlockReports(String bpid) { * Get the list of finalized blocks from in-memory blockmap for a block pool. */ @Override - public List getFinalizedBlocks(String bpid) { + public List getFinalizedBlocks(String bpid) { try (AutoCloseableLock lock = datasetLock.acquire()) { - ArrayList finalized = - new ArrayList(volumeMap.size(bpid)); + ArrayList finalized = + new ArrayList(volumeMap.size(bpid)); for (ReplicaInfo b : volumeMap.replicas(bpid)) { if (b.getState() == ReplicaState.FINALIZED) { - finalized.add(new FinalizedReplica((FinalizedReplica) b)); + finalized.add(new ReplicaBuilder(ReplicaState.FINALIZED) + .from(b).build()); } } return finalized; @@ -1906,15 +1737,16 @@ public List getFinalizedBlocks(String bpid) { * Get the list of finalized blocks from in-memory blockmap for a block pool. 
*/ @Override - public List getFinalizedBlocksOnPersistentStorage( + public List getFinalizedBlocksOnPersistentStorage( String bpid) { try (AutoCloseableLock lock = datasetLock.acquire()) { - ArrayList finalized = - new ArrayList(volumeMap.size(bpid)); + ArrayList finalized = + new ArrayList(volumeMap.size(bpid)); for (ReplicaInfo b : volumeMap.replicas(bpid)) { - if (!b.getVolume().isTransientStorage() && + if(!b.getVolume().isTransientStorage() && b.getState() == ReplicaState.FINALIZED) { - finalized.add(new FinalizedReplica((FinalizedReplica) b)); + finalized.add(new ReplicaBuilder(ReplicaState.FINALIZED) + .from(b).build()); } } return finalized; @@ -1951,8 +1783,8 @@ public void checkBlock(ExtendedBlock b, long minLength, ReplicaState state) if (replicaInfo.getState() != state) { throw new UnexpectedReplicaStateException(b,state); } - if (!replicaInfo.getBlockFile().exists()) { - throw new FileNotFoundException(replicaInfo.getBlockFile().getPath()); + if (!replicaInfo.blockDataExists()) { + throw new FileNotFoundException(replicaInfo.getBlockURI().toString()); } long onDiskLength = getLength(b); if (onDiskLength < minLength) { @@ -1991,46 +1823,44 @@ private boolean isValid(final ExtendedBlock b, final ReplicaState state) { /** * Find the file corresponding to the block and return it if it exists. */ - File validateBlockFile(String bpid, long blockId) { + ReplicaInfo validateBlockFile(String bpid, long blockId) { //Should we check for metadata file too? - final File f; + final ReplicaInfo r; try (AutoCloseableLock lock = datasetLock.acquire()) { - f = getFile(bpid, blockId, false); + r = volumeMap.get(bpid, blockId); } - - if(f != null ) { - if(f.exists()) - return f; - + + if (r != null) { + if (r.blockDataExists()) { + return r; + } // if file is not null, but doesn't exist - possibly disk failed datanode.checkDiskErrorAsync(); } - + if (LOG.isDebugEnabled()) { - LOG.debug("blockId=" + blockId + ", f=" + f); + LOG.debug("blockId=" + blockId + ", replica=" + r); } return null; } /** Check the files of a replica. */ static void checkReplicaFiles(final ReplicaInfo r) throws IOException { - //check replica's file - final File f = r.getBlockFile(); - if (!f.exists()) { - throw new FileNotFoundException("File " + f + " not found, r=" + r); + //check replica's data exists + if (!r.blockDataExists()) { + throw new FileNotFoundException("Block data not found, r=" + r); } - if (r.getBytesOnDisk() != f.length()) { - throw new IOException("File length mismatched. 
The length of " - + f + " is " + f.length() + " but r=" + r); + if (r.getBytesOnDisk() != r.getBlockDataLength()) { + throw new IOException("Block length mismatch, len=" + + r.getBlockDataLength() + " but r=" + r); } //check replica's meta file - final File metafile = FsDatasetUtil.getMetaFile(f, r.getGenerationStamp()); - if (!metafile.exists()) { - throw new IOException("Metafile " + metafile + " does not exist, r=" + r); + if (!r.metadataExists()) { + throw new IOException(r.getMetadataURI() + " does not exist, r=" + r); } - if (metafile.length() == 0) { - throw new IOException("Metafile " + metafile + " is empty, r=" + r); + if (r.getMetadataLength() == 0) { + throw new IOException("Metafile is empty, r=" + r); } } @@ -2041,7 +1871,7 @@ static void checkReplicaFiles(final ReplicaInfo r) throws IOException { public void invalidate(String bpid, Block invalidBlks[]) throws IOException { final List errors = new ArrayList(); for (int i = 0; i < invalidBlks.length; i++) { - final File f; + final ReplicaInfo removing; final FsVolumeImpl v; try (AutoCloseableLock lock = datasetLock.acquire()) { final ReplicaInfo info = volumeMap.get(bpid, invalidBlks[i]); @@ -2056,27 +1886,31 @@ public void invalidate(String bpid, Block invalidBlks[]) throws IOException { + ": GenerationStamp not matched, info=" + info); continue; } - f = info.getBlockFile(); v = (FsVolumeImpl)info.getVolume(); if (v == null) { errors.add("Failed to delete replica " + invalidBlks[i] - + ". No volume for this replica, file=" + f); + + ". No volume for replica " + info); continue; } - File parent = f.getParentFile(); - if (parent == null) { - errors.add("Failed to delete replica " + invalidBlks[i] - + ". Parent not found for file " + f); - continue; + try { + File blockFile = new File(info.getBlockURI()); + if (blockFile != null && blockFile.getParentFile() == null) { + errors.add("Failed to delete replica " + invalidBlks[i] + + ". Parent not found for block file: " + blockFile); + continue; + } + } catch(IllegalArgumentException e) { + LOG.warn("Parent directory check failed; replica " + info + + " is not backed by a local file"); } - ReplicaInfo removing = volumeMap.remove(bpid, invalidBlks[i]); + removing = volumeMap.remove(bpid, invalidBlks[i]); addDeletingBlock(bpid, removing.getBlockId()); if (LOG.isDebugEnabled()) { - LOG.debug("Block file " + removing.getBlockFile().getName() + LOG.debug("Block file " + removing.getBlockURI() + " is to be deleted"); } - if (removing instanceof ReplicaInPipelineInterface) { - ((ReplicaInPipelineInterface) removing).releaseAllBytesReserved(); + if (removing instanceof ReplicaInPipeline) { + ((ReplicaInPipeline) removing).releaseAllBytesReserved(); } } @@ -2104,10 +1938,9 @@ public void invalidate(String bpid, Block invalidBlks[]) throws IOException { // It's ok to unlink the block file before the uncache operation // finishes. 
try { - asyncDiskService.deleteAsync(v.obtainReference(), f, - FsDatasetUtil.getMetaFile(f, invalidBlks[i].getGenerationStamp()), + asyncDiskService.deleteAsync(v.obtainReference(), removing, new ExtendedBlock(bpid, invalidBlks[i]), - dataStorage.getTrashDirectoryForBlockFile(bpid, f)); + dataStorage.getTrashDirectoryForReplica(bpid, removing)); } catch (ClosedChannelException e) { LOG.warn("Volume " + v + " is closed, ignore the deletion task for " + "block " + invalidBlks[i]); @@ -2192,7 +2025,7 @@ private void cacheBlock(String bpid, long blockId) { cacheManager.numBlocksFailedToCache.incrementAndGet(); } } - blockFileName = info.getBlockFile().getAbsolutePath(); + blockFileName = info.getBlockURI().toString(); length = info.getVisibleLength(); genstamp = info.getGenerationStamp(); volumeExecutor = volume.getCacheExecutor(); @@ -2224,28 +2057,12 @@ public boolean isCached(String bpid, long blockId) { public boolean contains(final ExtendedBlock block) { try (AutoCloseableLock lock = datasetLock.acquire()) { final long blockId = block.getLocalBlock().getBlockId(); - return getFile(block.getBlockPoolId(), blockId, false) != null; + final String bpid = block.getBlockPoolId(); + final ReplicaInfo r = volumeMap.get(bpid, blockId); + return (r != null && r.blockDataExists()); } } - /** - * Turn the block identifier into a filename - * @param bpid Block pool Id - * @param blockId a block's id - * @return on disk data file path; null if the replica does not exist - */ - File getFile(final String bpid, final long blockId, boolean touch) { - ReplicaInfo info = volumeMap.get(bpid, blockId); - if (info != null) { - if (touch && info.getVolume().isTransientStorage()) { - ramDiskReplicaTracker.touch(bpid, blockId); - datanode.getMetrics().incrRamDiskBlocksReadHits(); - } - return info.getBlockFile(); - } - return null; - } - /** * check if a data directory is healthy * @@ -2373,7 +2190,7 @@ public void checkAndUpdate(String bpid, long blockId, File diskFile, } return; } - if (!memBlockInfo.getBlockFile().exists()) { + if (!memBlockInfo.blockDataExists()) { // Block is in memory and not on the disk // Remove the block from volumeMap volumeMap.remove(bpid, blockId); @@ -2396,8 +2213,13 @@ public void checkAndUpdate(String bpid, long blockId, File diskFile, */ if (memBlockInfo == null) { // Block is missing in memory - add the block to volumeMap - ReplicaInfo diskBlockInfo = new FinalizedReplica(blockId, - diskFile.length(), diskGS, vol, diskFile.getParentFile()); + ReplicaInfo diskBlockInfo = new ReplicaBuilder(ReplicaState.FINALIZED) + .setBlockId(blockId) + .setLength(diskFile.length()) + .setGenerationStamp(diskGS) + .setFsVolume(vol) + .setDirectoryToUse(diskFile.getParentFile()) + .build(); volumeMap.add(bpid, diskBlockInfo); if (vol.isTransientStorage()) { long lockedBytesReserved = @@ -2413,21 +2235,27 @@ public void checkAndUpdate(String bpid, long blockId, File diskFile, * Block exists in volumeMap and the block file exists on the disk */ // Compare block files - File memFile = memBlockInfo.getBlockFile(); - if (memFile.exists()) { - if (memFile.compareTo(diskFile) != 0) { + if (memBlockInfo.blockDataExists()) { + if (memBlockInfo.getBlockURI().compareTo(diskFile.toURI()) != 0) { if (diskMetaFile.exists()) { - if (memBlockInfo.getMetaFile().exists()) { + if (memBlockInfo.metadataExists()) { // We have two sets of block+meta files. Decide which one to // keep. 
- ReplicaInfo diskBlockInfo = new FinalizedReplica( - blockId, diskFile.length(), diskGS, vol, diskFile.getParentFile()); - ((FsVolumeImpl) vol).getBlockPoolSlice(bpid).resolveDuplicateReplicas( - memBlockInfo, diskBlockInfo, volumeMap); + ReplicaInfo diskBlockInfo = + new ReplicaBuilder(ReplicaState.FINALIZED) + .setBlockId(blockId) + .setLength(diskFile.length()) + .setGenerationStamp(diskGS) + .setFsVolume(vol) + .setDirectoryToUse(diskFile.getParentFile()) + .build(); + ((FsVolumeImpl) vol).getBlockPoolSlice(bpid) + .resolveDuplicateReplicas( + memBlockInfo, diskBlockInfo, volumeMap); } } else { if (!diskFile.delete()) { - LOG.warn("Failed to delete " + diskFile + ". Will retry on next scan"); + LOG.warn("Failed to delete " + diskFile); } } } @@ -2436,12 +2264,12 @@ public void checkAndUpdate(String bpid, long blockId, File diskFile, // Update the block with the file found on the disk. Since the block // file and metadata file are found as a pair on the disk, update // the block based on the metadata file found on the disk - LOG.warn("Block file in volumeMap " - + memFile.getAbsolutePath() + LOG.warn("Block file in replica " + + memBlockInfo.getBlockURI() + " does not exist. Updating it to the file found during scan " + diskFile.getAbsolutePath()); - memBlockInfo.setDir(diskFile.getParentFile()); - memFile = diskFile; + memBlockInfo.updateWithReplica( + StorageLocation.parse(diskFile.toString())); LOG.warn("Updating generation stamp for block " + blockId + " from " + memBlockInfo.getGenerationStamp() + " to " + diskGS); @@ -2463,24 +2291,31 @@ public void checkAndUpdate(String bpid, long blockId, File diskFile, // Metadata file corresponding to block in memory is missing // If metadata file found during the scan is on the same directory // as the block file, then use the generation stamp from it - long gs = diskMetaFile != null && diskMetaFile.exists() - && diskMetaFile.getParent().equals(memFile.getParent()) ? diskGS - : HdfsConstants.GRANDFATHER_GENERATION_STAMP; + try { + File memFile = new File(memBlockInfo.getBlockURI()); + long gs = diskMetaFile != null && diskMetaFile.exists() + && diskMetaFile.getParent().equals(memFile.getParent()) ? 
diskGS + : HdfsConstants.GRANDFATHER_GENERATION_STAMP; - LOG.warn("Updating generation stamp for block " + blockId - + " from " + memBlockInfo.getGenerationStamp() + " to " + gs); + LOG.warn("Updating generation stamp for block " + blockId + + " from " + memBlockInfo.getGenerationStamp() + " to " + gs); - memBlockInfo.setGenerationStamp(gs); + memBlockInfo.setGenerationStamp(gs); + } catch (IllegalArgumentException e) { + //exception arises because the URI cannot be converted to a file + LOG.warn("Block URI could not be resolved to a file", e); + } } } // Compare block size - if (memBlockInfo.getNumBytes() != memFile.length()) { + if (memBlockInfo.getNumBytes() != memBlockInfo.getBlockDataLength()) { // Update the length based on the block file corruptBlock = new Block(memBlockInfo); LOG.warn("Updating size of block " + blockId + " from " - + memBlockInfo.getNumBytes() + " to " + memFile.length()); - memBlockInfo.setNumBytes(memFile.length()); + + memBlockInfo.getNumBytes() + " to " + + memBlockInfo.getBlockDataLength()); + memBlockInfo.setNumBytes(memBlockInfo.getBlockDataLength()); } } @@ -2531,7 +2366,7 @@ static ReplicaRecoveryInfo initReplicaRecovery(String bpid, ReplicaMap map, return initReplicaRecoveryImpl(bpid, map, block, recoveryId); } } catch (MustStopExistingWriter e) { - e.getReplica().stopWriter(xceiverStopTimeout); + e.getReplicaInPipeline().stopWriter(xceiverStopTimeout); } } } @@ -2549,20 +2384,21 @@ static ReplicaRecoveryInfo initReplicaRecoveryImpl(String bpid, ReplicaMap map, } //stop writer if there is any - if (replica instanceof ReplicaInPipeline) { + if (replica.getState() == ReplicaState.TEMPORARY || + replica.getState() == ReplicaState.RBW) { final ReplicaInPipeline rip = (ReplicaInPipeline)replica; if (!rip.attemptToSetWriter(null, Thread.currentThread())) { throw new MustStopExistingWriter(rip); } //check replica bytes on disk. 
- if (rip.getBytesOnDisk() < rip.getVisibleLength()) { + if (replica.getBytesOnDisk() < replica.getVisibleLength()) { throw new IOException("THIS IS NOT SUPPOSED TO HAPPEN:" - + " getBytesOnDisk() < getVisibleLength(), rip=" + rip); + + " getBytesOnDisk() < getVisibleLength(), rip=" + replica); } //check the replica's files - checkReplicaFiles(rip); + checkReplicaFiles(replica); } //check generation stamp @@ -2580,9 +2416,9 @@ static ReplicaRecoveryInfo initReplicaRecoveryImpl(String bpid, ReplicaMap map, } //check RUR - final ReplicaUnderRecovery rur; + final ReplicaInfo rur; if (replica.getState() == ReplicaState.RUR) { - rur = (ReplicaUnderRecovery)replica; + rur = replica; if (rur.getRecoveryID() >= recoveryId) { throw new RecoveryInProgressException( "rur.getRecoveryID() >= recoveryId = " + recoveryId @@ -2594,7 +2430,8 @@ static ReplicaRecoveryInfo initReplicaRecoveryImpl(String bpid, ReplicaMap map, + " from " + oldRecoveryID + " to " + recoveryId); } else { - rur = new ReplicaUnderRecovery(replica, recoveryId); + rur = new ReplicaBuilder(ReplicaState.RUR) + .from(replica).setRecoveryId(recoveryId).build(); map.add(bpid, rur); LOG.info("initReplicaRecovery: changing replica state for " + block + " from " + replica.getState() @@ -2640,8 +2477,8 @@ public Replica updateReplicaUnderRecovery( checkReplicaFiles(replica); //update replica - final FinalizedReplica finalized = updateReplicaUnderRecovery(oldBlock - .getBlockPoolId(), (ReplicaUnderRecovery) replica, recoveryId, + final ReplicaInfo finalized = updateReplicaUnderRecovery(oldBlock + .getBlockPoolId(), replica, recoveryId, newBlockId, newlength); boolean copyTruncate = newBlockId != oldBlock.getBlockId(); @@ -2661,7 +2498,6 @@ public Replica updateReplicaUnderRecovery( + ", len=" + oldBlock.getNumBytes() + ", finalized=" + finalized; } - //check replica files after update checkReplicaFiles(finalized); @@ -2669,9 +2505,9 @@ public Replica updateReplicaUnderRecovery( } } - private FinalizedReplica updateReplicaUnderRecovery( + private ReplicaInfo updateReplicaUnderRecovery( String bpid, - ReplicaUnderRecovery rur, + ReplicaInfo rur, long recoveryId, long newBlockId, long newlength) throws IOException { @@ -2682,18 +2518,9 @@ private FinalizedReplica updateReplicaUnderRecovery( } boolean copyOnTruncate = newBlockId > 0L && rur.getBlockId() != newBlockId; - File blockFile; - File metaFile; // bump rur's GS to be recovery id if(!copyOnTruncate) { - bumpReplicaGS(rur, recoveryId); - blockFile = rur.getBlockFile(); - metaFile = rur.getMetaFile(); - } else { - File[] copiedReplicaFiles = - copyReplicaWithNewBlockIdAndGS(rur, bpid, newBlockId, recoveryId); - blockFile = copiedReplicaFiles[1]; - metaFile = copiedReplicaFiles[0]; + rur.bumpReplicaGS(recoveryId); } //update length @@ -2701,48 +2528,34 @@ private FinalizedReplica updateReplicaUnderRecovery( throw new IOException("rur.getNumBytes() < newlength = " + newlength + ", rur=" + rur); } + if (rur.getNumBytes() > newlength) { - rur.breakHardLinksIfNeeded(); - truncateBlock(blockFile, metaFile, rur.getNumBytes(), newlength); if(!copyOnTruncate) { + rur.breakHardLinksIfNeeded(); + rur.truncateBlock(newlength); // update RUR with the new length rur.setNumBytes(newlength); } else { // Copying block to a new block with new blockId. // Not truncating original block. 
- FsVolumeSpi volume = rur.getVolume(); - String blockPath = blockFile.getAbsolutePath(); - String volumePath = volume.getBasePath(); - assert blockPath.startsWith(volumePath) : - "New block file: " + blockPath + " must be on " + - "same volume as recovery replica: " + volumePath; - ReplicaBeingWritten newReplicaInfo = new ReplicaBeingWritten( - newBlockId, recoveryId, volume, blockFile.getParentFile(), - newlength); - newReplicaInfo.setNumBytes(newlength); - volumeMap.add(bpid, newReplicaInfo); - finalizeReplica(bpid, newReplicaInfo); - } - } + FsVolumeImpl volume = (FsVolumeImpl) rur.getVolume(); + ReplicaInPipeline newReplicaInfo = volume.updateRURCopyOnTruncate( + rur, bpid, newBlockId, recoveryId, newlength); + if (newReplicaInfo.getState() != ReplicaState.RBW) { + throw new IOException("Copy-on-truncate of block " + rur.getBlockId() + + " returned a replica of state " + newReplicaInfo.getState() + + "; expected RBW"); + } + newReplicaInfo.setNumBytes(newlength); + volumeMap.add(bpid, newReplicaInfo.getReplicaInfo()); + finalizeReplica(bpid, newReplicaInfo.getReplicaInfo()); + } + } // finalize the block return finalizeReplica(bpid, rur); } - private File[] copyReplicaWithNewBlockIdAndGS( - ReplicaUnderRecovery replicaInfo, String bpid, long newBlkId, long newGS) - throws IOException { - String blockFileName = Block.BLOCK_FILE_PREFIX + newBlkId; - FsVolumeImpl v = (FsVolumeImpl) replicaInfo.getVolume(); - final File tmpDir = v.getBlockPoolSlice(bpid).getTmpDir(); - final File destDir = DatanodeUtil.idToBlockDir(tmpDir, newBlkId); - final File dstBlockFile = new File(destDir, blockFileName); - final File dstMetaFile = FsDatasetUtil.getMetaFile(dstBlockFile, newGS); - return copyBlockFiles(replicaInfo.getMetaFile(), - replicaInfo.getBlockFile(), - dstMetaFile, dstBlockFile, true, smallBufferSize, conf); - } - @Override // FsDatasetSpi public long getReplicaVisibleLength(final ExtendedBlock block) throws IOException { @@ -2886,10 +2699,11 @@ public BlockLocalPathInfo getBlockLocalPathInfo(ExtendedBlock block) } } - File datafile = getBlockFile(block); - File metafile = FsDatasetUtil.getMetaFile(datafile, block.getGenerationStamp()); + ReplicaInfo r = getBlockReplica(block); + File blockFile = new File(r.getBlockURI()); + File metaFile = new File(r.getMetadataURI()); BlockLocalPathInfo info = new BlockLocalPathInfo(block, - datafile.getAbsolutePath(), metafile.getAbsolutePath()); + blockFile.getAbsolutePath(), metaFile.toString()); return info; } @@ -3001,8 +2815,7 @@ private void setupAsyncLazyPersistThread(final FsVolumeImpl v) { } private void removeOldReplica(ReplicaInfo replicaInfo, - ReplicaInfo newReplicaInfo, File blockFile, File metaFile, - long blockFileUsed, long metaFileUsed, final String bpid) { + ReplicaInfo newReplicaInfo, final String bpid) { // Before deleting the files from old storage we must notify the // NN that the files are on the new storage. Else a blockReport from // the transient storage might cause the NN to think the blocks are lost. 
@@ -3019,11 +2832,11 @@ private void removeOldReplica(ReplicaInfo replicaInfo, newReplicaInfo.isOnTransientStorage()); // Remove the old replicas - if (blockFile.delete() || !blockFile.exists()) { + if (replicaInfo.deleteBlockData() || !replicaInfo.blockDataExists()) { FsVolumeImpl volume = (FsVolumeImpl) replicaInfo.getVolume(); - volume.onBlockFileDeletion(bpid, blockFileUsed); - if (metaFile.delete() || !metaFile.exists()) { - volume.onMetaFileDeletion(bpid, metaFileUsed); + volume.onBlockFileDeletion(bpid, replicaInfo.getBytesOnDisk()); + if (replicaInfo.deleteMetadata() || !replicaInfo.metadataExists()) { + volume.onMetaFileDeletion(bpid, replicaInfo.getMetadataLength()); } } @@ -3083,8 +2896,7 @@ private boolean saveNextReplica() { asyncLazyPersistService.submitLazyPersistTask( block.getBlockPoolId(), block.getBlockId(), replicaInfo.getGenerationStamp(), block.getCreationTime(), - replicaInfo.getMetaFile(), replicaInfo.getBlockFile(), - targetReference); + replicaInfo, targetReference); } } } @@ -3122,18 +2934,12 @@ public void evictBlocks(long bytesNeeded) throws IOException { } ReplicaInfo replicaInfo, newReplicaInfo; - File blockFile, metaFile; - long blockFileUsed, metaFileUsed; final String bpid = replicaState.getBlockPoolId(); try (AutoCloseableLock lock = datasetLock.acquire()) { replicaInfo = getReplicaInfo(replicaState.getBlockPoolId(), replicaState.getBlockId()); Preconditions.checkState(replicaInfo.getVolume().isTransientStorage()); - blockFile = replicaInfo.getBlockFile(); - metaFile = replicaInfo.getMetaFile(); - blockFileUsed = blockFile.length(); - metaFileUsed = metaFile.length(); ramDiskReplicaTracker.discardReplica(replicaState.getBlockPoolId(), replicaState.getBlockId(), false); @@ -3141,16 +2947,9 @@ public void evictBlocks(long bytesNeeded) throws IOException { // the target volume BlockPoolSlice bpSlice = replicaState.getLazyPersistVolume().getBlockPoolSlice(bpid); - File newBlockFile = bpSlice.activateSavedReplica( - replicaInfo, replicaState.getSavedMetaFile(), - replicaState.getSavedBlockFile()); newReplicaInfo = - new FinalizedReplica(replicaInfo.getBlockId(), - replicaInfo.getBytesOnDisk(), - replicaInfo.getGenerationStamp(), - replicaState.getLazyPersistVolume(), - newBlockFile.getParentFile()); + bpSlice.activateSavedReplica(replicaInfo, replicaState); // Update the volumeMap entry. volumeMap.add(bpid, newReplicaInfo); @@ -3165,8 +2964,7 @@ public void evictBlocks(long bytesNeeded) throws IOException { // Delete the block+meta files from RAM disk and release locked // memory. 
- removeOldReplica(replicaInfo, newReplicaInfo, blockFile, metaFile, - blockFileUsed, metaFileUsed, bpid); + removeOldReplica(replicaInfo, newReplicaInfo, bpid); } } } @@ -3205,16 +3003,9 @@ public void setPinning(ExtendedBlock block) throws IOException { if (!blockPinningEnabled) { return; } - - File f = getBlockFile(block); - Path p = new Path(f.getAbsolutePath()); - FsPermission oldPermission = localFS.getFileStatus( - new Path(f.getAbsolutePath())).getPermission(); - //sticky bit is used for pinning purpose - FsPermission permission = new FsPermission(oldPermission.getUserAction(), - oldPermission.getGroupAction(), oldPermission.getOtherAction(), true); - localFS.setPermission(p, permission); + ReplicaInfo r = getBlockReplica(block); + r.setPinning(localFS); } @Override @@ -3222,10 +3013,8 @@ public boolean getPinning(ExtendedBlock block) throws IOException { if (!blockPinningEnabled) { return false; } - File f = getBlockFile(block); - - FileStatus fss = localFS.getFileStatus(new Path(f.getAbsolutePath())); - return fss.getPermission().getStickyBit(); + ReplicaInfo r = getBlockReplica(block); + return r.getPinning(localFS); } @Override @@ -3308,10 +3097,11 @@ void stopAllDataxceiverThreads(FsVolumeImpl volume) { for (String blockPoolId : volumeMap.getBlockPoolList()) { Collection replicas = volumeMap.replicas(blockPoolId); for (ReplicaInfo replicaInfo : replicas) { - if (replicaInfo instanceof ReplicaInPipeline + if ((replicaInfo.getState() == ReplicaState.TEMPORARY + || replicaInfo.getState() == ReplicaState.RBW) && replicaInfo.getVolume().equals(volume)) { - ReplicaInPipeline replicaInPipeline - = (ReplicaInPipeline) replicaInfo; + ReplicaInPipeline replicaInPipeline = + (ReplicaInPipeline) replicaInfo; replicaInPipeline.interruptThread(); } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetUtil.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetUtil.java index f695c8c228..a4d433d544 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetUtil.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetUtil.java @@ -18,14 +18,17 @@ package org.apache.hadoop.hdfs.server.datanode.fsdataset.impl; import java.io.File; +import java.io.FileInputStream; import java.io.FilenameFilter; import java.io.IOException; +import java.io.RandomAccessFile; import java.util.Arrays; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.hdfs.protocol.Block; import org.apache.hadoop.hdfs.protocol.HdfsConstants; import org.apache.hadoop.hdfs.server.datanode.DatanodeUtil; +import org.apache.hadoop.io.IOUtils; /** Utility methods. */ @InterfaceAudience.Private @@ -71,6 +74,21 @@ public boolean accept(File dir, String name) { return matches[0]; } + public static FileInputStream openAndSeek(File file, long offset) + throws IOException { + RandomAccessFile raf = null; + try { + raf = new RandomAccessFile(file, "r"); + if (offset > 0) { + raf.seek(offset); + } + return new FileInputStream(raf.getFD()); + } catch(IOException ioe) { + IOUtils.cleanup(null, raf); + throw ioe; + } + } + /** * Find the meta-file for the specified block file * and then return the generation stamp from the name of the meta-file. 
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsVolumeImpl.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsVolumeImpl.java index afcc5dd486..57fab660c7 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsVolumeImpl.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsVolumeImpl.java @@ -47,11 +47,19 @@ import org.apache.hadoop.fs.StorageType; import org.apache.hadoop.util.AutoCloseableLock; import org.apache.hadoop.hdfs.DFSConfigKeys; +import org.apache.hadoop.hdfs.DFSUtilClient; import org.apache.hadoop.hdfs.protocol.Block; import org.apache.hadoop.hdfs.protocol.BlockListAsLongs; import org.apache.hadoop.hdfs.protocol.ExtendedBlock; import org.apache.hadoop.hdfs.server.datanode.DataStorage; import org.apache.hadoop.hdfs.server.datanode.DatanodeUtil; +import org.apache.hadoop.hdfs.server.datanode.LocalReplica; +import org.apache.hadoop.hdfs.server.datanode.ReplicaInfo; +import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.ReplicaState; +import org.apache.hadoop.util.DiskChecker.DiskOutOfSpaceException; +import org.apache.hadoop.hdfs.server.datanode.ReplicaBuilder; +import org.apache.hadoop.hdfs.server.datanode.LocalReplicaInPipeline; +import org.apache.hadoop.hdfs.server.datanode.ReplicaInPipeline; import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsDatasetSpi; import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsVolumeReference; import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsVolumeSpi; @@ -102,7 +110,7 @@ public class FsVolumeImpl implements FsVolumeSpi { // Disk space reserved for blocks (RBW or Re-replicating) open for write. private AtomicLong reservedForReplicas; private long recentReserved = 0; - + private final Configuration conf; // Capacity configured. This is useful when we want to // limit the visible capacity for tests. If negative, then we just // query from the filesystem. 
@@ -130,6 +138,7 @@ public class FsVolumeImpl implements FsVolumeSpi { this.usage = new DF(parent, conf); this.storageType = storageType; this.configuredCapacity = -1; + this.conf = conf; cacheExecutor = initializeCacheExecutor(parent); } @@ -896,10 +905,15 @@ File createRbwFile(String bpid, Block b) throws IOException { * @return * @throws IOException */ - File addFinalizedBlock(String bpid, Block b, File f, long bytesReserved) - throws IOException { + ReplicaInfo addFinalizedBlock(String bpid, Block b, ReplicaInfo replicaInfo, + long bytesReserved) throws IOException { releaseReservedSpace(bytesReserved); - return getBlockPoolSlice(bpid).addFinalizedBlock(b, f); + File dest = getBlockPoolSlice(bpid).addFinalizedBlock(b, replicaInfo); + return new ReplicaBuilder(ReplicaState.FINALIZED) + .setBlock(replicaInfo) + .setFsVolume(this) + .setDirectoryToUse(dest.getParentFile()) + .build(); } Executor getCacheExecutor() { @@ -950,18 +964,18 @@ void shutdown() { } } - void addBlockPool(String bpid, Configuration conf) throws IOException { - addBlockPool(bpid, conf, null); + void addBlockPool(String bpid, Configuration c) throws IOException { + addBlockPool(bpid, c, null); } - void addBlockPool(String bpid, Configuration conf, Timer timer) + void addBlockPool(String bpid, Configuration c, Timer timer) throws IOException { File bpdir = new File(currentDir, bpid); BlockPoolSlice bp; if (timer == null) { - bp = new BlockPoolSlice(bpid, this, bpdir, conf, new Timer()); + bp = new BlockPoolSlice(bpid, this, bpdir, c, new Timer()); } else { - bp = new BlockPoolSlice(bpid, this, bpdir, conf, timer); + bp = new BlockPoolSlice(bpid, this, bpdir, c, timer); } bpSlices.put(bpid, bp); } @@ -1053,5 +1067,127 @@ public StorageType getStorageType() { DatanodeStorage toDatanodeStorage() { return new DatanodeStorage(storageID, DatanodeStorage.State.NORMAL, storageType); } + + + public ReplicaInPipeline append(String bpid, ReplicaInfo replicaInfo, + long newGS, long estimateBlockLen) throws IOException { + + long bytesReserved = estimateBlockLen - replicaInfo.getNumBytes(); + if (getAvailable() < bytesReserved) { + throw new DiskOutOfSpaceException("Insufficient space for appending to " + + replicaInfo); + } + + assert replicaInfo.getVolume() == this: + "The volume of the replica should be the same as this volume"; + + // construct a RBW replica with the new GS + File newBlkFile = new File(getRbwDir(bpid), replicaInfo.getBlockName()); + LocalReplicaInPipeline newReplicaInfo = new ReplicaBuilder(ReplicaState.RBW) + .setBlockId(replicaInfo.getBlockId()) + .setLength(replicaInfo.getNumBytes()) + .setGenerationStamp(newGS) + .setFsVolume(this) + .setDirectoryToUse(newBlkFile.getParentFile()) + .setWriterThread(Thread.currentThread()) + .setBytesToReserve(bytesReserved) + .buildLocalReplicaInPipeline(); + + // rename meta file to rbw directory + // rename block file to rbw directory + newReplicaInfo.moveReplicaFrom(replicaInfo, newBlkFile); + + reserveSpaceForReplica(bytesReserved); + return newReplicaInfo; + } + + public ReplicaInPipeline createRbw(ExtendedBlock b) throws IOException { + + File f = createRbwFile(b.getBlockPoolId(), b.getLocalBlock()); + LocalReplicaInPipeline newReplicaInfo = new ReplicaBuilder(ReplicaState.RBW) + .setBlockId(b.getBlockId()) + .setGenerationStamp(b.getGenerationStamp()) + .setFsVolume(this) + .setDirectoryToUse(f.getParentFile()) + .setBytesToReserve(b.getNumBytes()) + .buildLocalReplicaInPipeline(); + return newReplicaInfo; + } + + public ReplicaInPipeline 
convertTemporaryToRbw(ExtendedBlock b, + ReplicaInfo temp) throws IOException { + + final long blockId = b.getBlockId(); + final long expectedGs = b.getGenerationStamp(); + final long visible = b.getNumBytes(); + final long numBytes = temp.getNumBytes(); + + // move block files to the rbw directory + BlockPoolSlice bpslice = getBlockPoolSlice(b.getBlockPoolId()); + final File dest = FsDatasetImpl.moveBlockFiles(b.getLocalBlock(), temp, + bpslice.getRbwDir()); + // create RBW + final LocalReplicaInPipeline rbw = new ReplicaBuilder(ReplicaState.RBW) + .setBlockId(blockId) + .setLength(numBytes) + .setGenerationStamp(expectedGs) + .setFsVolume(this) + .setDirectoryToUse(dest.getParentFile()) + .setWriterThread(Thread.currentThread()) + .setBytesToReserve(0) + .buildLocalReplicaInPipeline(); + rbw.setBytesAcked(visible); + return rbw; + } + + public ReplicaInPipeline createTemporary(ExtendedBlock b) throws IOException { + // create a temporary file to hold block in the designated volume + File f = createTmpFile(b.getBlockPoolId(), b.getLocalBlock()); + LocalReplicaInPipeline newReplicaInfo = + new ReplicaBuilder(ReplicaState.TEMPORARY) + .setBlockId(b.getBlockId()) + .setGenerationStamp(b.getGenerationStamp()) + .setDirectoryToUse(f.getParentFile()) + .setBytesToReserve(b.getLocalBlock().getNumBytes()) + .setFsVolume(this) + .buildLocalReplicaInPipeline(); + return newReplicaInfo; + } + + public ReplicaInPipeline updateRURCopyOnTruncate(ReplicaInfo rur, + String bpid, long newBlockId, long recoveryId, long newlength) + throws IOException { + + rur.breakHardLinksIfNeeded(); + File[] copiedReplicaFiles = + copyReplicaWithNewBlockIdAndGS(rur, bpid, newBlockId, recoveryId); + File blockFile = copiedReplicaFiles[1]; + File metaFile = copiedReplicaFiles[0]; + LocalReplica.truncateBlock(blockFile, metaFile, + rur.getNumBytes(), newlength); + + LocalReplicaInPipeline newReplicaInfo = new ReplicaBuilder(ReplicaState.RBW) + .setBlockId(newBlockId) + .setGenerationStamp(recoveryId) + .setFsVolume(this) + .setDirectoryToUse(blockFile.getParentFile()) + .setBytesToReserve(newlength) + .buildLocalReplicaInPipeline(); + return newReplicaInfo; + } + + private File[] copyReplicaWithNewBlockIdAndGS( + ReplicaInfo replicaInfo, String bpid, long newBlkId, long newGS) + throws IOException { + String blockFileName = Block.BLOCK_FILE_PREFIX + newBlkId; + FsVolumeImpl v = (FsVolumeImpl) replicaInfo.getVolume(); + final File tmpDir = v.getBlockPoolSlice(bpid).getTmpDir(); + final File destDir = DatanodeUtil.idToBlockDir(tmpDir, newBlkId); + final File dstBlockFile = new File(destDir, blockFileName); + final File dstMetaFile = FsDatasetUtil.getMetaFile(dstBlockFile, newGS); + return FsDatasetImpl.copyBlockFiles(replicaInfo, dstMetaFile, + dstBlockFile, true, DFSUtilClient.getSmallBufferSize(conf), conf); + } + } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsVolumeList.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsVolumeList.java index 634ad42d89..80d373644e 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsVolumeList.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsVolumeList.java @@ -311,7 +311,7 @@ void addVolume(FsVolumeReference ref) { } else { // If the volume is not put into a volume scanner, it does not need to // hold the reference. 
- IOUtils.cleanup(FsDatasetImpl.LOG, ref); + IOUtils.cleanup(null, ref); } // If the volume is used to replace a failed volume, it needs to reset the // volume failure info for this volume. diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/RamDiskAsyncLazyPersistService.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/RamDiskAsyncLazyPersistService.java index 9b467ea790..9e549f9bb1 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/RamDiskAsyncLazyPersistService.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/RamDiskAsyncLazyPersistService.java @@ -24,6 +24,7 @@ import org.apache.hadoop.hdfs.DFSUtilClient; import org.apache.hadoop.hdfs.HdfsConfiguration; import org.apache.hadoop.hdfs.server.datanode.DataNode; +import org.apache.hadoop.hdfs.server.datanode.ReplicaInfo; import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsVolumeReference; import java.io.File; @@ -182,8 +183,7 @@ synchronized void shutdown() { */ void submitLazyPersistTask(String bpId, long blockId, long genStamp, long creationTime, - File metaFile, File blockFile, - FsVolumeReference target) throws IOException { + ReplicaInfo replica, FsVolumeReference target) throws IOException { if (LOG.isDebugEnabled()) { LOG.debug("LazyWriter schedule async task to persist RamDisk block pool id: " + bpId + " block id: " + blockId); @@ -198,31 +198,29 @@ void submitLazyPersistTask(String bpId, long blockId, } ReplicaLazyPersistTask lazyPersistTask = new ReplicaLazyPersistTask( - bpId, blockId, genStamp, creationTime, blockFile, metaFile, + bpId, blockId, genStamp, creationTime, replica, target, lazyPersistDir); execute(volume.getCurrentDir(), lazyPersistTask); } class ReplicaLazyPersistTask implements Runnable { - final String bpId; - final long blockId; - final long genStamp; - final long creationTime; - final File blockFile; - final File metaFile; - final FsVolumeReference targetVolume; - final File lazyPersistDir; + private final String bpId; + private final long blockId; + private final long genStamp; + private final long creationTime; + private final ReplicaInfo replicaInfo; + private final FsVolumeReference targetVolume; + private final File lazyPersistDir; ReplicaLazyPersistTask(String bpId, long blockId, long genStamp, long creationTime, - File blockFile, File metaFile, + ReplicaInfo replicaInfo, FsVolumeReference targetVolume, File lazyPersistDir) { this.bpId = bpId; this.blockId = blockId; this.genStamp = genStamp; this.creationTime = creationTime; - this.blockFile = blockFile; - this.metaFile = metaFile; + this.replicaInfo = replicaInfo; this.targetVolume = targetVolume; this.lazyPersistDir = lazyPersistDir; } @@ -232,8 +230,10 @@ public String toString() { // Called in AsyncLazyPersistService.execute for displaying error messages. 
return "LazyWriter async task of persist RamDisk block pool id:" + bpId + " block pool id: " - + blockId + " with block file " + blockFile - + " and meta file " + metaFile + " to target volume " + targetVolume;} + + blockId + " with block file " + replicaInfo.getBlockURI() + + " and meta file " + replicaInfo.getMetadataURI() + + " to target volume " + targetVolume; + } @Override public void run() { @@ -243,7 +243,7 @@ public void run() { int smallBufferSize = DFSUtilClient.getSmallBufferSize(EMPTY_HDFS_CONF); // No FsDatasetImpl lock for the file copy File targetFiles[] = FsDatasetImpl.copyBlockFiles( - blockId, genStamp, metaFile, blockFile, lazyPersistDir, true, + blockId, genStamp, replicaInfo, lazyPersistDir, true, smallBufferSize, conf); // Lock FsDataSetImpl during onCompleteLazyPersist callback diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestClientProtocolForPipelineRecovery.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestClientProtocolForPipelineRecovery.java index 99b617eb82..65a484c825 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestClientProtocolForPipelineRecovery.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestClientProtocolForPipelineRecovery.java @@ -39,7 +39,7 @@ import org.apache.hadoop.hdfs.server.datanode.DataNode; import org.apache.hadoop.hdfs.server.datanode.DataNodeFaultInjector; import org.apache.hadoop.hdfs.server.datanode.DataNodeTestUtils; -import org.apache.hadoop.hdfs.server.datanode.ReplicaInPipelineInterface; +import org.apache.hadoop.hdfs.server.datanode.ReplicaInPipeline; import org.apache.hadoop.hdfs.server.namenode.LeaseExpiredException; import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocols; import org.apache.hadoop.hdfs.tools.DFSAdmin; @@ -530,7 +530,7 @@ public void testPipelineRecoveryWithTransferBlock() throws Exception { DataNodeFaultInjector.set(new DataNodeFaultInjector() { @Override - public void failPipeline(ReplicaInPipelineInterface replicaInfo, + public void failPipeline(ReplicaInPipeline replicaInfo, String mirror) throws IOException { if (!lastDn.equals(mirror)) { // Only fail for second DN diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestCrcCorruption.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestCrcCorruption.java index 398bcc27a7..917f0dbe09 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestCrcCorruption.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestCrcCorruption.java @@ -34,7 +34,7 @@ import org.apache.hadoop.hdfs.client.HdfsClientConfigKeys; import org.apache.hadoop.hdfs.protocol.ExtendedBlock; import org.apache.hadoop.hdfs.server.datanode.DataNode; -import org.apache.hadoop.hdfs.server.datanode.FinalizedReplica; +import org.apache.hadoop.hdfs.server.datanode.ReplicaInfo; import org.apache.hadoop.io.IOUtils; import org.junit.Before; import org.junit.Test; @@ -172,12 +172,12 @@ private void thistest(Configuration conf, DFSTestUtil util) throws Exception { final int dnIdx = 0; final DataNode dn = cluster.getDataNodes().get(dnIdx); final String bpid = cluster.getNamesystem().getBlockPoolId(); - List replicas = + List replicas = dn.getFSDataset().getFinalizedBlocks(bpid); assertTrue("Replicas do not exist", !replicas.isEmpty()); for (int idx = 0; idx < replicas.size(); idx++) { - FinalizedReplica replica = replicas.get(idx); + ReplicaInfo replica = 
replicas.get(idx); ExtendedBlock eb = new ExtendedBlock(bpid, replica); if (idx % 3 == 0) { LOG.info("Deliberately removing meta for block " + eb); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/SimulatedFSDataset.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/SimulatedFSDataset.java index a082fbbb90..6034d1ee32 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/SimulatedFSDataset.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/SimulatedFSDataset.java @@ -132,7 +132,7 @@ public static byte simulatedByte(Block b, long offsetInBlk) { } // information about a single block - private class BInfo implements ReplicaInPipelineInterface { + private class BInfo implements ReplicaInPipeline { final Block theBlock; private boolean finalized = false; // if not finalized => ongoing creation SimulatedOutputStream oStream = null; @@ -330,6 +330,28 @@ public ChunkChecksum getLastChecksumAndDataLen() { public boolean isOnTransientStorage() { return false; } + + @Override + public ReplicaInfo getReplicaInfo() { + return null; + } + + @Override + public void setWriter(Thread writer) { + } + + @Override + public void interruptThread() { + } + + @Override + public boolean attemptToSetWriter(Thread prevWriter, Thread newWriter) { + return false; + } + + @Override + public void stopWriter(long xceiverStopTimeout) throws IOException { + } } /** @@ -1228,7 +1250,7 @@ public void deleteBlockPool(String bpid, boolean force) { } @Override - public ReplicaInPipelineInterface convertTemporaryToRbw(ExtendedBlock temporary) + public ReplicaInPipeline convertTemporaryToRbw(ExtendedBlock temporary) throws IOException { final Map map = blockMap.get(temporary.getBlockPoolId()); if (map == null) { @@ -1302,12 +1324,12 @@ public StorageReport[] getStorageReports(String bpid) { } @Override - public List<FinalizedReplica> getFinalizedBlocks(String bpid) { + public List<ReplicaInfo> getFinalizedBlocks(String bpid) { throw new UnsupportedOperationException(); } @Override - public List<FinalizedReplica> getFinalizedBlocksOnPersistentStorage(String bpid) { + public List<ReplicaInfo> getFinalizedBlocksOnPersistentStorage(String bpid) { throw new UnsupportedOperationException(); } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestBlockPoolSliceStorage.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestBlockPoolSliceStorage.java index b5951a0aad..a8f8b6a529 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestBlockPoolSliceStorage.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestBlockPoolSliceStorage.java @@ -21,6 +21,7 @@ import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hdfs.server.common.Storage; import org.junit.Test; +import org.mockito.Mockito; import java.io.File; import java.util.Random; @@ -105,7 +106,10 @@ public void getTrashDirectoryForBlockFile(String fileName, int nestingLevel) { LOG.info("Got subdir " + blockFileSubdir); LOG.info("Generated file path " + testFilePath); - assertThat(storage.getTrashDirectory(new File(testFilePath)), is(expectedTrashPath)); + + ReplicaInfo info = Mockito.mock(ReplicaInfo.class); + Mockito.when(info.getBlockURI()).thenReturn(new File(testFilePath).toURI()); + assertThat(storage.getTrashDirectory(info), is(expectedTrashPath)); } /* diff --git
a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestBlockRecovery.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestBlockRecovery.java index 8183de8408..c09303f480 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestBlockRecovery.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestBlockRecovery.java @@ -667,7 +667,7 @@ public void testNotMatchedReplicaID() throws IOException { if(LOG.isDebugEnabled()) { LOG.debug("Running " + GenericTestUtils.getMethodName()); } - ReplicaInPipelineInterface replicaInfo = dn.data.createRbw( + ReplicaInPipeline replicaInfo = dn.data.createRbw( StorageType.DEFAULT, block, false).getReplica(); ReplicaOutputStreams streams = null; try { diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeRollingUpgrade.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeRollingUpgrade.java index 7e56988ecc..f08606ec5e 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeRollingUpgrade.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeRollingUpgrade.java @@ -49,6 +49,7 @@ import org.apache.hadoop.hdfs.tools.DFSAdmin; import org.apache.hadoop.test.GenericTestUtils; import org.junit.Test; +import org.mockito.Mockito; /** * Ensure that the DataNode correctly handles rolling upgrade @@ -114,8 +115,11 @@ private File getBlockForFile(Path path, boolean exists) throws IOException { } private File getTrashFileForBlock(File blockFile, boolean exists) { + + ReplicaInfo info = Mockito.mock(ReplicaInfo.class); + Mockito.when(info.getBlockURI()).thenReturn(blockFile.toURI()); File trashFile = new File( - dn0.getStorage().getTrashDirectoryForBlockFile(blockPoolId, blockFile)); + dn0.getStorage().getTrashDirectoryForReplica(blockPoolId, info)); assertEquals(exists, trashFile.exists()); return trashFile; } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDirectoryScanner.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDirectoryScanner.java index 3822bad735..576aae066a 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDirectoryScanner.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDirectoryScanner.java @@ -116,8 +116,8 @@ private List createFile(String fileNamePrefix, private long truncateBlockFile() throws IOException { try(AutoCloseableLock lock = fds.acquireDatasetLock()) { for (ReplicaInfo b : FsDatasetTestUtil.getReplicas(fds, bpid)) { - File f = b.getBlockFile(); - File mf = b.getMetaFile(); + File f = new File(b.getBlockURI()); + File mf = new File(b.getMetadataURI()); // Truncate a block file that has a corresponding metadata file if (f.exists() && f.length() != 0 && mf.exists()) { FileOutputStream s = null; @@ -141,8 +141,8 @@ private long truncateBlockFile() throws IOException { private long deleteBlockFile() { try(AutoCloseableLock lock = fds.acquireDatasetLock()) { for (ReplicaInfo b : FsDatasetTestUtil.getReplicas(fds, bpid)) { - File f = b.getBlockFile(); - File mf = b.getMetaFile(); + File f = new File(b.getBlockURI()); + File mf = new File(b.getMetadataURI()); // Delete a block 
file that has corresponding metadata file if (f.exists() && mf.exists() && f.delete()) { LOG.info("Deleting block file " + f.getAbsolutePath()); @@ -157,10 +157,9 @@ private long deleteBlockFile() { private long deleteMetaFile() { try(AutoCloseableLock lock = fds.acquireDatasetLock()) { for (ReplicaInfo b : FsDatasetTestUtil.getReplicas(fds, bpid)) { - File file = b.getMetaFile(); // Delete a metadata file - if (file.exists() && file.delete()) { - LOG.info("Deleting metadata file " + file.getAbsolutePath()); + if (b.metadataExists() && b.deleteMetadata()) { + LOG.info("Deleting metadata " + b.getMetadataURI()); return b.getBlockId(); } } @@ -184,8 +183,8 @@ private void duplicateBlock(long blockId) throws IOException { } // Volume without a copy of the block. Make a copy now. - File sourceBlock = b.getBlockFile(); - File sourceMeta = b.getMetaFile(); + File sourceBlock = new File(b.getBlockURI()); + File sourceMeta = new File(b.getMetadataURI()); String sourceRoot = b.getVolume().getBasePath(); String destRoot = v.getBasePath(); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestSimulatedFSDataset.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestSimulatedFSDataset.java index dd7d239f6b..4e724bc7cb 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestSimulatedFSDataset.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestSimulatedFSDataset.java @@ -80,7 +80,7 @@ static int addSomeBlocks(SimulatedFSDataset fsdataset, long startingBlockId, ExtendedBlock b = new ExtendedBlock(bpid, blkID, 0, 0); // we pass expected len as zero, - fsdataset should use the sizeof actual // data written - ReplicaInPipelineInterface bInfo = fsdataset.createRbw( + ReplicaInPipeline bInfo = fsdataset.createRbw( StorageType.DEFAULT, b, false).getReplica(); ReplicaOutputStreams out = bInfo.createStreams(true, DataChecksum.newDataChecksum(DataChecksum.Type.CRC32, 512)); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestTransferRbw.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestTransferRbw.java index c3cb862b99..708fbaf30a 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestTransferRbw.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestTransferRbw.java @@ -57,7 +57,7 @@ private static ReplicaBeingWritten getRbw(final DataNode datanode, String bpid) throws InterruptedException { return (ReplicaBeingWritten)getReplica(datanode, bpid, ReplicaState.RBW); } - private static ReplicaInPipeline getReplica(final DataNode datanode, + private static LocalReplicaInPipeline getReplica(final DataNode datanode, final String bpid, final ReplicaState expectedState) throws InterruptedException { final Collection replicas = FsDatasetTestUtil.getReplicas( datanode.getFSDataset(), bpid); @@ -68,7 +68,7 @@ private static ReplicaInPipeline getReplica(final DataNode datanode, Assert.assertEquals(1, replicas.size()); final ReplicaInfo r = replicas.iterator().next(); Assert.assertEquals(expectedState, r.getState()); - return (ReplicaInPipeline)r; + return (LocalReplicaInPipeline)r; } @Test diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/extdataset/ExternalDatasetImpl.java 
b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/extdataset/ExternalDatasetImpl.java index fcd960a79a..126810825b 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/extdataset/ExternalDatasetImpl.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/extdataset/ExternalDatasetImpl.java @@ -87,12 +87,12 @@ public Map getVolumeInfoMap() { } @Override - public List<FinalizedReplica> getFinalizedBlocks(String bpid) { + public List<ReplicaInfo> getFinalizedBlocks(String bpid) { return null; } @Override - public List<FinalizedReplica> getFinalizedBlocksOnPersistentStorage(String bpid) { + public List<ReplicaInfo> getFinalizedBlocksOnPersistentStorage(String bpid) { return null; } @@ -159,7 +159,7 @@ public ReplicaHandler recoverRbw(ExtendedBlock b, long newGS, } @Override - public ReplicaInPipelineInterface convertTemporaryToRbw( + public ReplicaInPipeline convertTemporaryToRbw( ExtendedBlock temporary) throws IOException { return new ExternalReplicaInPipeline(); } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/extdataset/ExternalReplicaInPipeline.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/extdataset/ExternalReplicaInPipeline.java index a0039bcdbe..90c3b8a56f 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/extdataset/ExternalReplicaInPipeline.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/extdataset/ExternalReplicaInPipeline.java @@ -23,11 +23,12 @@ import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.ReplicaState; import org.apache.hadoop.hdfs.server.datanode.ChunkChecksum; -import org.apache.hadoop.hdfs.server.datanode.ReplicaInPipelineInterface; +import org.apache.hadoop.hdfs.server.datanode.ReplicaInPipeline; +import org.apache.hadoop.hdfs.server.datanode.ReplicaInfo; import org.apache.hadoop.hdfs.server.datanode.fsdataset.ReplicaOutputStreams; import org.apache.hadoop.util.DataChecksum; -public class ExternalReplicaInPipeline implements ReplicaInPipelineInterface { +public class ExternalReplicaInPipeline implements ReplicaInPipeline { @Override public void setNumBytes(long bytesReceived) { @@ -105,4 +106,25 @@ public String getStorageUuid() { public boolean isOnTransientStorage() { return false; } + + @Override + public ReplicaInfo getReplicaInfo() { + return null; + } + + public void setWriter(Thread writer) { + } + + public void stopWriter(long xceiverStopTimeout) + throws IOException { + } + + @Override + public boolean attemptToSetWriter(Thread prevWriter, Thread newWriter) { + return false; + } + + @Override + public void interruptThread() { + } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/extdataset/TestExternalDataset.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/extdataset/TestExternalDataset.java index 82a6951413..e4391529cf 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/extdataset/TestExternalDataset.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/extdataset/TestExternalDataset.java @@ -19,7 +19,7 @@ package org.apache.hadoop.hdfs.server.datanode.extdataset; import org.apache.hadoop.hdfs.server.datanode.Replica; -import org.apache.hadoop.hdfs.server.datanode.ReplicaInPipelineInterface; +import
org.apache.hadoop.hdfs.server.datanode.ReplicaInPipeline; import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsDatasetSpi; import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsVolumeSpi; import org.junit.Test; @@ -75,7 +75,7 @@ public void testIntantiateExternalReplica() throws Throwable { */ @Test public void testInstantiateReplicaInPipeline() throws Throwable { - ReplicaInPipelineInterface inst = new ExternalReplicaInPipeline(); + ReplicaInPipeline inst = new ExternalReplicaInPipeline(); } /** diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImplTestUtils.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImplTestUtils.java index d2f3db06f7..e1825f8220 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImplTestUtils.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImplTestUtils.java @@ -26,8 +26,8 @@ import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.fs.DF; import org.apache.hadoop.hdfs.protocol.Block; -import org.apache.hadoop.fs.FileUtil; import org.apache.hadoop.hdfs.protocol.ExtendedBlock; +import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.ReplicaState; import org.apache.hadoop.hdfs.server.datanode.DataNode; import org.apache.hadoop.hdfs.server.datanode.DataStorage; import org.apache.hadoop.hdfs.server.datanode.DatanodeUtil; @@ -35,11 +35,11 @@ import org.apache.hadoop.hdfs.server.datanode.FsDatasetTestUtils; import org.apache.hadoop.hdfs.server.datanode.Replica; import org.apache.hadoop.hdfs.server.datanode.ReplicaBeingWritten; -import org.apache.hadoop.hdfs.server.datanode.ReplicaInPipeline; +import org.apache.hadoop.hdfs.server.datanode.ReplicaBuilder; +import org.apache.hadoop.hdfs.server.datanode.LocalReplicaInPipeline; import org.apache.hadoop.hdfs.server.datanode.ReplicaInfo; import org.apache.hadoop.hdfs.server.datanode.ReplicaNotFoundException; import org.apache.hadoop.hdfs.server.datanode.ReplicaUnderRecovery; -import org.apache.hadoop.hdfs.server.datanode.ReplicaWaitingToBeRecovered; import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsDatasetSpi.FsVolumeReferences; import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsVolumeSpi; import org.apache.hadoop.test.GenericTestUtils; @@ -205,8 +205,8 @@ public FsDatasetImplTestUtils(DataNode datanode) { dataset = (FsDatasetImpl) datanode.getFSDataset(); } - private File getBlockFile(ExtendedBlock eb) throws IOException { - return dataset.getBlockFile(eb.getBlockPoolId(), eb.getBlockId()); + private ReplicaInfo getBlockFile(ExtendedBlock eb) throws IOException { + return dataset.getReplicaInfo(eb); } /** @@ -217,8 +217,8 @@ public MaterializedReplica getMaterializedReplica(ExtendedBlock block) throws ReplicaNotFoundException { File blockFile; try { - blockFile = dataset.getBlockFile( - block.getBlockPoolId(), block.getBlockId()); + ReplicaInfo r = dataset.getReplicaInfo(block); + blockFile = new File(r.getBlockURI()); } catch (IOException e) { LOG.error("Block file for " + block + " does not existed:", e); throw new ReplicaNotFoundException(block); @@ -240,7 +240,7 @@ public Replica createFinalizedReplica(ExtendedBlock block) public Replica createFinalizedReplica(FsVolumeSpi volume, ExtendedBlock block) throws IOException { FsVolumeImpl vol = (FsVolumeImpl) volume; - ReplicaInfo info = new 
FinalizedReplica(block.getLocalBlock(), vol, + FinalizedReplica info = new FinalizedReplica(block.getLocalBlock(), vol, vol.getCurrentDir().getParentFile()); dataset.volumeMap.add(block.getBlockPoolId(), info); info.getBlockFile().createNewFile(); @@ -260,7 +260,7 @@ public Replica createReplicaInPipeline(ExtendedBlock block) public Replica createReplicaInPipeline( FsVolumeSpi volume, ExtendedBlock block) throws IOException { FsVolumeImpl vol = (FsVolumeImpl) volume; - ReplicaInPipeline rip = new ReplicaInPipeline( + LocalReplicaInPipeline rip = new LocalReplicaInPipeline( block.getBlockId(), block.getGenerationStamp(), volume, vol.createTmpFile( block.getBlockPoolId(), block.getLocalBlock()).getParentFile(), @@ -305,9 +305,11 @@ public Replica createReplicaWaitingToBeRecovered( FsVolumeImpl vol = (FsVolumeImpl) volume; final String bpid = eb.getBlockPoolId(); final Block block = eb.getLocalBlock(); - ReplicaWaitingToBeRecovered rwbr = - new ReplicaWaitingToBeRecovered(eb.getLocalBlock(), volume, - vol.createRbwFile(bpid, block).getParentFile()); + ReplicaInfo rwbr = new ReplicaBuilder(ReplicaState.RWR) + .setBlock(eb.getLocalBlock()) + .setFsVolume(volume) + .setDirectoryToUse(vol.createRbwFile(bpid, block).getParentFile()) + .build(); dataset.volumeMap.add(bpid, rwbr); return rwbr; } @@ -354,6 +356,7 @@ public void injectCorruptReplica(ExtendedBlock block) throws IOException { "Meta file " + metaFile + " already exists." ); } + dataset.volumeMap.add(block.getBlockPoolId(), finalized); } } @@ -379,25 +382,21 @@ public long getRawCapacity() throws IOException { @Override public long getStoredDataLength(ExtendedBlock block) throws IOException { - File f = getBlockFile(block); - try (RandomAccessFile raf = new RandomAccessFile(f, "r")) { - return raf.length(); - } + ReplicaInfo r = getBlockFile(block); + return r.getBlockDataLength(); } @Override public long getStoredGenerationStamp(ExtendedBlock block) throws IOException { - File f = getBlockFile(block); - File dir = f.getParentFile(); - File[] files = FileUtil.listFiles(dir); - return FsDatasetUtil.getGenerationStampFromFile(files, f); + ReplicaInfo r = getBlockFile(block); + return r.getGenerationStamp(); } @Override public void changeStoredGenerationStamp( ExtendedBlock block, long newGenStamp) throws IOException { - File blockFile = - dataset.getBlockFile(block.getBlockPoolId(), block.getBlockId()); + ReplicaInfo r = dataset.getReplicaInfo(block); + File blockFile = new File(r.getBlockURI()); File metaFile = FsDatasetUtil.findMetaFile(blockFile); File newMetaFile = new File( DatanodeUtil.getMetaName(blockFile.getAbsolutePath(), newGenStamp)); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetTestUtil.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetTestUtil.java index 665befaee5..b42c052adb 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetTestUtil.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetTestUtil.java @@ -24,16 +24,16 @@ import java.nio.channels.FileLock; import java.nio.channels.OverlappingFileLockException; import java.util.Collection; -import java.util.Random; import org.apache.hadoop.hdfs.protocol.Block; import org.apache.hadoop.hdfs.protocol.ExtendedBlock; import org.apache.hadoop.hdfs.server.common.Storage; import 
org.apache.hadoop.hdfs.server.datanode.DataNode; +import org.apache.hadoop.hdfs.server.datanode.LocalReplica; import org.apache.hadoop.hdfs.server.datanode.ReplicaInfo; +import org.apache.hadoop.hdfs.server.datanode.ReplicaNotFoundException; import org.apache.hadoop.hdfs.server.datanode.StorageLocation; import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsDatasetSpi; -import org.apache.hadoop.io.IOUtils; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.fail; @@ -41,12 +41,21 @@ public class FsDatasetTestUtil { public static File getFile(FsDatasetSpi fsd, String bpid, long bid) { - return ((FsDatasetImpl)fsd).getFile(bpid, bid, false); + ReplicaInfo r; + try { + r = ((FsDatasetImpl)fsd).getReplicaInfo(bpid, bid); + return new File(r.getBlockURI()); + } catch (ReplicaNotFoundException e) { + FsDatasetImpl.LOG.warn(String.format( + "Replica with id %d was not found in block pool %s.", bid, bpid), e); + } + return null; } public static File getBlockFile(FsDatasetSpi fsd, String bpid, Block b ) throws IOException { - return ((FsDatasetImpl)fsd).getBlockFile(bpid, b.getBlockId()); + ReplicaInfo r = ((FsDatasetImpl)fsd).getReplicaInfo(bpid, b.getBlockId()); + return new File(r.getBlockURI()); } public static File getMetaFile(FsDatasetSpi fsd, String bpid, Block b) @@ -57,7 +66,8 @@ public static File getMetaFile(FsDatasetSpi fsd, String bpid, Block b) public static boolean breakHardlinksIfNeeded(FsDatasetSpi fsd, ExtendedBlock block) throws IOException { - final ReplicaInfo info = ((FsDatasetImpl)fsd).getReplicaInfo(block); + final LocalReplica info = + (LocalReplica) ((FsDatasetImpl)fsd).getReplicaInfo(block); return info.breakHardLinksIfNeeded(); } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/TestWriteToReplica.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/TestWriteToReplica.java index 4ba3d81107..320af7b14c 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/TestWriteToReplica.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/TestWriteToReplica.java @@ -36,7 +36,7 @@ import org.apache.hadoop.hdfs.server.datanode.DataNodeTestUtils; import org.apache.hadoop.hdfs.server.datanode.FsDatasetTestUtils; import org.apache.hadoop.hdfs.server.datanode.ReplicaAlreadyExistsException; -import org.apache.hadoop.hdfs.server.datanode.ReplicaInPipelineInterface; +import org.apache.hadoop.hdfs.server.datanode.ReplicaInPipeline; import org.apache.hadoop.hdfs.server.datanode.ReplicaInfo; import org.apache.hadoop.hdfs.server.datanode.ReplicaNotFoundException; import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsDatasetSpi; @@ -483,7 +483,7 @@ private void testWriteToTemporary(FsDatasetImpl dataSet, ExtendedBlock[] blocks) long newGenStamp = blocks[NON_EXISTENT].getGenerationStamp() * 10; blocks[NON_EXISTENT].setGenerationStamp(newGenStamp); try { - ReplicaInPipelineInterface replicaInfo = + ReplicaInPipeline replicaInfo = dataSet.createTemporary(StorageType.DEFAULT, blocks[NON_EXISTENT]).getReplica(); Assert.assertTrue(replicaInfo.getGenerationStamp() == newGenStamp); Assert.assertTrue(