diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt index c2510eb6f2..a853879114 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt +++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt @@ -464,6 +464,9 @@ Release 2.5.0 - UNRELEASED HDFS-6110 adding more slow action log in critical write path (Liang Xie via stack) + HDFS-6109 let sync_file_range() system call run in background + (Liang Xie via stack) + OPTIMIZATIONS HDFS-6214. Webhdfs has poor throughput for files >2GB (daryn) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java index 9f06eba049..5061e08988 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java @@ -111,6 +111,8 @@ public class DFSConfigKeys extends CommonConfigurationKeys { public static final boolean DFS_DATANODE_DROP_CACHE_BEHIND_WRITES_DEFAULT = false; public static final String DFS_DATANODE_SYNC_BEHIND_WRITES_KEY = "dfs.datanode.sync.behind.writes"; public static final boolean DFS_DATANODE_SYNC_BEHIND_WRITES_DEFAULT = false; + public static final String DFS_DATANODE_SYNC_BEHIND_WRITES_IN_BACKGROUND_KEY = "dfs.datanode.sync.behind.writes.in.background"; + public static final boolean DFS_DATANODE_SYNC_BEHIND_WRITES_IN_BACKGROUND_DEFAULT = false; public static final String DFS_DATANODE_DROP_CACHE_BEHIND_READS_KEY = "dfs.datanode.drop.cache.behind.reads"; public static final boolean DFS_DATANODE_DROP_CACHE_BEHIND_READS_DEFAULT = false; public static final String DFS_DATANODE_USE_DN_HOSTNAME = "dfs.datanode.use.datanode.hostname"; diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockReceiver.java 
b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockReceiver.java index e36d40ccae..3d9ccdcca9 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockReceiver.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockReceiver.java @@ -104,6 +104,7 @@ class BlockReceiver implements Closeable { private boolean dropCacheBehindWrites; private long lastCacheManagementOffset = 0; private boolean syncBehindWrites; + private boolean syncBehindWritesInBackground; /** The client name. It is empty if a datanode is the client */ private final String clientname; @@ -207,6 +208,8 @@ class BlockReceiver implements Closeable { datanode.getDnConf().dropCacheBehindWrites : cachingStrategy.getDropBehind(); this.syncBehindWrites = datanode.getDnConf().syncBehindWrites; + this.syncBehindWritesInBackground = datanode.getDnConf(). + syncBehindWritesInBackground; final boolean isCreate = isDatanode || isTransfer || stage == BlockConstructionStage.PIPELINE_SETUP_CREATE; @@ -668,10 +671,17 @@ private void manageWriterOsCache(long offsetInBlock) { // of file // if (syncBehindWrites) { - NativeIO.POSIX.syncFileRangeIfPossible(outFd, - lastCacheManagementOffset, - offsetInBlock - lastCacheManagementOffset, - NativeIO.POSIX.SYNC_FILE_RANGE_WRITE); + if (syncBehindWritesInBackground) { + this.datanode.getFSDataset().submitBackgroundSyncFileRangeRequest( + block, outFd, lastCacheManagementOffset, + offsetInBlock - lastCacheManagementOffset, + NativeIO.POSIX.SYNC_FILE_RANGE_WRITE); + } else { + NativeIO.POSIX.syncFileRangeIfPossible(outFd, + lastCacheManagementOffset, offsetInBlock + - lastCacheManagementOffset, + NativeIO.POSIX.SYNC_FILE_RANGE_WRITE); + } } // // For POSIX_FADV_DONTNEED, we want to drop from the beginning diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DNConf.java 
b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DNConf.java index 27dba1baca..4c1d39da7b 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DNConf.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DNConf.java @@ -67,6 +67,7 @@ public class DNConf { final boolean transferToAllowed; final boolean dropCacheBehindWrites; final boolean syncBehindWrites; + final boolean syncBehindWritesInBackground; final boolean dropCacheBehindReads; final boolean syncOnClose; final boolean encryptDataTransfer; @@ -119,6 +120,9 @@ public DNConf(Configuration conf) { syncBehindWrites = conf.getBoolean( DFSConfigKeys.DFS_DATANODE_SYNC_BEHIND_WRITES_KEY, DFSConfigKeys.DFS_DATANODE_SYNC_BEHIND_WRITES_DEFAULT); + syncBehindWritesInBackground = conf.getBoolean( + DFSConfigKeys.DFS_DATANODE_SYNC_BEHIND_WRITES_IN_BACKGROUND_KEY, + DFSConfigKeys.DFS_DATANODE_SYNC_BEHIND_WRITES_IN_BACKGROUND_DEFAULT); dropCacheBehindReads = conf.getBoolean( DFSConfigKeys.DFS_DATANODE_DROP_CACHE_BEHIND_READS_KEY, DFSConfigKeys.DFS_DATANODE_DROP_CACHE_BEHIND_READS_DEFAULT); diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/FsDatasetSpi.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/FsDatasetSpi.java index 6c78ac7b19..8eb083a93f 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/FsDatasetSpi.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/FsDatasetSpi.java @@ -19,6 +19,7 @@ import java.io.File; +import java.io.FileDescriptor; import java.io.IOException; import java.io.InputStream; import java.util.List; @@ -431,5 +432,12 @@ public HdfsBlocksMetadata getHdfsBlocksMetadata(String bpid, * @return true when trash is enabled */ public boolean trashEnabled(String 
bpid); + + /** + * Submit a background sync_file_range request to the AsyncDiskService. + */ + public void submitBackgroundSyncFileRangeRequest(final ExtendedBlock block, + final FileDescriptor fd, final long offset, final long nbytes, + final int flags); } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetAsyncDiskService.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetAsyncDiskService.java index d80f729209..b76cee48ed 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetAsyncDiskService.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetAsyncDiskService.java @@ -19,6 +19,7 @@ package org.apache.hadoop.hdfs.server.datanode.fsdataset.impl; import java.io.File; +import java.io.FileDescriptor; import java.util.HashMap; import java.util.Map; import java.util.concurrent.LinkedBlockingQueue; @@ -31,6 +32,8 @@ import org.apache.hadoop.hdfs.protocol.ExtendedBlock; import org.apache.hadoop.hdfs.server.datanode.DataNode; import org.apache.hadoop.hdfs.server.protocol.BlockCommand; +import org.apache.hadoop.io.nativeio.NativeIO; +import org.apache.hadoop.io.nativeio.NativeIOException; /** * This class is a container of multiple thread pools, each for a volume, @@ -42,6 +45,7 @@ * can be slow, and we don't want to use a single thread pool because that * is inefficient when we have more than 1 volume. AsyncDiskService is the * solution for these. + * Another example of an async disk operation is requesting sync_file_range(). * * This class and {@link org.apache.hadoop.util.AsyncDiskService} are similar. * They should be combined. 
@@ -148,6 +152,21 @@ synchronized void shutdown() { } } + public void submitSyncFileRangeRequest(FsVolumeImpl volume, + final FileDescriptor fd, final long offset, final long nbytes, + final int flags) { + execute(volume.getCurrentDir(), new Runnable() { + @Override + public void run() { + try { + NativeIO.POSIX.syncFileRangeIfPossible(fd, offset, nbytes, flags); + } catch (NativeIOException e) { + LOG.warn("sync_file_range error", e); + } + } + }); + } + /** * Delete the block file and meta file from the disk asynchronously, adjust * dfsUsed statistics accordingly. diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImpl.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImpl.java index 4d836fed83..40d3e81ad6 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImpl.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImpl.java @@ -1907,5 +1907,13 @@ public RollingLogs createRollingLogs(String bpid, String prefix } return new RollingLogsImpl(dir, prefix); } + + @Override + public void submitBackgroundSyncFileRangeRequest(ExtendedBlock block, + FileDescriptor fd, long offset, long nbytes, int flags) { + FsVolumeImpl fsVolumeImpl = this.getVolume(block); + asyncDiskService.submitSyncFileRangeRequest(fsVolumeImpl, fd, offset, + nbytes, flags); + } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/SimulatedFSDataset.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/SimulatedFSDataset.java index d6ca798143..baf046af84 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/SimulatedFSDataset.java +++ 
b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/SimulatedFSDataset.java @@ -18,6 +18,7 @@ package org.apache.hadoop.hdfs.server.datanode; import java.io.File; +import java.io.FileDescriptor; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; @@ -1112,5 +1113,11 @@ public RollingLogs createRollingLogs(String bpid, String prefix) { public FsVolumeSpi getVolume(ExtendedBlock b) { throw new UnsupportedOperationException(); } + + @Override + public void submitBackgroundSyncFileRangeRequest(ExtendedBlock block, + FileDescriptor fd, long offset, long nbytes, int flags) { + throw new UnsupportedOperationException(); + } }