From e4b070025b38d8866dd1e2dc4a3f46b8d9db4ad5 Mon Sep 17 00:00:00 2001 From: Mukund Thakur Date: Wed, 9 Oct 2024 08:34:47 -0500 Subject: [PATCH] HADOOP-19291. RawLocalFileSystem to allow overlapping ranges (#7101) ChecksumFileSystem creates the chunked ranges based on the checksum chunk size and then calls readVectored on Raw Local which may lead to overlapping ranges in some cases. Contributed by: Mukund Thakur --- .../apache/hadoop/fs/RawLocalFileSystem.java | 7 ++-- .../markdown/filesystem/fsdatainputstream.md | 9 ++++- .../AbstractContractVectoredReadTest.java | 38 ++++++++++++++----- .../hadoop/fs/contract/ContractOptions.java | 2 + .../src/test/resources/contract/rawlocal.xml | 5 +++ 5 files changed, 46 insertions(+), 15 deletions(-) diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/RawLocalFileSystem.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/RawLocalFileSystem.java index 7866c794c8..fa5624e671 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/RawLocalFileSystem.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/RawLocalFileSystem.java @@ -68,7 +68,8 @@ import org.apache.hadoop.util.Shell; import org.apache.hadoop.util.StringUtils; -import static org.apache.hadoop.fs.VectoredReadUtils.validateAndSortRanges; +import static org.apache.hadoop.fs.VectoredReadUtils.sortRangeList; +import static org.apache.hadoop.fs.VectoredReadUtils.validateRangeRequest; import static org.apache.hadoop.fs.impl.PathCapabilitiesSupport.validatePathCapabilityArgs; import static org.apache.hadoop.fs.statistics.StreamStatisticNames.STREAM_READ_BYTES; import static org.apache.hadoop.fs.statistics.StreamStatisticNames.STREAM_READ_EXCEPTIONS; @@ -320,10 +321,10 @@ public void readVectored(List ranges, IntFunction allocate) throws IOException { // Validate, but do not pass in a file length as it may change. - List sortedRanges = validateAndSortRanges(ranges, - Optional.empty()); + List sortedRanges = sortRangeList(ranges); // Set up all of the futures, so that we can use them if things fail for(FileRange range: sortedRanges) { + validateRangeRequest(range); range.setData(new CompletableFuture<>()); } try { diff --git a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/fsdatainputstream.md b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/fsdatainputstream.md index db844a94e3..ef4a8ff11a 100644 --- a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/fsdatainputstream.md +++ b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/fsdatainputstream.md @@ -623,8 +623,13 @@ support -and fallback everywhere else. The restriction "no overlapping ranges" was only initially enforced in the S3A connector, which would raise `UnsupportedOperationException`. -Adding the range check as a precondition for all implementations guarantees -consistent behavior everywhere. +Adding the range check as a precondition for all implementations (Raw Local +being an exception) guarantees consistent behavior everywhere. +The reason Raw Local doesn't have this precondition is ChecksumFileSystem +creates the chunked ranges based on the checksum chunk size and then calls +readVectored on Raw Local which may lead to overlapping ranges in some cases. +For details see [HADOOP-19291](https://issues.apache.org/jira/browse/HADOOP-19291) + For reliable use with older hadoop releases with the API: sort the list of ranges and check for overlaps before calling `readVectored()`. diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractVectoredReadTest.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractVectoredReadTest.java index aa478f3af6..dcdfba2add 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractVectoredReadTest.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractVectoredReadTest.java @@ -270,13 +270,23 @@ public void testSomeRangesMergedSomeUnmerged() throws Exception { } /** - * Vectored IO doesn't support overlapping ranges. + * Most file systems won't support overlapping ranges. + * Currently, only Raw Local supports it. */ @Test public void testOverlappingRanges() throws Exception { - verifyExceptionalVectoredRead( - getSampleOverlappingRanges(), - IllegalArgumentException.class); + if (!isSupported(VECTOR_IO_OVERLAPPING_RANGES)) { + verifyExceptionalVectoredRead( + getSampleOverlappingRanges(), + IllegalArgumentException.class); + } else { + try (FSDataInputStream in = openVectorFile()) { + List fileRanges = getSampleOverlappingRanges(); + in.readVectored(fileRanges, allocate); + validateVectoredReadResult(fileRanges, DATASET, 0); + returnBuffersToPoolPostRead(fileRanges, pool); + } + } } /** @@ -284,9 +294,18 @@ public void testOverlappingRanges() throws Exception { */ @Test public void testSameRanges() throws Exception { - verifyExceptionalVectoredRead( - getSampleSameRanges(), - IllegalArgumentException.class); + if (!isSupported(VECTOR_IO_OVERLAPPING_RANGES)) { + verifyExceptionalVectoredRead( + getSampleSameRanges(), + IllegalArgumentException.class); + } else { + try (FSDataInputStream in = openVectorFile()) { + List fileRanges = getSampleSameRanges(); + in.readVectored(fileRanges, allocate); + validateVectoredReadResult(fileRanges, DATASET, 0); + returnBuffersToPoolPostRead(fileRanges, pool); + } + } } /** @@ -329,10 +348,9 @@ public void testSomeRandomNonOverlappingRanges() throws Exception { public void testConsecutiveRanges() throws Exception { List fileRanges = new ArrayList<>(); final int offset = 500; - final int length = 100; + final int length = 2011; range(fileRanges, offset, length); - range(fileRanges, 600, 200); - range(fileRanges, 800, 100); + range(fileRanges, offset + length, length); try (FSDataInputStream in = openVectorFile()) { in.readVectored(fileRanges, allocate); validateVectoredReadResult(fileRanges, DATASET, 0); diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/ContractOptions.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/ContractOptions.java index f7cf27fb69..7f092ff0d4 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/ContractOptions.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/ContractOptions.java @@ -261,4 +261,6 @@ public interface ContractOptions { * Does vector read check file length on open rather than in the read call? */ String VECTOR_IO_EARLY_EOF_CHECK = "vector-io-early-eof-check"; + + String VECTOR_IO_OVERLAPPING_RANGES = "vector-io-overlapping-ranges"; } diff --git a/hadoop-common-project/hadoop-common/src/test/resources/contract/rawlocal.xml b/hadoop-common-project/hadoop-common/src/test/resources/contract/rawlocal.xml index 198ca566e2..b538b15c19 100644 --- a/hadoop-common-project/hadoop-common/src/test/resources/contract/rawlocal.xml +++ b/hadoop-common-project/hadoop-common/src/test/resources/contract/rawlocal.xml @@ -142,4 +142,9 @@ true + + fs.contract.vector-io-overlapping-ranges + true + +