HADOOP-19291. RawLocalFileSystem to allow overlapping ranges (#7101)

ChecksumFileSystem creates the chunked ranges based on the checksum chunk size and then calls
readVectored on Raw Local, which may lead to overlapping ranges in some cases.

Contributed by: Mukund Thakur
This commit is contained in:
Mukund Thakur 2024-10-09 08:34:47 -05:00 committed by GitHub
parent dc56fc385a
commit e4b070025b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 46 additions and 15 deletions

View File

@ -68,7 +68,8 @@
import org.apache.hadoop.util.Shell;
import org.apache.hadoop.util.StringUtils;
import static org.apache.hadoop.fs.VectoredReadUtils.validateAndSortRanges;
import static org.apache.hadoop.fs.VectoredReadUtils.sortRangeList;
import static org.apache.hadoop.fs.VectoredReadUtils.validateRangeRequest;
import static org.apache.hadoop.fs.impl.PathCapabilitiesSupport.validatePathCapabilityArgs;
import static org.apache.hadoop.fs.statistics.StreamStatisticNames.STREAM_READ_BYTES;
import static org.apache.hadoop.fs.statistics.StreamStatisticNames.STREAM_READ_EXCEPTIONS;
@ -320,10 +321,10 @@ public void readVectored(List<? extends FileRange> ranges,
IntFunction<ByteBuffer> allocate) throws IOException {
// Validate, but do not pass in a file length as it may change.
List<? extends FileRange> sortedRanges = validateAndSortRanges(ranges,
Optional.empty());
List<? extends FileRange> sortedRanges = sortRangeList(ranges);
// Set up all of the futures, so that we can use them if things fail
for(FileRange range: sortedRanges) {
validateRangeRequest(range);
range.setData(new CompletableFuture<>());
}
try {

View File

@ -623,8 +623,13 @@ support -and fallback everywhere else.
The restriction "no overlapping ranges" was initially enforced only in
the S3A connector, which would raise `UnsupportedOperationException`.
Adding the range check as a precondition for all implementations guarantees
consistent behavior everywhere.
Adding the range check as a precondition for all implementations (Raw Local
being an exception) guarantees consistent behavior everywhere.
The reason Raw Local doesn't have this precondition is that ChecksumFileSystem
creates the chunked ranges based on the checksum chunk size and then calls
readVectored on Raw Local, which may lead to overlapping ranges in some cases.
For details see [HADOOP-19291](https://issues.apache.org/jira/browse/HADOOP-19291).
For reliable use of the API with older Hadoop releases: sort the list of ranges
and check for overlaps before calling `readVectored()`.

View File

@ -270,13 +270,23 @@ public void testSomeRangesMergedSomeUnmerged() throws Exception {
}
/**
* Vectored IO doesn't support overlapping ranges.
* Most file systems won't support overlapping ranges.
* Currently, only Raw Local supports it.
*/
@Test
public void testOverlappingRanges() throws Exception {
verifyExceptionalVectoredRead(
getSampleOverlappingRanges(),
IllegalArgumentException.class);
if (!isSupported(VECTOR_IO_OVERLAPPING_RANGES)) {
verifyExceptionalVectoredRead(
getSampleOverlappingRanges(),
IllegalArgumentException.class);
} else {
try (FSDataInputStream in = openVectorFile()) {
List<FileRange> fileRanges = getSampleOverlappingRanges();
in.readVectored(fileRanges, allocate);
validateVectoredReadResult(fileRanges, DATASET, 0);
returnBuffersToPoolPostRead(fileRanges, pool);
}
}
}
/**
@ -284,9 +294,18 @@ public void testOverlappingRanges() throws Exception {
*/
@Test
public void testSameRanges() throws Exception {
verifyExceptionalVectoredRead(
getSampleSameRanges(),
IllegalArgumentException.class);
if (!isSupported(VECTOR_IO_OVERLAPPING_RANGES)) {
verifyExceptionalVectoredRead(
getSampleSameRanges(),
IllegalArgumentException.class);
} else {
try (FSDataInputStream in = openVectorFile()) {
List<FileRange> fileRanges = getSampleSameRanges();
in.readVectored(fileRanges, allocate);
validateVectoredReadResult(fileRanges, DATASET, 0);
returnBuffersToPoolPostRead(fileRanges, pool);
}
}
}
/**
@ -329,10 +348,9 @@ public void testSomeRandomNonOverlappingRanges() throws Exception {
public void testConsecutiveRanges() throws Exception {
List<FileRange> fileRanges = new ArrayList<>();
final int offset = 500;
final int length = 100;
final int length = 2011;
range(fileRanges, offset, length);
range(fileRanges, 600, 200);
range(fileRanges, 800, 100);
range(fileRanges, offset + length, length);
try (FSDataInputStream in = openVectorFile()) {
in.readVectored(fileRanges, allocate);
validateVectoredReadResult(fileRanges, DATASET, 0);

View File

@ -261,4 +261,6 @@ public interface ContractOptions {
* Does vector read check file length on open rather than in the read call?
*/
String VECTOR_IO_EARLY_EOF_CHECK = "vector-io-early-eof-check";
String VECTOR_IO_OVERLAPPING_RANGES = "vector-io-overlapping-ranges";
}

View File

@ -142,4 +142,9 @@
<value>true</value>
</property>
<property>
<name>fs.contract.vector-io-overlapping-ranges</name>
<value>true</value>
</property>
</configuration>