From 708a0ce21bc1bd6164ff650e9104de70fe3f1dbb Mon Sep 17 00:00:00 2001 From: Steve Loughran Date: Tue, 22 Mar 2022 13:20:37 +0000 Subject: [PATCH] HADOOP-13704. Optimized S3A getContentSummary() Optimize the scan for s3 by performing a deep tree listing, inferring directory counts from the paths returned. Contributed by Ahmar Suhail. Change-Id: I26ffa8c6f65fd11c68a88d6e2243b0eac6ffd024 --- .../site/markdown/filesystem/filesystem.md | 20 ++++ .../AbstractContractContentSummaryTest.java | 65 +++++++++++ .../TestLocalFSContractContentSummary.java | 31 ++++++ .../apache/hadoop/fs/s3a/S3AFileSystem.java | 6 +- .../hadoop/fs/s3a/S3ObjectAttributes.java | 2 +- .../s3a/impl/GetContentSummaryOperation.java | 102 +++++++++++------- .../s3a/ITestS3AContractContentSummary.java | 70 ++++++++++++ .../ITestS3AMiscOperationCost.java | 4 +- .../scale/ITestS3ADirectoryPerformance.java | 33 ++++++ 9 files changed, 288 insertions(+), 45 deletions(-) create mode 100644 hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractContentSummaryTest.java create mode 100644 hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/localfs/TestLocalFSContractContentSummary.java create mode 100644 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/ITestS3AContractContentSummary.java diff --git a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/filesystem.md b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/filesystem.md index 0e01aa1dc8..4517bd8ff4 100644 --- a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/filesystem.md +++ b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/filesystem.md @@ -453,6 +453,26 @@ The function `getLocatedFileStatus(FS, d)` is as defined in The atomicity and consistency constraints are as for `listStatus(Path, PathFilter)`. + +### `ContentSummary getContentSummary(Path path)` + +Given a path return its content summary. + +`getContentSummary()` first checks if the given path is a file and if yes, it returns 0 for directory count +and 1 for file count. + +#### Preconditions + + exists(FS, path) else raise FileNotFoundException + +#### Postconditions + +Returns a `ContentSummary` object with information such as directory count +and file count for a given path. + +The atomicity and consistency constraints are as for +`listStatus(Path, PathFilter)`. + ### `BlockLocation[] getFileBlockLocations(FileStatus f, int s, int l)` #### Preconditions diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractContentSummaryTest.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractContentSummaryTest.java new file mode 100644 index 0000000000..5e5c917395 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractContentSummaryTest.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.contract; + +import org.apache.hadoop.fs.ContentSummary; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +import org.assertj.core.api.Assertions; +import org.junit.Test; + +import java.io.FileNotFoundException; + +import static org.apache.hadoop.fs.contract.ContractTestUtils.touch; +import static org.apache.hadoop.test.LambdaTestUtils.intercept; + +public abstract class AbstractContractContentSummaryTest extends AbstractFSContractTestBase { + + @Test + public void testGetContentSummary() throws Throwable { + FileSystem fs = getFileSystem(); + + Path parent = path("parent"); + Path nested = path(parent + "/a/b/c"); + Path filePath = path(nested + "file.txt"); + + fs.mkdirs(parent); + fs.mkdirs(nested); + touch(getFileSystem(), filePath); + + ContentSummary summary = fs.getContentSummary(parent); + + Assertions.assertThat(summary.getDirectoryCount()).as("Summary " + summary).isEqualTo(4); + + Assertions.assertThat(summary.getFileCount()).as("Summary " + summary).isEqualTo(1); + } + + @Test + public void testGetContentSummaryIncorrectPath() throws Throwable { + FileSystem fs = getFileSystem(); + + Path parent = path("parent"); + Path nested = path(parent + "/a"); + + fs.mkdirs(parent); + + intercept(FileNotFoundException.class, () -> fs.getContentSummary(nested)); + } +} diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/localfs/TestLocalFSContractContentSummary.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/localfs/TestLocalFSContractContentSummary.java new file mode 100644 index 0000000000..7555cf8515 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/localfs/TestLocalFSContractContentSummary.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.contract.localfs; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.contract.AbstractContractContentSummaryTest; +import org.apache.hadoop.fs.contract.AbstractFSContract; + +public class TestLocalFSContractContentSummary extends AbstractContractContentSummaryTest { + + @Override + protected AbstractFSContract createContract(Configuration conf) { + return new LocalFSContract(conf); + } +} diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java index 86da70ecdd..4b450c4dcc 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java @@ -3266,9 +3266,9 @@ public S3AFileStatus probePathStatus(final Path path, } @Override - public RemoteIterator listStatusIterator(final Path path) - throws IOException { - return S3AFileSystem.this.innerListStatus(path); + public RemoteIterator listFilesIterator(final Path path, + final boolean recursive) throws IOException { + return S3AFileSystem.this.innerListFiles(path, recursive, Listing.ACCEPT_ALL_BUT_S3N, null); } } diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ObjectAttributes.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ObjectAttributes.java index 5a8dfc7bc8..275b207cd6 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ObjectAttributes.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ObjectAttributes.java @@ -66,7 +66,7 @@ public S3ObjectAttributes( /** * Construct from the result of a copy and those parameters * which aren't included in an AWS SDK response. - * @param path + * @param path path * @param copyResult copy result. * @param serverSideEncryptionAlgorithm current encryption algorithm * @param serverSideEncryptionKey any server side encryption key? diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/GetContentSummaryOperation.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/GetContentSummaryOperation.java index 23631c67e3..248bffb940 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/GetContentSummaryOperation.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/impl/GetContentSummaryOperation.java @@ -20,6 +20,7 @@ import java.io.FileNotFoundException; import java.io.IOException; +import java.util.HashSet; import java.util.Set; import org.slf4j.Logger; @@ -34,22 +35,15 @@ import org.apache.hadoop.fs.statistics.IOStatistics; import org.apache.hadoop.fs.statistics.IOStatisticsSnapshot; import org.apache.hadoop.fs.statistics.IOStatisticsSource; +import org.apache.hadoop.fs.s3a.S3ALocatedFileStatus; import static org.apache.hadoop.fs.statistics.IOStatisticsSupport.retrieveIOStatistics; /** * GetContentSummary operation. - * This is based on {@code FileSystem.get#getContentSummary}; - * its still doing sequential treewalk with the efficiency - * issues. * - * Changes: - * 1. On the recursive calls there - * is no probe to see if the path is a file: we know the - * recursion only happens with a dir. - * 2. If a subdirectory is not found during the walk, that - * does not trigger an error. The directory is clearly - * not part of the content any more. + * It is optimized for s3 and performs a deep tree listing, + * inferring directory counts from the paths returned. * * The Operation serves up IOStatistics; this counts * the cost of all the list operations, but not the @@ -122,9 +116,7 @@ public ContentSummary execute() throws IOException { /** * Return the {@link ContentSummary} of a given directory. - * This is a recursive operation (as the original is); - * it'd be more efficient of stack and heap if it managed its - * own stack. + * * @param dir dir to scan * @throws FileNotFoundException if the path does not resolve * @throws IOException IO failure @@ -133,34 +125,65 @@ public ContentSummary execute() throws IOException { * @throws IOException failure */ public ContentSummary getDirSummary(Path dir) throws IOException { + long totalLength = 0; long fileCount = 0; long dirCount = 1; - final RemoteIterator it - = callbacks.listStatusIterator(dir); + + RemoteIterator it = callbacks.listFilesIterator(dir, true); + + Set dirSet = new HashSet<>(); + Set pathsTraversed = new HashSet<>(); while (it.hasNext()) { - final S3AFileStatus s = it.next(); - if (s.isDirectory()) { - try { - ContentSummary c = getDirSummary(s.getPath()); - totalLength += c.getLength(); - fileCount += c.getFileCount(); - dirCount += c.getDirectoryCount(); - } catch (FileNotFoundException ignored) { - // path was deleted during the scan; exclude from - // summary. - } - } else { - totalLength += s.getLen(); + S3ALocatedFileStatus fileStatus = it.next(); + Path filePath = fileStatus.getPath(); + + if (fileStatus.isDirectory() && !filePath.equals(dir)) { + dirSet.add(filePath); + buildDirectorySet(dirSet, pathsTraversed, dir, filePath.getParent()); + } else if (!fileStatus.isDirectory()) { fileCount += 1; + totalLength += fileStatus.getLen(); + buildDirectorySet(dirSet, pathsTraversed, dir, filePath.getParent()); } + } + // Add the list's IOStatistics iostatistics.aggregate(retrieveIOStatistics(it)); + return new ContentSummary.Builder().length(totalLength). - fileCount(fileCount).directoryCount(dirCount). - spaceConsumed(totalLength).build(); + fileCount(fileCount).directoryCount(dirCount + dirSet.size()). + spaceConsumed(totalLength).build(); + } + + /*** + * This method builds the set of all directories found under the base path. We need to do this + * because if the directory structure /a/b/c was created with a single mkdirs() call, it is + * stored as 1 object in S3 and the list files iterator will only return a single entry /a/b/c. + * + * We keep track of paths traversed so far to prevent duplication of work. For eg, if we had + * a/b/c/file-1.txt and /a/b/c/file-2.txt, we will only recurse over the complete path once + * and won't have to do anything for file-2.txt. + * + * @param dirSet Set of all directories found in the path + * @param pathsTraversed Set of all paths traversed so far + * @param basePath Path of directory to scan + * @param parentPath Parent path of the current file/directory in the iterator + */ + private void buildDirectorySet(Set dirSet, Set pathsTraversed, Path basePath, + Path parentPath) { + + if (parentPath == null || pathsTraversed.contains(parentPath) || parentPath.equals(basePath)) { + return; + } + + dirSet.add(parentPath); + + buildDirectorySet(dirSet, pathsTraversed, basePath, parentPath.getParent()); + + pathsTraversed.add(parentPath); } /** @@ -186,23 +209,24 @@ public interface GetContentSummaryCallbacks { /** * Get the status of a path. - * @param path path to probe. + * + * @param path path to probe. * @param probes probes to exec * @return the status * @throws IOException failure */ @Retries.RetryTranslated - S3AFileStatus probePathStatus(Path path, - Set probes) throws IOException; + S3AFileStatus probePathStatus(Path path, Set probes) throws IOException; - /** - * Incremental list of all entries in a directory. - * @param path path of dir - * @return an iterator + /*** + * List all entries under a path. + * + * @param path + * @param recursive if the subdirectories need to be traversed recursively + * @return an iterator over the listing. * @throws IOException failure */ - RemoteIterator listStatusIterator(Path path) + RemoteIterator listFilesIterator(Path path, boolean recursive) throws IOException; - } } diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/ITestS3AContractContentSummary.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/ITestS3AContractContentSummary.java new file mode 100644 index 0000000000..ad83cfe52d --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/ITestS3AContractContentSummary.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.contract.s3a; + +import org.assertj.core.api.Assertions; +import org.junit.Test; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.ContentSummary; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.contract.AbstractContractContentSummaryTest; +import org.apache.hadoop.fs.contract.AbstractFSContract; +import org.apache.hadoop.fs.s3a.S3AFileSystem; + +import static org.apache.hadoop.fs.contract.ContractTestUtils.touch; + +public class ITestS3AContractContentSummary extends AbstractContractContentSummaryTest { + + @Test + public void testGetContentSummaryDir() throws Throwable { + describe("getContentSummary on test dir with children"); + S3AFileSystem fs = getFileSystem(); + Path baseDir = methodPath(); + + // Nested folders created separately will return as separate objects in listFiles() + fs.mkdirs(new Path(baseDir, "a")); + fs.mkdirs(new Path(baseDir, "a/b")); + fs.mkdirs(new Path(baseDir, "a/b/a")); + + // Will return as one object + fs.mkdirs(new Path(baseDir, "d/e/f")); + + Path filePath = new Path(baseDir, "a/b/file"); + touch(fs, filePath); + + // look at path to see if it is a file + // it is not: so LIST + final ContentSummary summary = fs.getContentSummary(baseDir); + + Assertions.assertThat(summary.getDirectoryCount()).as("Summary " + summary).isEqualTo(7); + Assertions.assertThat(summary.getFileCount()).as("Summary " + summary).isEqualTo(1); + } + + @Override + protected AbstractFSContract createContract(Configuration conf) { + return new S3AContract(conf); + } + + @Override + public S3AFileSystem getFileSystem() { + return (S3AFileSystem) super.getFileSystem(); + } + +} diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/performance/ITestS3AMiscOperationCost.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/performance/ITestS3AMiscOperationCost.java index 75701b50d2..6a6baa03ea 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/performance/ITestS3AMiscOperationCost.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/performance/ITestS3AMiscOperationCost.java @@ -144,8 +144,8 @@ public void testGetContentSummaryDir() throws Throwable { with(INVOCATION_GET_CONTENT_SUMMARY, 1), withAuditCount(1), always(FILE_STATUS_FILE_PROBE // look at path to see if it is a file - .plus(LIST_OPERATION) // it is not: so LIST - .plus(LIST_OPERATION))); // and a LIST on the child dir + .plus(LIST_OPERATION))); // it is not: so LIST + Assertions.assertThat(summary.getDirectoryCount()) .as("Summary " + summary) .isEqualTo(2); diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/ITestS3ADirectoryPerformance.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/ITestS3ADirectoryPerformance.java index d87af3bac5..946e59e9e3 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/ITestS3ADirectoryPerformance.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/ITestS3ADirectoryPerformance.java @@ -19,6 +19,7 @@ package org.apache.hadoop.fs.s3a.scale; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.ContentSummary; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.LocatedFileStatus; @@ -154,6 +155,38 @@ public void testListOperations() throws Throwable { listStatusCalls, getFileStatusCalls); + describe("Get content summary for directory"); + + NanoTimer getContentSummaryTimer = new NanoTimer(); + + ContentSummary rootPathSummary = fs.getContentSummary(scaleTestDir); + ContentSummary testPathSummary = fs.getContentSummary(listDir); + + getContentSummaryTimer.end("getContentSummary of %s", created); + + // only two list operations should have taken place + print(LOG, + metadataRequests, + listRequests, + listContinueRequests, + listStatusCalls, + getFileStatusCalls); + assertEquals(listRequests.toString(), 2, listRequests.diff()); + reset(metadataRequests, + listRequests, + listContinueRequests, + listStatusCalls, + getFileStatusCalls); + + assertTrue("Root directory count should be > test path", + rootPathSummary.getDirectoryCount() > testPathSummary.getDirectoryCount()); + assertTrue("Root file count should be >= to test path", + rootPathSummary.getFileCount() >= testPathSummary.getFileCount()); + assertEquals("Incorrect directory count", created.getDirCount() + 1, + testPathSummary.getDirectoryCount()); + assertEquals("Incorrect file count", created.getFileCount(), + testPathSummary.getFileCount()); + } finally { describe("deletion"); // deletion at the end of the run