HADOOP-18168. Fix S3A ITestMarkerTool use of purged public bucket. (#4140)

This moves off use of the purged s3a://landsat-pds bucket, so fixing tests
which had started failing.
* Adds a new class, PublicDatasetTestUtils to manage the use of public datasets.
* The new test bucket s3a://usgs-landsat/ is requester pays, so depends upon
  HADOOP-14661.

Consult the updated test documentation when running against other S3 stores.

Contributed by Daniel Carl Jones

Change-Id: Ie8585e4d9b67667f8cb80b2970225d79a4f8d257
This commit is contained in:
Daniel Carl Jones 2022-05-03 14:26:52 +01:00 committed by Steve Loughran
parent c2b2494d8b
commit 4c977f5f71
6 changed files with 170 additions and 18 deletions

View File

@ -544,6 +544,18 @@ which address issues. In particular, we encourage testing of Hadoop release
candidates, as these third-party endpoints get even less testing than the
S3 endpoint itself.
### Public datasets used in tests
Some tests rely on the presence of existing public datasets available on Amazon S3.
You may find a number of these in `org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils`.
When testing against an endpoint which is not part of Amazon S3's standard commercial partition
(`aws`) such as third-party implementations or AWS's China regions, you should replace these
configurations with an empty space (` `) to disable the tests or an existing path in your object
store that supports these tests.
An example of this might be the marker tool tests, which require a bucket with a large number
of objects, or the requester pays tests, which require requester pays to be enabled for the bucket.
### Disabling the encryption tests ### Disabling the encryption tests

View File

@ -26,11 +26,11 @@
import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils;
import org.apache.hadoop.fs.statistics.IOStatisticAssertions; import org.apache.hadoop.fs.statistics.IOStatisticAssertions;
import org.apache.hadoop.fs.statistics.StreamStatisticNames; import org.apache.hadoop.fs.statistics.StreamStatisticNames;
import static org.apache.hadoop.fs.s3a.Constants.ALLOW_REQUESTER_PAYS; import static org.apache.hadoop.fs.s3a.Constants.ALLOW_REQUESTER_PAYS;
import static org.apache.hadoop.fs.s3a.Constants.ENDPOINT;
import static org.apache.hadoop.fs.s3a.Constants.S3A_BUCKET_PROBE; import static org.apache.hadoop.fs.s3a.Constants.S3A_BUCKET_PROBE;
import static org.apache.hadoop.test.LambdaTestUtils.intercept; import static org.apache.hadoop.test.LambdaTestUtils.intercept;
@ -42,10 +42,15 @@ public class ITestS3ARequesterPays extends AbstractS3ATestBase {
@Override @Override
protected Configuration createConfiguration() { protected Configuration createConfiguration() {
Configuration conf = super.createConfiguration(); Configuration conf = super.createConfiguration();
S3ATestUtils.removeBaseAndBucketOverrides(conf,
Path requesterPaysPath = getRequesterPaysPath(conf);
String requesterPaysBucketName = requesterPaysPath.toUri().getHost();
S3ATestUtils.removeBaseAndBucketOverrides(
requesterPaysBucketName,
conf,
ALLOW_REQUESTER_PAYS, ALLOW_REQUESTER_PAYS,
ENDPOINT,
S3A_BUCKET_PROBE); S3A_BUCKET_PROBE);
return conf; return conf;
} }
@ -102,14 +107,8 @@ public void testRequesterPaysDisabledFails() throws Throwable {
} }
} }
private Path getRequesterPaysPath(Configuration conf) { private static Path getRequesterPaysPath(Configuration conf) {
String requesterPaysFile = return new Path(PublicDatasetTestUtils.getRequesterPaysObject(conf));
conf.getTrimmed(KEY_REQUESTER_PAYS_FILE, DEFAULT_REQUESTER_PAYS_FILE);
S3ATestUtils.assume(
"Empty test property: " + KEY_REQUESTER_PAYS_FILE,
!requesterPaysFile.isEmpty()
);
return new Path(requesterPaysFile);
} }
} }

View File

@ -20,6 +20,9 @@
import java.time.Duration; import java.time.Duration;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils;
/** /**
* Constants for S3A Testing. * Constants for S3A Testing.
*/ */
@ -99,14 +102,19 @@ public interface S3ATestConstants {
/** /**
* Configuration key for an existing object in a requester pays bucket: {@value}. * Configuration key for an existing object in a requester pays bucket: {@value}.
* If not set, defaults to {@value DEFAULT_REQUESTER_PAYS_FILE}. *
* Accessible via
* {@link PublicDatasetTestUtils#getRequesterPaysObject(Configuration)}.
*/ */
String KEY_REQUESTER_PAYS_FILE = TEST_FS_S3A + "requester.pays.file"; String KEY_REQUESTER_PAYS_FILE = TEST_FS_S3A + "requester.pays.file";
/** /**
* Default path for an S3 object inside a requester pays enabled bucket: {@value}. * Configuration key for an existing bucket with many objects: {@value}.
*
* This is used for tests depending on buckets with a large number of keys.
*/ */
String DEFAULT_REQUESTER_PAYS_FILE = "s3a://usgs-landsat/collection02/catalog.json"; String KEY_BUCKET_WITH_MANY_OBJECTS
= TEST_FS_S3A + "bucket-with-many-objects";
/** /**
* Name of the property to define the timeout for scale tests: {@value}. * Name of the property to define the timeout for scale tests: {@value}.

View File

@ -0,0 +1,101 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.fs.s3a.test;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.s3a.S3ATestUtils;
import static org.apache.hadoop.fs.s3a.S3ATestConstants.KEY_BUCKET_WITH_MANY_OBJECTS;
import static org.apache.hadoop.fs.s3a.S3ATestConstants.KEY_REQUESTER_PAYS_FILE;
/**
 * Supplies S3A filesystem URIs pointing at publicly available datasets,
 * for tests with particular data requirements.
 *
 * Centralizing these URIs makes the S3A test suite's dependency on
 * external datasets explicit, and gives a single, standardized point at
 * which replacement locations can be configured.
 *
 * Per-bucket settings for these datasets (endpoint, requester pays, etc.)
 * belong in "hadoop-tools/hadoop-aws/src/test/resources/core-site.xml".
 *
 * Warning: methods may mutate the configuration instance passed in.
 */
@InterfaceAudience.Private
@InterfaceStability.Unstable
public final class PublicDatasetTestUtils {

  /**
   * Default path for an object inside a requester pays bucket: {@value}.
   */
  private static final String DEFAULT_REQUESTER_PAYS_FILE
      = "s3a://usgs-landsat/collection02/catalog.json";

  /**
   * Default bucket for an S3A file system with many objects: {@value}.
   *
   * We use a subdirectory to ensure we have permissions on all objects
   * contained within as well as permission to inspect the directory itself.
   */
  private static final String DEFAULT_BUCKET_WITH_MANY_OBJECTS
      = "s3a://usgs-landsat/collection02/level-1/";

  /** Utility class: no instances. */
  private PublicDatasetTestUtils() {
  }

  /**
   * Get the URI of a directory prefix containing many objects.
   *
   * Resolves to {@value DEFAULT_BUCKET_WITH_MANY_OBJECTS} unless
   * overridden in the configuration.
   *
   * @param conf Hadoop configuration
   * @return S3A FS URI
   */
  public static String getBucketPrefixWithManyObjects(Configuration conf) {
    return resolveOrSkip(conf,
        KEY_BUCKET_WITH_MANY_OBJECTS, DEFAULT_BUCKET_WITH_MANY_OBJECTS);
  }

  /**
   * Get the URI of an object within a requester pays enabled bucket.
   *
   * Resolves to {@value DEFAULT_REQUESTER_PAYS_FILE} unless
   * overridden in the configuration.
   *
   * @param conf Hadoop configuration
   * @return S3A FS URI
   */
  public static String getRequesterPaysObject(Configuration conf) {
    return resolveOrSkip(conf,
        KEY_REQUESTER_PAYS_FILE, DEFAULT_REQUESTER_PAYS_FILE);
  }

  /**
   * Resolve a trimmed configuration value, falling back to the supplied
   * default; skip the calling test (JUnit assume) if the result is empty,
   * as an empty value is the documented way to disable these tests.
   *
   * @param conf configuration to query
   * @param key configuration key to look up
   * @param defaultValue value used when the key is unset
   * @return the non-empty resolved value
   */
  private static String resolveOrSkip(Configuration conf,
      String key,
      String defaultValue) {
    final String value = conf.getTrimmed(key, defaultValue);
    S3ATestUtils.assume("Empty test property: " + key, !value.isEmpty());
    return value;
  }
}

View File

@ -28,9 +28,11 @@
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.contract.ContractTestUtils; import org.apache.hadoop.fs.contract.ContractTestUtils;
import org.apache.hadoop.fs.s3a.test.PublicDatasetTestUtils;
import org.apache.hadoop.fs.s3a.S3AFileSystem; import org.apache.hadoop.fs.s3a.S3AFileSystem;
import static org.apache.hadoop.fs.s3a.Constants.DIRECTORY_MARKER_POLICY_AUTHORITATIVE; import static org.apache.hadoop.fs.s3a.Constants.DIRECTORY_MARKER_POLICY_AUTHORITATIVE;
@ -307,22 +309,25 @@ public void testRunLimitedAudit() throws Throwable {
} }
/** /**
* Run an audit against the landsat bucket. * Run an audit against a bucket with a large number of objects.
* <p></p> * <p></p>
* This tests paging/scale against a larger bucket without * This tests paging/scale against a larger bucket without
* worrying about setup costs. * worrying about setup costs.
*/ */
@Test @Test
public void testRunLimitedLandsatAudit() throws Throwable { public void testRunAuditManyObjectsInBucket() throws Throwable {
describe("Audit a few thousand landsat objects"); describe("Audit a few thousand objects");
final File audit = tempAuditFile(); final File audit = tempAuditFile();
Configuration conf = super.createConfiguration();
String bucketUri = PublicDatasetTestUtils.getBucketPrefixWithManyObjects(conf);
runToFailure(EXIT_INTERRUPTED, runToFailure(EXIT_INTERRUPTED,
MARKERS, MARKERS,
AUDIT, AUDIT,
m(OPT_LIMIT), 3000, m(OPT_LIMIT), 3000,
m(OPT_OUT), audit, m(OPT_OUT), audit,
LANDSAT_BUCKET); bucketUri);
readOutput(audit); readOutput(audit);
} }

View File

@ -30,6 +30,8 @@
<final>false</final> <final>false</final>
</property> </property>
<!-- Per-bucket configurations: landsat-pds -->
<property> <property>
<name>fs.s3a.bucket.landsat-pds.endpoint</name> <name>fs.s3a.bucket.landsat-pds.endpoint</name>
<value>${central.endpoint}</value> <value>${central.endpoint}</value>
@ -55,6 +57,31 @@
<description>Do not add the referrer header to landsat operations</description> <description>Do not add the referrer header to landsat operations</description>
</property> </property>
<!-- Per-bucket configurations: usgs-landsat -->
<property>
<name>fs.s3a.bucket.usgs-landsat.endpoint</name>
<value>${central.endpoint}</value>
</property>
<property>
<name>fs.s3a.bucket.usgs-landsat.requester.pays.enabled</name>
<value>true</value>
<description>usgs-landsat requires requester pays enabled</description>
</property>
<property>
<name>fs.s3a.bucket.usgs-landsat.multipart.purge</name>
<value>false</value>
<description>Don't try to purge uploads in the read-only bucket, as
it will only create log noise.</description>
</property>
<property>
<name>fs.s3a.bucket.usgs-landsat.audit.add.referrer.header</name>
<value>false</value>
</property>
<!-- <!--
This is the default endpoint, which can be used to interact This is the default endpoint, which can be used to interact