From f365957c6326f88734bc0a5d01cfb7eac713db20 Mon Sep 17 00:00:00 2001 From: Steve Loughran Date: Tue, 5 Feb 2019 11:51:02 +0000 Subject: [PATCH] HADOOP-15229. Add FileSystem builder-based openFile() API to match createFile(); S3A to implement S3 Select through this API. The new openFile() API is asynchronous, and implemented across FileSystem and FileContext. The MapReduce V2 inputs are moved to this API, and you can actually set must/may options to pass in. This is more useful for setting things like s3a seek policy than for S3 select, as the existing input format/record readers can't handle S3 select output where the stream is shorter than the file length, and splitting plain text is suboptimal. Future work is needed there. In the meantime, any/all filesystem connectors are now free to add their own filesystem-specific configuration parameters which can be set in jobs and used to set filesystem input stream options (seek policy, retry, encryption secrets, etc). Contributed by Steve Loughran --- .../apache/hadoop/fs/AbstractFileSystem.java | 33 + .../hadoop/fs/DelegateToFileSystem.java | 20 + .../java/org/apache/hadoop/fs/FSBuilder.java | 131 ++ .../hadoop/fs/FSDataOutputStreamBuilder.java | 193 +-- .../org/apache/hadoop/fs/FileContext.java | 69 +- .../java/org/apache/hadoop/fs/FileSystem.java | 188 ++- .../apache/hadoop/fs/FilterFileSystem.java | 33 + .../java/org/apache/hadoop/fs/FilterFs.java | 13 + .../fs/FutureDataInputStreamBuilder.java | 50 + .../hadoop/fs/impl/AbstractFSBuilderImpl.java | 356 ++++++ .../FutureDataInputStreamBuilderImpl.java | 141 +++ .../hadoop/fs/impl/FutureIOSupport.java | 191 +++ .../hadoop/fs/impl/WrappedIOException.java | 56 + .../apache/hadoop/fs/impl/package-info.java | 49 + .../hadoop/io/compress/PassthroughCodec.java | 246 ++++ .../org/apache/hadoop/util/LambdaUtils.java | 59 + .../src/main/resources/core-default.xml | 112 ++ .../site/markdown/filesystem/filesystem.md | 87 +- .../markdown/filesystem/fsdatainputstream.md | 14 + .../filesystem/fsdatainputstreambuilder.md | 112 ++ .../filesystem/fsdataoutputstreambuilder.md | 6 +- .../fs/FileContextMainOperationsBaseTest.java | 95 +- .../apache/hadoop/fs/TestHarFileSystem.java | 20 + .../apache/hadoop/fs/TestLocalFileSystem.java | 2 +- .../fs/contract/AbstractContractOpenTest.java | 135 +- .../AbstractContractPathHandleTest.java | 61 + .../hadoop/fs/contract/ContractTestUtils.java | 31 + .../apache/hadoop/test/LambdaTestUtils.java | 130 ++ .../hadoop/test/TestLambdaTestUtils.java | 114 +- .../hadoop/hdfs/DistributedFileSystem.java | 2 +- .../contract/hdfs/TestHDFSContractOpen.java | 2 +- .../hadoop/mapred/LineRecordReader.java | 12 +- .../apache/hadoop/mapreduce/MRJobConfig.java | 14 + .../lib/input/FixedLengthRecordReader.java | 14 +- .../mapreduce/lib/input/LineRecordReader.java | 12 +- .../mapreduce/lib/input/NLineInputFormat.java | 12 +- .../dev-support/findbugs-exclude.xml | 5 + .../hadoop/fs/s3a/InternalConstants.java | 53 + .../apache/hadoop/fs/s3a/S3AFileSystem.java | 203 ++- .../apache/hadoop/fs/s3a/S3AInputStream.java | 43 +- .../hadoop/fs/s3a/S3AInstrumentation.java | 3 +- .../apache/hadoop/fs/s3a/S3AOpContext.java | 25 + .../hadoop/fs/s3a/S3AReadOpContext.java | 97 +- .../org/apache/hadoop/fs/s3a/S3AUtils.java | 9 +- .../hadoop/fs/s3a/S3ObjectAttributes.java | 27 +- .../org/apache/hadoop/fs/s3a/Statistic.java | 2 + .../hadoop/fs/s3a/WriteOperationHelper.java | 105 +- .../hadoop/fs/s3a/s3guard/S3GuardTool.java | 82 +- .../s3a/select/InternalSelectConstants.java | 77 ++ 
.../hadoop/fs/s3a/select/SelectBinding.java | 431 +++++++ .../hadoop/fs/s3a/select/SelectConstants.java | 296 +++++ .../fs/s3a/select/SelectInputStream.java | 457 +++++++ .../hadoop/fs/s3a/select/SelectTool.java | 355 ++++++ .../hadoop/fs/s3a/select/package-info.java | 27 + .../markdown/tools/hadoop-aws/s3_select.md | 1100 +++++++++++++++++ .../site/markdown/tools/hadoop-aws/testing.md | 16 + .../s3a/ITestS3AAWSCredentialsProvider.java | 4 +- .../fs/s3a/ITestS3AFailureHandling.java | 11 +- .../apache/hadoop/fs/s3a/S3ATestUtils.java | 72 +- .../fs/s3a/TestS3AAWSCredentialsProvider.java | 3 +- .../fs/s3a/commit/AbstractCommitITest.java | 2 +- .../s3guard/AbstractS3GuardToolTestBase.java | 71 +- .../s3a/s3guard/ITestS3GuardToolDynamoDB.java | 2 +- .../fs/s3a/s3guard/ITestS3GuardToolLocal.java | 16 +- .../fs/s3a/s3guard/S3GuardToolTestHelper.java | 89 ++ .../fs/s3a/select/AbstractS3SelectTest.java | 746 +++++++++++ .../apache/hadoop/fs/s3a/select/CsvFile.java | 138 +++ .../hadoop/fs/s3a/select/ITestS3Select.java | 967 +++++++++++++++ .../fs/s3a/select/ITestS3SelectCLI.java | 347 ++++++ .../fs/s3a/select/ITestS3SelectLandsat.java | 432 +++++++ .../fs/s3a/select/ITestS3SelectMRJob.java | 206 +++ .../mapreduce/StreamInputFormat.java | 14 +- 72 files changed, 8944 insertions(+), 404 deletions(-) create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSBuilder.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FutureDataInputStreamBuilder.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/AbstractFSBuilderImpl.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/FutureDataInputStreamBuilderImpl.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/FutureIOSupport.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/WrappedIOException.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/package-info.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/PassthroughCodec.java create mode 100644 hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/LambdaUtils.java create mode 100644 hadoop-common-project/hadoop-common/src/site/markdown/filesystem/fsdatainputstreambuilder.md create mode 100644 hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/InternalConstants.java create mode 100644 hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/InternalSelectConstants.java create mode 100644 hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/SelectBinding.java create mode 100644 hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/SelectConstants.java create mode 100644 hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/SelectInputStream.java create mode 100644 hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/SelectTool.java create mode 100644 hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/package-info.java create mode 100644 hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/s3_select.md create mode 100644 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardToolTestHelper.java create mode 100644 
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/AbstractS3SelectTest.java create mode 100644 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/CsvFile.java create mode 100644 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3Select.java create mode 100644 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3SelectCLI.java create mode 100644 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3SelectLandsat.java create mode 100644 hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3SelectMRJob.java diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/AbstractFileSystem.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/AbstractFileSystem.java index 200a2d10fb..dc6cd2bc2b 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/AbstractFileSystem.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/AbstractFileSystem.java @@ -25,12 +25,15 @@ import java.net.URISyntaxException; import java.util.ArrayList; import java.util.Collection; +import java.util.Collections; import java.util.EnumSet; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.NoSuchElementException; +import java.util.Set; import java.util.StringTokenizer; +import java.util.concurrent.CompletableFuture; import java.util.concurrent.ConcurrentHashMap; import org.apache.hadoop.HadoopIllegalArgumentException; @@ -41,6 +44,7 @@ import org.apache.hadoop.fs.Options.ChecksumOpt; import org.apache.hadoop.fs.Options.CreateOpts; import org.apache.hadoop.fs.Options.Rename; +import org.apache.hadoop.fs.impl.AbstractFSBuilderImpl; import org.apache.hadoop.fs.permission.AclEntry; import org.apache.hadoop.fs.permission.AclStatus; import org.apache.hadoop.fs.permission.FsAction; @@ -48,6 +52,7 @@ import org.apache.hadoop.security.AccessControlException; import org.apache.hadoop.security.SecurityUtil; import org.apache.hadoop.security.token.Token; +import org.apache.hadoop.util.LambdaUtils; import org.apache.hadoop.util.Progressable; import com.google.common.annotations.VisibleForTesting; @@ -1329,4 +1334,32 @@ public boolean equals(Object other) { } return myUri.equals(((AbstractFileSystem) other).myUri); } + + /** + * Open a file with the given set of options. + * The base implementation performs a blocking + * call to {@link #open(Path, int)}in this call; + * the actual outcome is in the returned {@code CompletableFuture}. + * This avoids having to create some thread pool, while still + * setting up the expectation that the {@code get()} call + * is needed to evaluate the result. + * @param path path to the file + * @param mandatoryKeys set of options declared as mandatory. + * @param options options set during the build sequence. + * @param bufferSize buffer size + * @return a future which will evaluate to the opened file. + * @throws IOException failure to resolve the link. 
+ * @throws IllegalArgumentException unknown mandatory key + */ + public CompletableFuture openFileWithOptions(Path path, + Set mandatoryKeys, + Configuration options, + int bufferSize) throws IOException { + AbstractFSBuilderImpl.rejectUnknownMandatoryKeys(mandatoryKeys, + Collections.emptySet(), + "for " + path); + return LambdaUtils.eval( + new CompletableFuture<>(), () -> open(path, bufferSize)); + } + } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/DelegateToFileSystem.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/DelegateToFileSystem.java index a5ab75e474..165c56c3d5 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/DelegateToFileSystem.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/DelegateToFileSystem.java @@ -24,6 +24,8 @@ import java.util.Arrays; import java.util.EnumSet; import java.util.List; +import java.util.Set; +import java.util.concurrent.CompletableFuture; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; @@ -261,4 +263,22 @@ public String getCanonicalServiceName() { public List> getDelegationTokens(String renewer) throws IOException { return Arrays.asList(fsImpl.addDelegationTokens(renewer, null)); } + + /** + * Open a file by delegating to + * {@link FileSystem#openFileWithOptions(Path, Set, Configuration, int)}. + * @param path path to the file + * @param mandatoryKeys set of options declared as mandatory. + * @param options options set during the build sequence. + * @param bufferSize buffer size + * @return a future which will evaluate to the opened file. + * @throws IOException failure to resolve the link. + * @throws IllegalArgumentException unknown mandatory key + */ + public CompletableFuture openFileWithOptions(Path path, + Set mandatoryKeys, + Configuration options, + int bufferSize) throws IOException { + return fsImpl.openFileWithOptions(path, mandatoryKeys, options, bufferSize); + } } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSBuilder.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSBuilder.java new file mode 100644 index 0000000000..b7757a62e2 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSBuilder.java @@ -0,0 +1,131 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.fs; + +import javax.annotation.Nonnull; +import java.io.IOException; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; + +/** + * The base interface which various FileSystem FileContext Builder + * interfaces can extend, and which underlying implementations + * will then implement. + * @param Return type on the {@link #build()} call. + * @param type of builder itself. + */ +@InterfaceAudience.Public +@InterfaceStability.Unstable +public interface FSBuilder> { + + /** + * Set optional Builder parameter. + */ + B opt(@Nonnull String key, @Nonnull String value); + + /** + * Set optional boolean parameter for the Builder. + * + * @see #opt(String, String) + */ + B opt(@Nonnull String key, boolean value); + + /** + * Set optional int parameter for the Builder. + * + * @see #opt(String, String) + */ + B opt(@Nonnull String key, int value); + + /** + * Set optional float parameter for the Builder. + * + * @see #opt(String, String) + */ + B opt(@Nonnull String key, float value); + + /** + * Set optional double parameter for the Builder. + * + * @see #opt(String, String) + */ + B opt(@Nonnull String key, double value); + + /** + * Set an array of string values as optional parameter for the Builder. + * + * @see #opt(String, String) + */ + B opt(@Nonnull String key, @Nonnull String... values); + + /** + * Set mandatory option to the Builder. + * + * If the option is not supported or unavailable, + * the client should expect {@link #build()} throws IllegalArgumentException. + */ + B must(@Nonnull String key, @Nonnull String value); + + /** + * Set mandatory boolean option. + * + * @see #must(String, String) + */ + B must(@Nonnull String key, boolean value); + + /** + * Set mandatory int option. + * + * @see #must(String, String) + */ + B must(@Nonnull String key, int value); + + /** + * Set mandatory float option. + * + * @see #must(String, String) + */ + B must(@Nonnull String key, float value); + + /** + * Set mandatory double option. + * + * @see #must(String, String) + */ + B must(@Nonnull String key, double value); + + /** + * Set a string array as mandatory option. + * + * @see #must(String, String) + */ + B must(@Nonnull String key, @Nonnull String... values); + + /** + * Instantiate the object which was being built. + * + * @throws IllegalArgumentException if the parameters are not valid. + * @throws UnsupportedOperationException if the filesystem does not support + * the specific operation. + * @throws IOException on filesystem IO errors. 
+ */ + S build() throws IllegalArgumentException, + UnsupportedOperationException, IOException; +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSDataOutputStreamBuilder.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSDataOutputStreamBuilder.java index d43129388b..62a3182dfb 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSDataOutputStreamBuilder.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FSDataOutputStreamBuilder.java @@ -17,22 +17,18 @@ */ package org.apache.hadoop.fs; -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Options.ChecksumOpt; +import org.apache.hadoop.fs.impl.AbstractFSBuilderImpl; import org.apache.hadoop.fs.permission.FsPermission; import org.apache.hadoop.util.Progressable; import javax.annotation.Nonnull; import java.io.IOException; -import java.util.Collections; import java.util.EnumSet; -import java.util.HashSet; -import java.util.Set; +import static com.google.common.base.Preconditions.checkNotNull; import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_FILE_BUFFER_SIZE_DEFAULT; import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_FILE_BUFFER_SIZE_KEY; @@ -87,9 +83,9 @@ @InterfaceAudience.Public @InterfaceStability.Evolving public abstract class FSDataOutputStreamBuilder - > { + > + extends AbstractFSBuilderImpl { private final FileSystem fs; - private final Path path; private FsPermission permission = null; private int bufferSize; private short replication; @@ -100,34 +96,23 @@ public abstract class FSDataOutputStreamBuilder private Progressable progress = null; private ChecksumOpt checksumOpt = null; - /** - * Contains optional and mandatory parameters. - * - * It does not load default configurations from default files. - */ - private final Configuration options = new Configuration(false); - - /** Keep track of the keys for mandatory options. */ - private final Set mandatoryKeys = new HashSet<>(); - /** * Return the concrete implementation of the builder instance. */ - protected abstract B getThisBuilder(); + public abstract B getThisBuilder(); /** * Construct from a {@link FileContext}. * * @param fc FileContext * @param p path. 
- * @throws IOException + * @throws IOException failure */ FSDataOutputStreamBuilder(@Nonnull FileContext fc, @Nonnull Path p) throws IOException { - Preconditions.checkNotNull(fc); - Preconditions.checkNotNull(p); + super(checkNotNull(p)); + checkNotNull(fc); this.fs = null; - this.path = p; AbstractFileSystem afs = fc.getFSofPath(p); FsServerDefaults defaults = afs.getServerDefaults(p); @@ -141,25 +126,20 @@ public abstract class FSDataOutputStreamBuilder */ protected FSDataOutputStreamBuilder(@Nonnull FileSystem fileSystem, @Nonnull Path p) { - Preconditions.checkNotNull(fileSystem); - Preconditions.checkNotNull(p); + super(checkNotNull(p)); + checkNotNull(fileSystem); fs = fileSystem; - path = p; bufferSize = fs.getConf().getInt(IO_FILE_BUFFER_SIZE_KEY, IO_FILE_BUFFER_SIZE_DEFAULT); - replication = fs.getDefaultReplication(path); + replication = fs.getDefaultReplication(p); blockSize = fs.getDefaultBlockSize(p); } protected FileSystem getFS() { - Preconditions.checkNotNull(fs); + checkNotNull(fs); return fs; } - protected Path getPath() { - return path; - } - protected FsPermission getPermission() { if (permission == null) { permission = FsPermission.getFileDefault(); @@ -171,7 +151,7 @@ protected FsPermission getPermission() { * Set permission for the file. */ public B permission(@Nonnull final FsPermission perm) { - Preconditions.checkNotNull(perm); + checkNotNull(perm); permission = perm; return getThisBuilder(); } @@ -235,7 +215,7 @@ protected Progressable getProgress() { * Set the facility of reporting progress. */ public B progress(@Nonnull final Progressable prog) { - Preconditions.checkNotNull(prog); + checkNotNull(prog); progress = prog; return getThisBuilder(); } @@ -282,154 +262,11 @@ protected ChecksumOpt getChecksumOpt() { * Set checksum opt. */ public B checksumOpt(@Nonnull final ChecksumOpt chksumOpt) { - Preconditions.checkNotNull(chksumOpt); + checkNotNull(chksumOpt); checksumOpt = chksumOpt; return getThisBuilder(); } - /** - * Set optional Builder parameter. - */ - public B opt(@Nonnull final String key, @Nonnull final String value) { - mandatoryKeys.remove(key); - options.set(key, value); - return getThisBuilder(); - } - - /** - * Set optional boolean parameter for the Builder. - * - * @see #opt(String, String) - */ - public B opt(@Nonnull final String key, boolean value) { - mandatoryKeys.remove(key); - options.setBoolean(key, value); - return getThisBuilder(); - } - - /** - * Set optional int parameter for the Builder. - * - * @see #opt(String, String) - */ - public B opt(@Nonnull final String key, int value) { - mandatoryKeys.remove(key); - options.setInt(key, value); - return getThisBuilder(); - } - - /** - * Set optional float parameter for the Builder. - * - * @see #opt(String, String) - */ - public B opt(@Nonnull final String key, float value) { - mandatoryKeys.remove(key); - options.setFloat(key, value); - return getThisBuilder(); - } - - /** - * Set optional double parameter for the Builder. - * - * @see #opt(String, String) - */ - public B opt(@Nonnull final String key, double value) { - mandatoryKeys.remove(key); - options.setDouble(key, value); - return getThisBuilder(); - } - - /** - * Set an array of string values as optional parameter for the Builder. - * - * @see #opt(String, String) - */ - public B opt(@Nonnull final String key, @Nonnull final String... values) { - mandatoryKeys.remove(key); - options.setStrings(key, values); - return getThisBuilder(); - } - - /** - * Set mandatory option to the Builder. 
- * - * If the option is not supported or unavailable on the {@link FileSystem}, - * the client should expect {@link #build()} throws IllegalArgumentException. - */ - public B must(@Nonnull final String key, @Nonnull final String value) { - mandatoryKeys.add(key); - options.set(key, value); - return getThisBuilder(); - } - - /** - * Set mandatory boolean option. - * - * @see #must(String, String) - */ - public B must(@Nonnull final String key, boolean value) { - mandatoryKeys.add(key); - options.setBoolean(key, value); - return getThisBuilder(); - } - - /** - * Set mandatory int option. - * - * @see #must(String, String) - */ - public B must(@Nonnull final String key, int value) { - mandatoryKeys.add(key); - options.setInt(key, value); - return getThisBuilder(); - } - - /** - * Set mandatory float option. - * - * @see #must(String, String) - */ - public B must(@Nonnull final String key, float value) { - mandatoryKeys.add(key); - options.setFloat(key, value); - return getThisBuilder(); - } - - /** - * Set mandatory double option. - * - * @see #must(String, String) - */ - public B must(@Nonnull final String key, double value) { - mandatoryKeys.add(key); - options.setDouble(key, value); - return getThisBuilder(); - } - - /** - * Set a string array as mandatory option. - * - * @see #must(String, String) - */ - public B must(@Nonnull final String key, @Nonnull final String... values) { - mandatoryKeys.add(key); - options.setStrings(key, values); - return getThisBuilder(); - } - - protected Configuration getOptions() { - return options; - } - - /** - * Get all the keys that are set as mandatory keys. - */ - @VisibleForTesting - protected Set getMandatoryKeys() { - return Collections.unmodifiableSet(mandatoryKeys); - } - /** * Create the FSDataOutputStream to write on the file system. * diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileContext.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileContext.java index e5438f553a..f65074856b 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileContext.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileContext.java @@ -35,6 +35,7 @@ import java.util.Stack; import java.util.TreeSet; import java.util.Map.Entry; +import java.util.concurrent.CompletableFuture; import javax.annotation.Nonnull; @@ -44,6 +45,7 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem.Statistics; import org.apache.hadoop.fs.Options.CreateOpts; +import org.apache.hadoop.fs.impl.FutureDataInputStreamBuilderImpl; import org.apache.hadoop.fs.permission.AclEntry; import org.apache.hadoop.fs.permission.AclStatus; import org.apache.hadoop.fs.permission.FsAction; @@ -56,7 +58,6 @@ import org.apache.hadoop.ipc.RpcClientException; import org.apache.hadoop.ipc.RpcServerException; import org.apache.hadoop.ipc.UnexpectedServerException; -import org.apache.hadoop.fs.InvalidPathException; import org.apache.hadoop.security.AccessControlException; import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.security.token.Token; @@ -714,7 +715,7 @@ private FCDataOutputStreamBuilder( } @Override - protected FCDataOutputStreamBuilder getThisBuilder() { + public FCDataOutputStreamBuilder getThisBuilder() { return this; } @@ -2869,4 +2870,68 @@ public Collection getAllStoragePolicies() Tracer getTracer() { return tracer; } + + /** + * Open a file for reading through a builder API. 
+ * Ultimately calls {@link #open(Path, int)} unless a subclass + * executes the open command differently. + * + * The semantics of this call are therefore the same as that of + * {@link #open(Path, int)} with one special point: it is in + * {@code FSDataInputStreamBuilder.build()} in which the open operation + * takes place -it is there where all preconditions to the operation + * are checked. + * @param path file path + * @return a FSDataInputStreamBuilder object to build the input stream + * @throws IOException if some early checks cause IO failures. + * @throws UnsupportedOperationException if support is checked early. + */ + @InterfaceStability.Unstable + public FutureDataInputStreamBuilder openFile(Path path) + throws IOException, UnsupportedOperationException { + + return new FSDataInputStreamBuilder(path); + } + + /** + * Builder returned for {@link #openFile(Path)}. + */ + private class FSDataInputStreamBuilder + extends FutureDataInputStreamBuilderImpl { + + /** + * Path Constructor. + * @param path path to open. + */ + protected FSDataInputStreamBuilder( + @Nonnull final Path path) throws IOException { + super(FileContext.this, path); + } + + /** + * Perform the open operation. + * + * @return a future to the input stream. + * @throws IOException early failure to open + * @throws UnsupportedOperationException if the specific operation + * is not supported. + * @throws IllegalArgumentException if the parameters are not valid. + */ + @Override + public CompletableFuture build() throws IOException { + final Path absF = fixRelativePart(getPath()); + return new FSLinkResolver>() { + @Override + public CompletableFuture next( + final AbstractFileSystem fs, + final Path p) + throws IOException { + return fs.openFileWithOptions(p, + getMandatoryKeys(), + getOptions(), + getBufferSize()); + } + }.resolve(FileContext.this, absF); + } + } } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileSystem.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileSystem.java index 5454cd0270..7e144e0f83 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileSystem.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileSystem.java @@ -17,6 +17,7 @@ */ package org.apache.hadoop.fs; +import javax.annotation.Nonnull; import java.io.Closeable; import java.io.FileNotFoundException; import java.io.IOException; @@ -27,6 +28,7 @@ import java.security.PrivilegedExceptionAction; import java.util.ArrayList; import java.util.Collection; +import java.util.Collections; import java.util.EnumSet; import java.util.HashMap; import java.util.HashSet; @@ -35,11 +37,13 @@ import java.util.List; import java.util.Map; import java.util.NoSuchElementException; +import java.util.Optional; import java.util.ServiceConfigurationError; import java.util.ServiceLoader; import java.util.Set; import java.util.Stack; import java.util.TreeSet; +import java.util.concurrent.CompletableFuture; import java.util.concurrent.atomic.AtomicLong; import org.apache.commons.logging.Log; @@ -52,6 +56,8 @@ import org.apache.hadoop.fs.Options.ChecksumOpt; import org.apache.hadoop.fs.Options.HandleOpt; import org.apache.hadoop.fs.Options.Rename; +import org.apache.hadoop.fs.impl.AbstractFSBuilderImpl; +import org.apache.hadoop.fs.impl.FutureDataInputStreamBuilderImpl; import org.apache.hadoop.fs.permission.AclEntry; import org.apache.hadoop.fs.permission.AclStatus; import org.apache.hadoop.fs.permission.FsAction; @@ -67,6 +73,7 
@@ import org.apache.hadoop.security.token.DelegationTokenIssuer; import org.apache.hadoop.util.ClassUtil; import org.apache.hadoop.util.DataChecksum; +import org.apache.hadoop.util.LambdaUtils; import org.apache.hadoop.util.Progressable; import org.apache.hadoop.util.ReflectionUtils; import org.apache.hadoop.util.ShutdownHookManager; @@ -117,6 +124,11 @@ *
  • The term "file" refers to a file in the remote filesystem, * rather than instances of {@code java.io.File}.
  • * + * + * This is a carefully evolving class. + * New methods may be marked as Unstable or Evolving for their initial release, + * as a warning that they are new and may change based on the + * experience of use in applications. *****************************************************************/ @SuppressWarnings("DeprecatedIsStillUsed") @InterfaceAudience.Public @@ -4241,6 +4253,8 @@ protected FileSystemDataOutputStreamBuilder(FileSystem fileSystem, Path p) { @Override public FSDataOutputStream build() throws IOException { + rejectUnknownMandatoryKeys(Collections.emptySet(), + " for " + getPath()); if (getFlags().contains(CreateFlag.CREATE) || getFlags().contains(CreateFlag.OVERWRITE)) { if (isRecursive()) { @@ -4255,11 +4269,12 @@ public FSDataOutputStream build() throws IOException { } else if (getFlags().contains(CreateFlag.APPEND)) { return getFS().append(getPath(), getBufferSize(), getProgress()); } - throw new IOException("Must specify either create, overwrite or append"); + throw new PathIOException(getPath().toString(), + "Must specify either create, overwrite or append"); } @Override - protected FileSystemDataOutputStreamBuilder getThisBuilder() { + public FileSystemDataOutputStreamBuilder getThisBuilder() { return this; } } @@ -4287,4 +4302,173 @@ public FSDataOutputStreamBuilder createFile(Path path) { public FSDataOutputStreamBuilder appendFile(Path path) { return new FileSystemDataOutputStreamBuilder(this, path).append(); } + + /** + * Open a file for reading through a builder API. + * Ultimately calls {@link #open(Path, int)} unless a subclass + * executes the open command differently. + * + * The semantics of this call are therefore the same as that of + * {@link #open(Path, int)} with one special point: it is in + * {@code FSDataInputStreamBuilder.build()} in which the open operation + * takes place -it is there where all preconditions to the operation + * are checked. + * @param path file path + * @return a FSDataInputStreamBuilder object to build the input stream + * @throws IOException if some early checks cause IO failures. + * @throws UnsupportedOperationException if support is checked early. + */ + @InterfaceStability.Unstable + public FutureDataInputStreamBuilder openFile(Path path) + throws IOException, UnsupportedOperationException { + return new FSDataInputStreamBuilder(this, path).getThisBuilder(); + } + + /** + * Open a file for reading through a builder API. + * Ultimately calls {@link #open(PathHandle, int)} unless a subclass + * executes the open command differently. + * + * If PathHandles are unsupported, this may fail in the + * {@code FSDataInputStreamBuilder.build()} command, + * rather than in this {@code openFile()} operation. + * @param pathHandle path handle. + * @return a FSDataInputStreamBuilder object to build the input stream + * @throws IOException if some early checks cause IO failures. + * @throws UnsupportedOperationException if support is checked early. + */ + @InterfaceStability.Unstable + public FutureDataInputStreamBuilder openFile(PathHandle pathHandle) + throws IOException, UnsupportedOperationException { + return new FSDataInputStreamBuilder(this, pathHandle) + .getThisBuilder(); + } + + /** + * Execute the actual open file operation. + * + * This is invoked from {@code FSDataInputStreamBuilder.build()} + * and from {@link DelegateToFileSystem} and is where + * the action of opening the file should begin. 
+ * + * The base implementation performs a blocking + * call to {@link #open(Path, int)}in this call; + * the actual outcome is in the returned {@code CompletableFuture}. + * This avoids having to create some thread pool, while still + * setting up the expectation that the {@code get()} call + * is needed to evaluate the result. + * @param path path to the file + * @param mandatoryKeys set of options declared as mandatory. + * @param options options set during the build sequence. + * @param bufferSize buffer size + * @return a future which will evaluate to the opened file. + * @throws IOException failure to resolve the link. + * @throws IllegalArgumentException unknown mandatory key + */ + protected CompletableFuture openFileWithOptions( + final Path path, + final Set mandatoryKeys, + final Configuration options, + final int bufferSize) throws IOException { + AbstractFSBuilderImpl.rejectUnknownMandatoryKeys(mandatoryKeys, + Collections.emptySet(), + "for " + path); + return LambdaUtils.eval( + new CompletableFuture<>(), () -> open(path, bufferSize)); + } + + /** + * Execute the actual open file operation. + * The base implementation performs a blocking + * call to {@link #open(Path, int)}in this call; + * the actual outcome is in the returned {@code CompletableFuture}. + * This avoids having to create some thread pool, while still + * setting up the expectation that the {@code get()} call + * is needed to evaluate the result. + * @param pathHandle path to the file + * @param mandatoryKeys set of options declared as mandatory. + * @param options options set during the build sequence. + * @param bufferSize buffer size + * @return a future which will evaluate to the opened file. + * @throws IOException failure to resolve the link. + * @throws IllegalArgumentException unknown mandatory key + * @throws UnsupportedOperationException PathHandles are not supported. + * This may be deferred until the future is evaluated. + */ + protected CompletableFuture openFileWithOptions( + final PathHandle pathHandle, + final Set mandatoryKeys, + final Configuration options, + final int bufferSize) throws IOException { + AbstractFSBuilderImpl.rejectUnknownMandatoryKeys(mandatoryKeys, + Collections.emptySet(), ""); + CompletableFuture result = new CompletableFuture<>(); + try { + result.complete(open(pathHandle, bufferSize)); + } catch (UnsupportedOperationException tx) { + // fail fast here + throw tx; + } catch (Throwable tx) { + // fail lazily here to ensure callers expect all File IO operations to + // surface later + result.completeExceptionally(tx); + } + return result; + } + + /** + * Builder returned for {@code #openFile(Path)} + * and {@code #openFile(PathHandle)}. + */ + private static class FSDataInputStreamBuilder + extends FutureDataInputStreamBuilderImpl + implements FutureDataInputStreamBuilder { + + /** + * Path Constructor. + * @param fileSystem owner + * @param path path to open. + */ + protected FSDataInputStreamBuilder( + @Nonnull final FileSystem fileSystem, + @Nonnull final Path path) { + super(fileSystem, path); + } + + /** + * Construct from a path handle. + * @param fileSystem owner + * @param pathHandle path handle of file to open. + */ + protected FSDataInputStreamBuilder( + @Nonnull final FileSystem fileSystem, + @Nonnull final PathHandle pathHandle) { + super(fileSystem, pathHandle); + } + + /** + * Perform the open operation. 
+ * Returns a future which, when get() or a chained completion + * operation is invoked, will supply the input stream of the file + * referenced by the path/path handle. + * @return a future to the input stream. + * @throws IOException early failure to open + * @throws UnsupportedOperationException if the specific operation + * is not supported. + * @throws IllegalArgumentException if the parameters are not valid. + */ + @Override + public CompletableFuture build() throws IOException { + Optional optionalPath = getOptionalPath(); + if(optionalPath.isPresent()) { + return getFS().openFileWithOptions(optionalPath.get(), + getMandatoryKeys(), getOptions(), getBufferSize()); + } else { + return getFS().openFileWithOptions(getPathHandle(), + getMandatoryKeys(), getOptions(), getBufferSize()); + } + } + + } + } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FilterFileSystem.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FilterFileSystem.java index a3e10b4fed..99c18b6646 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FilterFileSystem.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FilterFileSystem.java @@ -25,6 +25,8 @@ import java.util.EnumSet; import java.util.List; import java.util.Map; +import java.util.Set; +import java.util.concurrent.CompletableFuture; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; @@ -692,4 +694,35 @@ public FSDataOutputStreamBuilder createFile(Path path) { public FSDataOutputStreamBuilder appendFile(Path path) { return fs.appendFile(path); } + + @Override + public FutureDataInputStreamBuilder openFile(final Path path) + throws IOException, UnsupportedOperationException { + return fs.openFile(path); + } + + @Override + public FutureDataInputStreamBuilder openFile(final PathHandle pathHandle) + throws IOException, UnsupportedOperationException { + return fs.openFile(pathHandle); + } + + @Override + protected CompletableFuture openFileWithOptions( + final Path path, + final Set mandatoryKeys, + final Configuration options, + final int bufferSize) throws IOException { + return fs.openFileWithOptions(path, mandatoryKeys, options, bufferSize); + } + + @Override + protected CompletableFuture openFileWithOptions( + final PathHandle pathHandle, + final Set mandatoryKeys, + final Configuration options, + final int bufferSize) throws IOException { + return fs.openFileWithOptions(pathHandle, mandatoryKeys, options, + bufferSize); + } } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FilterFs.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FilterFs.java index d6dc59bad3..f5430d6026 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FilterFs.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FilterFs.java @@ -26,9 +26,12 @@ import java.util.EnumSet; import java.util.List; import java.util.Map; +import java.util.Set; +import java.util.concurrent.CompletableFuture; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem.Statistics; import org.apache.hadoop.fs.permission.AclEntry; import org.apache.hadoop.fs.permission.AclStatus; @@ -433,4 +436,14 @@ public Collection getAllStoragePolicies() throws 
IOException { return myFs.getAllStoragePolicies(); } + + @Override + public CompletableFuture openFileWithOptions( + final Path path, + final Set mandatoryKeys, + final Configuration options, + final int bufferSize) throws IOException { + return myFs.openFileWithOptions(path, mandatoryKeys, options, bufferSize); + } + } diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FutureDataInputStreamBuilder.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FutureDataInputStreamBuilder.java new file mode 100644 index 0000000000..774d30927d --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FutureDataInputStreamBuilder.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.fs; + +import java.io.IOException; +import java.util.concurrent.CompletableFuture; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; + +/** + * Builder for input streams and subclasses whose return value is + * actually a completable future: this allows for better asynchronous + * operation. + * + * To be more generic, {@link #opt(String, int)} and {@link #must(String, int)} + * variants provide implementation-agnostic way to customize the builder. + * Each FS-specific builder implementation can interpret the FS-specific + * options accordingly, for example: + * + * If the option is not related to the file system, the option will be ignored. + * If the option is must, but not supported by the file system, a + * {@link IllegalArgumentException} will be thrown. + * + */ +@InterfaceAudience.Public +@InterfaceStability.Unstable +public interface FutureDataInputStreamBuilder + extends FSBuilder, FutureDataInputStreamBuilder> { + + @Override + CompletableFuture build() + throws IllegalArgumentException, UnsupportedOperationException, + IOException; +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/AbstractFSBuilderImpl.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/AbstractFSBuilderImpl.java new file mode 100644 index 0000000000..5fc92e97be --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/AbstractFSBuilderImpl.java @@ -0,0 +1,356 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.impl; + +import javax.annotation.Nonnull; +import java.util.Collection; +import java.util.Collections; +import java.util.HashSet; +import java.util.NoSuchElementException; +import java.util.Optional; +import java.util.Set; + +import com.google.common.annotations.VisibleForTesting; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSBuilder; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.PathHandle; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkNotNull; + +/** + * Builder for filesystem/filecontext operations of various kinds, + * with option support. + * + * + * .opt("foofs:option.a", true) + * .opt("foofs:option.b", "value") + * .opt("barfs:cache", true) + * .must("foofs:cache", true) + * .must("barfs:cache-size", 256 * 1024 * 1024) + * .build(); + * + * + * Configuration keys declared in an {@code opt()} may be ignored by + * a builder which does not recognise them. + * + * Configuration keys declared in a {@code must()} function set must + * be understood by the implementation or a + * {@link IllegalArgumentException} will be thrown. + * + * @param Return type on the {@link #build()} call. + * @param type of builder itself. + */ +@InterfaceAudience.Public +@InterfaceStability.Unstable +public abstract class + AbstractFSBuilderImpl> + implements FSBuilder { + + public static final String UNKNOWN_MANDATORY_KEY = "Unknown mandatory key"; + + @VisibleForTesting + static final String E_BOTH_A_PATH_AND_A_PATH_HANDLE + = "Both a path and a pathHandle has been provided to the constructor"; + + private final Optional optionalPath; + + private final Optional optionalPathHandle; + + /** + * Contains optional and mandatory parameters. + * + * It does not load default configurations from default files. + */ + private final Configuration options = new Configuration(false); + + /** Keep track of the keys for mandatory options. */ + private final Set mandatoryKeys = new HashSet<>(); + + /** + * Constructor with both optional path and path handle. + * Either or both argument may be empty, but it is an error for + * both to be defined. + * @param optionalPath a path or empty + * @param optionalPathHandle a path handle/empty + * @throws IllegalArgumentException if both parameters are set. 
+ */ + protected AbstractFSBuilderImpl( + @Nonnull Optional optionalPath, + @Nonnull Optional optionalPathHandle) { + checkArgument(!(checkNotNull(optionalPath).isPresent() + && checkNotNull(optionalPathHandle).isPresent()), + E_BOTH_A_PATH_AND_A_PATH_HANDLE); + this.optionalPath = optionalPath; + this.optionalPathHandle = optionalPathHandle; + } + + protected AbstractFSBuilderImpl(@Nonnull final Path path) { + this(Optional.of(path), Optional.empty()); + } + + protected AbstractFSBuilderImpl(@Nonnull final PathHandle pathHandle) { + this(Optional.empty(), Optional.of(pathHandle)); + } + + + /** + * Get the cast builder. + * @return this object, typecast + */ + public B getThisBuilder() { + return (B)this; + } + + /** + * Get the optional path; may be empty. + * @return the optional path field. + */ + public Optional getOptionalPath() { + return optionalPath; + } + + /** + * Get the path: only valid if constructed with a path. + * @return the path + * @throws NoSuchElementException if the field is empty. + */ + public Path getPath() { + return optionalPath.get(); + } + + /** + * Get the optional path handle; may be empty. + * @return the optional path handle field. + */ + public Optional getOptionalPathHandle() { + return optionalPathHandle; + } + + /** + * Get the PathHandle: only valid if constructed with a PathHandle. + * @return the PathHandle + * @throws NoSuchElementException if the field is empty. + */ + public PathHandle getPathHandle() { + return optionalPathHandle.get(); + } + + /** + * Set optional Builder parameter. + */ + @Override + public B opt(@Nonnull final String key, @Nonnull final String value) { + mandatoryKeys.remove(key); + options.set(key, value); + return getThisBuilder(); + } + + /** + * Set optional boolean parameter for the Builder. + * + * @see #opt(String, String) + */ + @Override + public B opt(@Nonnull final String key, boolean value) { + mandatoryKeys.remove(key); + options.setBoolean(key, value); + return getThisBuilder(); + } + + /** + * Set optional int parameter for the Builder. + * + * @see #opt(String, String) + */ + @Override + public B opt(@Nonnull final String key, int value) { + mandatoryKeys.remove(key); + options.setInt(key, value); + return getThisBuilder(); + } + + /** + * Set optional float parameter for the Builder. + * + * @see #opt(String, String) + */ + @Override + public B opt(@Nonnull final String key, float value) { + mandatoryKeys.remove(key); + options.setFloat(key, value); + return getThisBuilder(); + } + + /** + * Set optional double parameter for the Builder. + * + * @see #opt(String, String) + */ + @Override + public B opt(@Nonnull final String key, double value) { + mandatoryKeys.remove(key); + options.setDouble(key, value); + return getThisBuilder(); + } + + /** + * Set an array of string values as optional parameter for the Builder. + * + * @see #opt(String, String) + */ + @Override + public B opt(@Nonnull final String key, @Nonnull final String... values) { + mandatoryKeys.remove(key); + options.setStrings(key, values); + return getThisBuilder(); + } + + /** + * Set mandatory option to the Builder. + * + * If the option is not supported or unavailable on the {@link FileSystem}, + * the client should expect {@link #build()} throws IllegalArgumentException. + */ + @Override + public B must(@Nonnull final String key, @Nonnull final String value) { + mandatoryKeys.add(key); + options.set(key, value); + return getThisBuilder(); + } + + /** + * Set mandatory boolean option. 
+ * + * @see #must(String, String) + */ + @Override + public B must(@Nonnull final String key, boolean value) { + mandatoryKeys.add(key); + options.setBoolean(key, value); + return getThisBuilder(); + } + + /** + * Set mandatory int option. + * + * @see #must(String, String) + */ + @Override + public B must(@Nonnull final String key, int value) { + mandatoryKeys.add(key); + options.setInt(key, value); + return getThisBuilder(); + } + + /** + * Set mandatory float option. + * + * @see #must(String, String) + */ + @Override + public B must(@Nonnull final String key, float value) { + mandatoryKeys.add(key); + options.setFloat(key, value); + return getThisBuilder(); + } + + /** + * Set mandatory double option. + * + * @see #must(String, String) + */ + @Override + public B must(@Nonnull final String key, double value) { + mandatoryKeys.add(key); + options.setDouble(key, value); + return getThisBuilder(); + } + + /** + * Set a string array as mandatory option. + * + * @see #must(String, String) + */ + @Override + public B must(@Nonnull final String key, @Nonnull final String... values) { + mandatoryKeys.add(key); + options.setStrings(key, values); + return getThisBuilder(); + } + + /** + * Get the mutable option configuration. + * @return the option configuration. + */ + public Configuration getOptions() { + return options; + } + + /** + * Get all the keys that are set as mandatory keys. + */ + public Set getMandatoryKeys() { + return Collections.unmodifiableSet(mandatoryKeys); + } + + /** + * Reject a configuration if one or more mandatory keys are + * not in the set of mandatory keys. + * The first invalid key raises the exception; the order of the + * scan and hence the specific key raising the exception is undefined. + * @param knownKeys a possibly empty collection of known keys + * @param extraErrorText extra error text to include. + * @throws IllegalArgumentException if any key is unknown. + */ + protected void rejectUnknownMandatoryKeys(final Collection knownKeys, + String extraErrorText) + throws IllegalArgumentException { + rejectUnknownMandatoryKeys(mandatoryKeys, knownKeys, extraErrorText); + } + + /** + * Reject a configuration if one or more mandatory keys are + * not in the set of mandatory keys. + * The first invalid key raises the exception; the order of the + * scan and hence the specific key raising the exception is undefined. + * @param mandatory the set of mandatory keys + * @param knownKeys a possibly empty collection of known keys + * @param extraErrorText extra error text to include. + * @throws IllegalArgumentException if any key is unknown. + */ + public static void rejectUnknownMandatoryKeys( + final Set mandatory, + final Collection knownKeys, + final String extraErrorText) + throws IllegalArgumentException { + final String eText = extraErrorText.isEmpty() + ? "" + : (extraErrorText + " "); + mandatory.forEach((key) -> + checkArgument(knownKeys.contains(key), + UNKNOWN_MANDATORY_KEY + " %s\"%s\"", eText, key)); + } + +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/FutureDataInputStreamBuilderImpl.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/FutureDataInputStreamBuilderImpl.java new file mode 100644 index 0000000000..2aa4a5d95f --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/FutureDataInputStreamBuilderImpl.java @@ -0,0 +1,141 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.impl; + +import javax.annotation.Nonnull; +import java.io.IOException; +import java.util.concurrent.CompletableFuture; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileContext; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.FutureDataInputStreamBuilder; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.PathHandle; + +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_FILE_BUFFER_SIZE_DEFAULT; +import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_FILE_BUFFER_SIZE_KEY; + +/** + * Builder for input streams and subclasses whose return value is + * actually a completable future: this allows for better asynchronous + * operation. + * + * To be more generic, {@link #opt(String, int)} and {@link #must(String, int)} + * variants provide implementation-agnostic way to customize the builder. + * Each FS-specific builder implementation can interpret the FS-specific + * options accordingly, for example: + * + * If the option is not related to the file system, the option will be ignored. + * If the option is must, but not supported by the file system, a + * {@link IllegalArgumentException} will be thrown. + * + */ +@InterfaceAudience.Public +@InterfaceStability.Unstable +public abstract class FutureDataInputStreamBuilderImpl + extends AbstractFSBuilderImpl, FutureDataInputStreamBuilder> + implements FutureDataInputStreamBuilder { + + private final FileSystem fileSystem; + + private int bufferSize; + + /** + * Construct from a {@link FileContext}. + * + * @param fc FileContext + * @param path path. + * @throws IOException failure + */ + protected FutureDataInputStreamBuilderImpl(@Nonnull FileContext fc, + @Nonnull Path path) throws IOException { + super(checkNotNull(path)); + checkNotNull(fc); + this.fileSystem = null; + bufferSize = IO_FILE_BUFFER_SIZE_DEFAULT; + } + + /** + * Constructor. + * @param fileSystem owner FS. + * @param path path + */ + protected FutureDataInputStreamBuilderImpl(@Nonnull FileSystem fileSystem, + @Nonnull Path path) { + super(checkNotNull(path)); + this.fileSystem = checkNotNull(fileSystem); + initFromFS(); + } + + /** + * Constructor with PathHandle. + * @param fileSystem owner FS. + * @param pathHandle path handle + */ + public FutureDataInputStreamBuilderImpl(@Nonnull FileSystem fileSystem, + @Nonnull PathHandle pathHandle) { + super(pathHandle); + this.fileSystem = fileSystem; + initFromFS(); + } + + /** + * Initialize from a filesystem. 
+ */ + private void initFromFS() { + bufferSize = fileSystem.getConf().getInt(IO_FILE_BUFFER_SIZE_KEY, + IO_FILE_BUFFER_SIZE_DEFAULT); + } + + protected FileSystem getFS() { + checkNotNull(fileSystem); + return fileSystem; + } + + protected int getBufferSize() { + return bufferSize; + } + + /** + * Set the size of the buffer to be used. + */ + public FutureDataInputStreamBuilder bufferSize(int bufSize) { + bufferSize = bufSize; + return getThisBuilder(); + } + + /** + * Get the builder. + * This must be used after the constructor has been invoked to create + * the actual builder: it allows for subclasses to do things after + * construction. + */ + public FutureDataInputStreamBuilder builder() { + return getThisBuilder(); + } + + @Override + public FutureDataInputStreamBuilder getThisBuilder() { + return this; + } +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/FutureIOSupport.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/FutureIOSupport.java new file mode 100644 index 0000000000..9d5f2bf4b6 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/FutureIOSupport.java @@ -0,0 +1,191 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.impl; + +import java.io.IOException; +import java.io.InterruptedIOException; +import java.util.Map; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSBuilder; + +/** + * Support for future IO and the FS Builder subclasses. + */ +@InterfaceAudience.Private +@InterfaceStability.Unstable +public final class FutureIOSupport { + + private FutureIOSupport() { + } + + /** + * Given a future, evaluate it. Raised exceptions are + * extracted and handled. + * @param future future to evaluate + * @param type of the result. + * @return the result, if all went well. + * @throws InterruptedIOException future was interrupted + * @throws IOException if something went wrong + * @throws RuntimeException any nested RTE thrown + */ + public static T awaitFuture(final Future future) + throws InterruptedIOException, IOException, RuntimeException { + try { + return future.get(); + } catch (InterruptedException e) { + throw (InterruptedIOException)new InterruptedIOException(e.toString()) + .initCause(e); + } catch (ExecutionException e) { + return raiseInnerCause(e); + } + } + + + /** + * Given a future, evaluate it. Raised exceptions are + * extracted and handled. 
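A minimal usage sketch of how the new openFile() builder combines with awaitFuture() (illustration only, assuming the usual org.apache.hadoop.fs imports and a FileSystem fs and Path path in scope; "fs.s3a.experimental.input.fadvise" is an existing S3A seek-policy key, used here purely as an example of an opt() setting):

    // Sketch, not part of the patch: open asynchronously, hint the seek
    // policy, then block for the stream only when it is actually needed.
    CompletableFuture<FSDataInputStream> future = fs.openFile(path)
        .opt("fs.s3a.experimental.input.fadvise", "random")  // may be ignored
        .build();                         // mandatory keys are validated here
    // other work can proceed while the open is outstanding
    try (FSDataInputStream in = FutureIOSupport.awaitFuture(future)) {
      in.seek(1024);                      // use like any FSDataInputStream
    }

Because build() is where the preconditions are checked, an unrecognised must() key surfaces as an IllegalArgumentException before any future is returned.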
+ * @param future future to evaluate + * @param type of the result. + * @return the result, if all went well. + * @throws InterruptedIOException future was interrupted + * @throws IOException if something went wrong + * @throws RuntimeException any nested RTE thrown + * @throws TimeoutException the future timed out. + */ + public static T awaitFuture(final Future future, + final long timeout, + final TimeUnit unit) + throws InterruptedIOException, IOException, RuntimeException, + TimeoutException { + + try { + return future.get(timeout, unit); + } catch (InterruptedException e) { + throw (InterruptedIOException)new InterruptedIOException(e.toString()) + .initCause(e); + } catch (ExecutionException e) { + return raiseInnerCause(e); + } + } + + + /** + * From the inner cause of an execution exception, extract the inner cause + * if it is an IOE or RTE. + * This will always raise an exception, either the inner IOException, + * an inner RuntimeException, or a new IOException wrapping the raised + * exception. + * + * @param e exception. + * @param type of return value. + * @return nothing, ever. + * @throws IOException either the inner IOException, or a wrapper around + * any non-Runtime-Exception + * @throws RuntimeException if that is the inner cause. + */ + public static T raiseInnerCause(final ExecutionException e) + throws IOException { + Throwable cause = e.getCause(); + if (cause instanceof IOException) { + throw (IOException) cause; + } else if (cause instanceof WrappedIOException){ + throw ((WrappedIOException) cause).getCause(); + } else if (cause instanceof RuntimeException){ + throw (RuntimeException) cause; + } else if (cause != null) { + // other type: wrap with a new IOE + throw new IOException(cause); + } else { + // this only happens if somebody deliberately raises + // an ExecutionException + throw new IOException(e); + } + } + + /** + * Propagate options to any builder, converting everything with the + * prefix to an option where, if there were 2+ dot-separated elements, + * it is converted to a schema. + *
    +   *   fs.example.s3a.option => s3a:option
    +   *   fs.example.fs.io.policy => s3a.io.policy
    +   *   fs.example.something => something
    +   * 
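   * <p>
   * This is how the MapReduce input format code passes the
   * "mapreduce.job.input.file.option." settings of a job configuration
   * down to the input stream builders when input files are opened.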
    + * @param builder builder to modify + * @param conf configuration to read + * @param optionalPrefix prefix for optional settings + * @param mandatoryPrefix prefix for mandatory settings + * @param type of result + * @param type of builder + * @return the builder passed in. + */ + public static > + FSBuilder propagateOptions( + final FSBuilder builder, + final Configuration conf, + final String optionalPrefix, + final String mandatoryPrefix) { + propagateOptions(builder, conf, + optionalPrefix, false); + propagateOptions(builder, conf, + mandatoryPrefix, true); + return builder; + } + + /** + * Propagate options to any builder, converting everything with the + * prefix to an option where, if there were 2+ dot-separated elements, + * it is converted to a schema. + *
    +   *   fs.example.s3a.option => s3a:option
    +   *   fs.example.fs.io.policy => s3a.io.policy
    +   *   fs.example.something => something
    +   * 
    + * @param builder builder to modify + * @param conf configuration to read + * @param prefix prefix to scan/strip + * @param mandatory are the options to be mandatory or optional? + */ + public static void propagateOptions( + final FSBuilder builder, + final Configuration conf, + final String prefix, + final boolean mandatory) { + + final String p = prefix.endsWith(".") ? prefix : (prefix + "."); + final Map propsWithPrefix = conf.getPropsWithPrefix(p); + for (Map.Entry entry : propsWithPrefix.entrySet()) { + // change the schema off each entry + String key = entry.getKey(); + String val = entry.getValue(); + if (mandatory) { + builder.must(key, val); + } else { + builder.opt(key, val); + } + } + } +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/WrappedIOException.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/WrappedIOException.java new file mode 100644 index 0000000000..1de1ecb785 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/WrappedIOException.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.impl; + +import java.io.IOException; +import java.util.concurrent.ExecutionException; + +import com.google.common.base.Preconditions; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; + +/** + * A wrapper for an IOException which + * {@link FutureIOSupport#raiseInnerCause(ExecutionException)} knows to + * always extract the exception. + * + * The constructor signature guarantees the cause will be an IOException, + * and as it checks for a null-argument, non-null. + */ +@InterfaceAudience.Private +@InterfaceStability.Unstable +public class WrappedIOException extends RuntimeException { + + private static final long serialVersionUID = 2510210974235779294L; + + /** + * Construct from a non-null IOException. + * @param cause inner cause + * @throws NullPointerException if the cause is null. + */ + public WrappedIOException(final IOException cause) { + super(Preconditions.checkNotNull(cause)); + } + + @Override + public synchronized IOException getCause() { + return (IOException) super.getCause(); + } +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/package-info.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/package-info.java new file mode 100644 index 0000000000..f1cd76c8f8 --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/impl/package-info.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * This package contains implementation classes for use inside + * filesystems. + * + * These classes MUST NOT be directly exposed as the arguments + * or return values of methods, or as part of a visible + * inheritance tree. + * + * These classes MAY be returned behind interfaces. + * When such interfaces are used as parameters, the methods + * which accept the interfaces MUST NOT cast them to the classes + * contained therein: they MUST interact purely through + * the interface. + * + * That is: don't expose the implementation classes in here, + * and don't expect input interface implementations to always + * be the classes in here. + * + * These classes are for the private use of FileSystem/ + * FileContext implementations. + * Implementation classes not developed within the ASF Hadoop + * codebase MAY use these, with the caveat that these classes + * are highly unstable. + */ + +@InterfaceAudience.LimitedPrivate("Filesystems") +@InterfaceStability.Unstable +package org.apache.hadoop.fs.impl; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/PassthroughCodec.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/PassthroughCodec.java new file mode 100644 index 0000000000..a3f0bffeeb --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/compress/PassthroughCodec.java @@ -0,0 +1,246 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.io.compress; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.conf.Configurable; +import org.apache.hadoop.conf.Configuration; + +/** + * This is a special codec which does not transform the output. 
+ * It can be declared as a codec in the option "io.compression.codecs",
+ * and then it will declare that it supports the file extension
+ * set in {@link #OPT_EXTENSION}.
+ *
+ * This allows decompression to be disabled on a job, even when there is
+ * a registered/discoverable decompression codec for a file extension,
+ * without having to change the standard codec binding mechanism.
+ *
+ * For example, to disable decompression for gzipped files, set the
+ * options
+ * <pre>
    + *   io.compression.codecs = org.apache.hadoop.io.compress.PassthroughCodec
    + *   io.compress.passthrough.extension = .gz
+ * </pre>
    + * + * Note: this is not a Splittable codec: it doesn't know the + * capabilities of the passed in stream. It should be possible to + * extend this in a subclass: the inner classes are marked as protected + * to enable this. Do not retrofit splitting to this class.. + * + */ +@InterfaceAudience.Public +@InterfaceStability.Unstable +public class PassthroughCodec + implements Configurable, CompressionCodec { + + private static final Logger LOG = + LoggerFactory.getLogger(PassthroughCodec.class); + + /** + * Classname of the codec: {@value}. + */ + public static final String CLASSNAME = + "org.apache.hadoop.io.compress.PassthroughCodec"; + + /** + * Option to control the extension of the code: {@value}. + */ + public static final String OPT_EXTENSION = + "io.compress.passthrough.extension"; + + /** + * This default extension is here so that if no extension has been defined, + * some value is still returned: {@value}.. + */ + public static final String DEFAULT_EXTENSION = ".passthrough"; + + private Configuration conf; + + private String extension = DEFAULT_EXTENSION; + + public PassthroughCodec() { + } + + @Override + public Configuration getConf() { + return conf; + } + + @Override + public void setConf(final Configuration conf) { + this.conf = conf; + // update the default extension value at this point, adding + // a dot prefix if needed. + String ex = conf.getTrimmed(OPT_EXTENSION, DEFAULT_EXTENSION); + extension = ex.startsWith(".") ? ex : ("." + ex); + } + + @Override + public String getDefaultExtension() { + LOG.info("Registering fake codec for extension {}", extension); + return extension; + } + + @Override + public CompressionOutputStream createOutputStream(final OutputStream out) + throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public CompressionOutputStream createOutputStream(final OutputStream out, + final Compressor compressor) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public Class getCompressorType() { + throw new UnsupportedOperationException(); + } + + @Override + public Compressor createCompressor() { + throw new UnsupportedOperationException(); + } + + @Override + public CompressionInputStream createInputStream(final InputStream in) + throws IOException { + return createInputStream(in, null); + } + + @Override + public CompressionInputStream createInputStream(final InputStream in, + final Decompressor decompressor) throws IOException { + return new PassthroughDecompressorStream(in); + } + + @Override + public Class getDecompressorType() { + return StubDecompressor.class; + } + + @Override + public Decompressor createDecompressor() { + return new StubDecompressor(); + } + + /** + * The decompressor. 
+ */ + protected static final class PassthroughDecompressorStream + extends DecompressorStream { + + private final InputStream input; + + PassthroughDecompressorStream(final InputStream input) + throws IOException { + super(input); + this.input = input; + } + + @Override + public int read(final byte[] b) throws IOException { + return input.read(b); + } + + @Override + public int read() throws IOException { + return input.read(); + } + + @Override + public int read(final byte[] b, final int off, final int len) + throws IOException { + return input.read(b, off, len); + } + + @Override + public long skip(final long n) throws IOException { + return input.skip(n); + } + + @Override + public int available() throws IOException { + return input.available(); + } + } + + /** + * The decompressor is a no-op. It is not needed other than + * to complete the methods offered by the interface. + */ + protected static final class StubDecompressor implements Decompressor { + + @Override + public void setInput(final byte[] b, final int off, final int len) { + + } + + @Override + public boolean needsInput() { + return false; + } + + @Override + public void setDictionary(final byte[] b, final int off, final int len) { + + } + + @Override + public boolean needsDictionary() { + return false; + } + + @Override + public boolean finished() { + return false; + } + + @Override + public int decompress(final byte[] b, final int off, final int len) + throws IOException { + return 0; + } + + @Override + public int getRemaining() { + return 0; + } + + @Override + public void reset() { + + } + + @Override + public void end() { + + } + } +} diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/LambdaUtils.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/LambdaUtils.java new file mode 100644 index 0000000000..14c6db608a --- /dev/null +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/LambdaUtils.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.util; + +import java.util.concurrent.Callable; +import java.util.concurrent.CompletableFuture; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; + +/** + * Lambda-expression utilities be they generic or specific to + * Hadoop datatypes. + */ +@InterfaceAudience.Private +@InterfaceStability.Unstable +public final class LambdaUtils { + + private LambdaUtils() { + } + + /** + * Utility method to evaluate a callable and fill in the future + * with the result or the exception raised. + * Once this method returns, the future will have been evaluated to + * either a return value or an exception. 
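   * <p>
   * A typical use, sketched: a filesystem implementation submits
   * {@code () -> LambdaUtils.eval(result, () -> open(path, bufferSize))}
   * to an executor so that an open operation becomes asynchronous.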
+   * @param <T> type of future
+   * @param result future for the result.
+   * @param call callable to invoke.
+   * @return the future passed in
+   */
+  public static <T> CompletableFuture<T> eval(
+      final CompletableFuture<T> result,
+      final Callable<T> call) {
+    try {
+      result.complete(call.call());
+    } catch (Throwable tx) {
+      result.completeExceptionally(tx);
+    }
+    return result;
+  }
+
+}
diff --git a/hadoop-common-project/hadoop-common/src/main/resources/core-default.xml b/hadoop-common-project/hadoop-common/src/main/resources/core-default.xml
index 73f2d1020d..d05e1bb4f7 100644
--- a/hadoop-common-project/hadoop-common/src/main/resources/core-default.xml
+++ b/hadoop-common-project/hadoop-common/src/main/resources/core-default.xml
@@ -1712,6 +1712,118 @@
 
+<property>
+  <name>fs.s3a.select.enabled</name>
+  <value>true</value>
+  <description>Is S3 Select enabled?</description>
+</property>
+
+<property>
+  <name>fs.s3a.select.input.csv.comment.marker</name>
+  <value>#</value>
+  <description>In S3 Select queries: the marker for comment lines in CSV files.</description>
+</property>
+
+<property>
+  <name>fs.s3a.select.input.csv.record.delimiter</name>
+  <value>\n</value>
+  <description>In S3 Select queries over CSV files: the record delimiter.
+    \t is remapped to the TAB character, \r to CR, \n to newline, \\ to \
+    and \" to ".
+  </description>
+</property>
+
+<property>
+  <name>fs.s3a.select.input.csv.field.delimiter</name>
+  <value>,</value>
+  <description>In S3 Select queries over CSV files: the field delimiter.
+    \t is remapped to the TAB character, \r to CR, \n to newline, \\ to \
+    and \" to ".
+  </description>
+</property>
+
+<property>
+  <name>fs.s3a.select.input.csv.quote.character</name>
+  <value>"</value>
+  <description>In S3 Select queries over CSV files: the quote character.
+    \t is remapped to the TAB character, \r to CR, \n to newline, \\ to \
+    and \" to ".
+  </description>
+</property>
+
+<property>
+  <name>fs.s3a.select.input.csv.quote.escape.character</name>
+  <value>\\</value>
+  <description>In S3 Select queries over CSV files: the quote escape character.
+    \t is remapped to the TAB character, \r to CR, \n to newline, \\ to \
+    and \" to ".
+  </description>
+</property>
+
+<property>
+  <name>fs.s3a.select.input.csv.header</name>
+  <value>none</value>
+  <description>In S3 Select queries over CSV files: what is the role of the
+    header? One of "none", "ignore" and "use".
+  </description>
+</property>
+
+<property>
+  <name>fs.s3a.select.input.compression</name>
+  <value>none</value>
+  <description>In S3 Select queries, the source compression
+    algorithm. One of: "none" and "gzip".
+  </description>
+</property>
+
+<property>
+  <name>fs.s3a.select.output.csv.quote.fields</name>
+  <value>always</value>
+  <description>
+    In S3 Select queries: should fields in generated CSV files be quoted?
+    One of: "always", "asneeded".
+  </description>
+</property>
+
+<property>
+  <name>fs.s3a.select.output.csv.quote.character</name>
+  <value>"</value>
+  <description>
+    In S3 Select queries: the quote character for generated CSV files.
+  </description>
+</property>
+
+<property>
+  <name>fs.s3a.select.output.csv.quote.escape.character</name>
+  <value>\\</value>
+  <description>
+    In S3 Select queries: the quote escape character for generated CSV files.
+  </description>
+</property>
+
+<property>
+  <name>fs.s3a.select.output.csv.record.delimiter</name>
+  <value>\n</value>
+  <description>
+    In S3 Select queries: the record delimiter for generated CSV files.
+  </description>
+</property>
+
+<property>
+  <name>fs.s3a.select.output.csv.field.delimiter</name>
+  <value>,</value>
+  <description>
+    In S3 Select queries: the field delimiter for generated CSV files.
+  </description>
+</property>
+
+<property>
+  <name>fs.s3a.select.errors.include.sql</name>
+  <value>false</value>
+  <description>
+    Include the SQL statement in errors: this is useful for development but
+    may leak security and Personally Identifying Information in production,
+    so must be disabled there.
+  </description>
+</property>
+
 <property>
   <name>fs.AbstractFileSystem.s3a.impl</name>
   <value>org.apache.hadoop.fs.s3a.S3A</value>
diff --git a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/filesystem.md b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/filesystem.md
index 28c6fbe240..3751847c7f 100644
--- a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/filesystem.md
+++ b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/filesystem.md
@@ -693,9 +693,94 @@ symbolic links
 exists in the metadata, but no copies of any its blocks can be located;
 -`FileNotFoundException` would seem more accurate and useful.
 
+### `FSDataInputStreamBuilder openFile(Path path)`
+
+Creates a [`FSDataInputStreamBuilder`](fsdatainputstreambuilder.html)
+to construct an operation to open the file at `path` for reading.
+
+When `build()` is invoked on the returned `FSDataInputStreamBuilder` instance,
+the builder parameters are verified and
+`openFileWithOptions(Path, Set<String>, Configuration, int)` invoked.
+
+This (protected) operation returns a `CompletableFuture<FSDataInputStream>`
+which, when its `get()` method is called, either returns an input
+stream of the contents of the opened file, or raises an exception.
+
+The base implementation of `openFileWithOptions(Path, Set<String>, Configuration, int)`
+ultimately invokes `open(Path, int)`.
+
+Thus the chain `openFile(path).build().get()` has the same preconditions
+and postconditions as `open(Path p, int bufferSize)`.
+
+The `openFile()` operation may check the state of the filesystem during this
+call, but as the state of the filesystem may change between this call and
+the actual `build()` and `get()` operations, these file-specific
+preconditions (file exists, file is readable, etc.) MUST NOT be checked here.
+
+FileSystem implementations which do not implement `open(Path, int)`
+MAY postpone raising an `UnsupportedOperationException` until either the
+`FSDataInputStreamBuilder.build()` or the subsequent `get()` call,
+or they MAY fail fast in the `openFile()` call.
+
+### Implementors notes
+
+The base implementation of `openFileWithOptions()` actually executes
+the `open(path)` operation synchronously, yet still returns the result
+or any failure in the `CompletableFuture<>`, so that callers see the
+same asynchronous API everywhere.
+
+Any filesystem where the time to open a file may be significant SHOULD
+execute it asynchronously by submitting the operation in some executor/thread
+pool. This is particularly recommended for object stores and other filesystems
+likely to be accessed over long-haul connections.
+
+Arbitrary filesystem-specific options MAY be supported; these MUST
+be prefixed with either the filesystem schema, e.g. `hdfs.`,
+or in the "fs.SCHEMA" format as normal configuration settings (e.g. `fs.hdfs.`). The
+latter style allows the same configuration option to be used for both
+filesystem configuration and file-specific configuration.
+
+It SHOULD be possible to always open a file without specifying any options,
+so as to present a consistent model to users. However, an implementation MAY
+opt to require one or more mandatory options to be set.
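+
+One way to implement such an asynchronous open, sketched here against the
+helper classes added in this patch; the `executor` field is a hypothetical
+thread pool owned by the filesystem, and a real implementation MUST also
+validate the mandatory keys:
+
+```java
+@Override
+protected CompletableFuture<FSDataInputStream> openFileWithOptions(
+    final Path path,
+    final Set<String> mandatoryKeys,
+    final Configuration options,
+    final int bufferSize) throws IOException {
+  // reject unknown mandatory keys here, before going asynchronous
+  final CompletableFuture<FSDataInputStream> result =
+      new CompletableFuture<>();
+  // "executor" is a hypothetical thread pool owned by this filesystem;
+  // LambdaUtils.eval() completes the future with the stream or the failure
+  executor.submit(() ->
+      LambdaUtils.eval(result, () -> open(path, bufferSize)));
+  return result;
+}
+```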
+
+### `FSDataInputStreamBuilder openFile(PathHandle)`
+
+Creates a [`FSDataInputStreamBuilder`](fsdatainputstreambuilder.html)
+to construct an operation to open the file identified by the given
+`PathHandle` for reading.
+
+When `build()` is invoked on the returned `FSDataInputStreamBuilder` instance,
+the builder parameters are verified and
+`openFileWithOptions(PathHandle, Set<String>, Configuration, int)` invoked.
+
+This (protected) operation returns a `CompletableFuture<FSDataInputStream>`
+which, when its `get()` method is called, either returns an input
+stream of the contents of the opened file, or raises an exception.
+
+The base implementation of the `openFileWithOptions(PathHandle, Set<String>, Configuration, int)`
+method returns a future which invokes `open(Path, int)`.
+
+Thus the chain `openFile(pathhandle).build().get()` has the same preconditions
+and postconditions as `open(PathHandle, int)`.
+
+As with `FSDataInputStreamBuilder openFile(Path path)`, the `openFile()`
+call must not be where path-specific preconditions are checked; that
+is postponed to the `build()` and `get()` calls.
+
+FileSystem implementations which do not implement `open(PathHandle handle, int bufferSize)`
+MAY postpone raising an `UnsupportedOperationException` until either the
+`FSDataInputStreamBuilder.build()` or the subsequent `get()` call,
+or they MAY fail fast in the `openFile()` call.
+
+The base implementation raises this exception in the `build()` operation;
+other implementations SHOULD copy this.
+
 ### `PathHandle getPathHandle(FileStatus stat, HandleOpt... options)`
 
-Implementaions without a compliant call MUST throw `UnsupportedOperationException`
+Implementations without a compliant call MUST throw `UnsupportedOperationException`
 
 #### Preconditions
diff --git a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/fsdatainputstream.md b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/fsdatainputstream.md
index 32eeb5b757..e067b078b3 100644
--- a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/fsdatainputstream.md
+++ b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/fsdatainputstream.md
@@ -200,6 +200,10 @@ Some FileSystems do not raise an exception if this condition is not met. They
 instead return -1 on any `read()` operation where, at the time of the read,
 `len(data(FSDIS)) < pos(FSDIS)`.
 
+After a failed seek, the value of `pos(FSDIS)` may change.
+As an example, seeking past the EOF may move the read position
+to the end of the file, *as well as raising an `EOFException`.*
+
 #### Postconditions
 
     FSDIS' = (s, data, True)
@@ -211,6 +215,16 @@ There is an implicit invariant: a seek to the current position is a no-op
 Implementations may recognise this operation and bypass all other precondition
 checks, leaving the input stream unchanged.
 
+The most recent connectors to object stores all implement some form
+of "lazy seek": the `seek()` call may appear to update the stream, and the value
+of `getPos()` is updated, but the file is not opened/reopened until
+data is actually read. Implementations of lazy seek MUST still validate
+the new seek position against the known length of the file.
+However, the state of the file (i.e. does it exist, and what
+its current length is) does not need to be refreshed at this point.
+The fact that a file has been deleted or truncated may not surface until
+a subsequent `read()` call.
+
 ### `Seekable.seekToNewSource(offset)`
diff --git a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/fsdatainputstreambuilder.md b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/fsdatainputstreambuilder.md
new file mode 100644
index 0000000000..f1beed862c
--- /dev/null
+++ b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/fsdatainputstreambuilder.md
@@ -0,0 +1,112 @@
+<!---
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License. See accompanying LICENSE file.
+-->
+
+# class `org.apache.hadoop.fs.FSDataInputStreamBuilder`
+
+An interface offering the Builder pattern for creating Java `Future`
+references to `FSDataInputStream` and its subclasses.
+It is used to initiate a (potentially asynchronous) operation to open an
+existing file for reading.
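+
+A hedged usage sketch: a caller which wants classic `IOException`-based
+error handling can await the future through the `FutureIOSupport` helper
+class added alongside this interface (the `fs`, `path` and `process()`
+names below are illustrative):
+
+```java
+FutureDataInputStreamBuilder builder = fs.openFile(path);
+try (FSDataInputStream in = FutureIOSupport.awaitFuture(builder.build())) {
+  // a FileNotFoundException surfaces here as itself,
+  // not wrapped in an ExecutionException
+  process(in);
+}
+```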
+
+## Invariants
+
+The `FSDataInputStreamBuilder` interface does not require parameters to
+be validated, or the state of `FileSystem` to be checked, until
+[`build()`](#build) is invoked and/or during the asynchronous open
+operation itself.
+
+Some aspects of the state of the filesystem MAY be checked in the initial
+`openFile()` call, provided they are known to be invariants which will not
+change between `openFile()` and the `build().get()` sequence. For example,
+path validation.
+
+## Implementation-agnostic parameters.
+
+### `FSDataInputStreamBuilder bufferSize(int bufSize)`
+
+Set the size of the buffer to be used.
+
+### Set optional or mandatory parameters
+
+    FSDataInputStreamBuilder opt(String key, ...)
+    FSDataInputStreamBuilder must(String key, ...)
+
+Set optional or mandatory parameters to the builder. Using `opt()` or `must()`,
+a client can specify FS-specific parameters without inspecting the concrete
+type of `FileSystem`.
+
+```java
+in = fs.openFile(path)
+    .opt("fs.s3a.experimental.fadvise", "random")
+    .must("fs.s3a.readahead.range", 256 * 1024)
+    .build()
+    .get();
+```
+
+#### Implementation Notes
+
+Checking for supported options must be performed in the `build()` operation.
+
+1. If a mandatory parameter declared via `must(key, value)` is not recognized,
+`IllegalArgumentException` MUST be thrown.
+
+1. If a mandatory parameter declared via `must(key, value)` relies on
+a feature which is recognized but not supported in the specific
+Filesystem/FileContext instance, `UnsupportedOperationException` MUST be
+thrown.
+
+The behavior of resolving the conflicts between the parameters set by
+builder methods (i.e., `bufferSize()`) and `opt()`/`must()` is as follows:
+
+> The last option specified defines the value and its optional/mandatory state.
+
+## Builder interface
+
+### `CompletableFuture<FSDataInputStream> build()`
+
+Return a `CompletableFuture<FSDataInputStream>` which, when successfully
+completed, returns an input stream which can read data from the filesystem.
+
+The `build()` operation MAY perform the validation of the file's existence
+and its kind, so rejecting attempts to read from a directory or non-existent
+file. **Alternatively**, the `build()` operation may delay all checks
+until an asynchronous operation whose outcome is provided by the `Future`.
+
+That is, the preconditions `exists(FS, path)` and `isFile(FS, path)` are
+only guaranteed to have been met after the `get()` on the returned future
+is successful.
+
+Thus, even if a file does not exist, the following call will still succeed,
+returning a future to be evaluated.
+
+```java
+Path p = new Path("file:///tmp/file-which-does-not-exist");
+
+CompletableFuture<FSDataInputStream> future = p.getFileSystem(conf)
+    .openFile(p)
+    .build();
+```
+
+The preconditions for opening the file are checked during the asynchronous
+evaluation, and so will surface when the future is completed:
+
+```java
+FSDataInputStream in = future.get();
+```
diff --git a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/fsdataoutputstreambuilder.md b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/fsdataoutputstreambuilder.md
index 4ea1fd168f..64dda2df8c 100644
--- a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/fsdataoutputstreambuilder.md
+++ b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/fsdataoutputstreambuilder.md
@@ -114,10 +114,12 @@ MUST verify that implementation-agnostic parameters (i.e., "syncable") or
 implementation-specific parameters (i.e., "foofs:cache") are supported.
`FileSystem` will satisfy optional parameters (via `opt(key, ...)`) on best effort. If the mandatory parameters (via `must(key, ...)`) can not be satisfied -in the `FileSystem`, `IllegalArgumentException` should be thrown in `build()`. +in the `FileSystem`, `IllegalArgumentException` must be thrown in `build()`. The behavior of resolving the conflicts between the parameters set by -builder methods (i.e., `bufferSize()`) and `opt()`/`must()` is undefined. +builder methods (i.e., `bufferSize()`) and `opt()`/`must()` is as follows: + +> The last option specified defines the value and its optional/mandatory state. ## HDFS-specific parameters. diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/FileContextMainOperationsBaseTest.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/FileContextMainOperationsBaseTest.java index c07a6ffa34..4c90490b09 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/FileContextMainOperationsBaseTest.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/FileContextMainOperationsBaseTest.java @@ -23,10 +23,13 @@ import java.io.IOException; import java.util.EnumSet; import java.util.NoSuchElementException; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.atomic.AtomicBoolean; import org.apache.hadoop.HadoopIllegalArgumentException; import org.apache.hadoop.fs.Options.CreateOpts; import org.apache.hadoop.fs.Options.Rename; +import org.apache.hadoop.fs.contract.ContractTestUtils; import org.apache.hadoop.fs.permission.FsPermission; import org.apache.hadoop.security.AccessControlException; import org.apache.hadoop.test.GenericTestUtils; @@ -40,6 +43,8 @@ import static org.apache.hadoop.fs.FileContextTestHelper.*; import static org.apache.hadoop.fs.CreateFlag.*; +import static org.apache.hadoop.test.LambdaTestUtils.intercept; +import static org.apache.hadoop.test.LambdaTestUtils.interceptFuture; /** *

    @@ -1326,13 +1331,10 @@ public void testOpen2() throws IOException { final Path path = new Path(rootPath, "zoo"); createFile(path); final long length = fc.getFileStatus(path).getLen(); - FSDataInputStream fsdis = fc.open(path, 2048); - try { - byte[] bb = new byte[(int)length]; + try (FSDataInputStream fsdis = fc.open(path, 2048)) { + byte[] bb = new byte[(int) length]; fsdis.readFully(bb); assertArrayEquals(data, bb); - } finally { - fsdis.close(); } } @@ -1452,4 +1454,87 @@ public void testGetFileContext1() throws IOException { private Path getTestRootPath(FileContext fc, String pathString) { return fileContextTestHelper.getTestRootPath(fc, pathString); } + + /** + * Create a path under the test path. + * @param filepath path string in + * @return a path qualified by the test filesystem + * @throws IOException IO problems + */ + protected Path path(String filepath) throws IOException { + return getTestRootPath(fc, filepath); + } + + /** + * Describe a test. This is a replacement for javadocs + * where the tests role is printed in the log output + * @param text description + */ + protected void describe(String text) { + LOG.info(text); + } + + @Test + public void testOpenFileRead() throws Exception { + final Path path = path("testOpenFileRead"); + createFile(path); + final long length = fc.getFileStatus(path).getLen(); + try (FSDataInputStream fsdis = fc.openFile(path) + .opt("fs.test.something", true) + .opt("fs.test.something2", 3) + .opt("fs.test.something3", "3") + .build().get()) { + byte[] bb = new byte[(int) length]; + fsdis.readFully(bb); + assertArrayEquals(data, bb); + } + } + + @Test + public void testOpenFileUnknownOption() throws Throwable { + describe("calling openFile fails when a 'must()' option is unknown"); + + final Path path = path("testOpenFileUnknownOption"); + FutureDataInputStreamBuilder builder = + fc.openFile(path) + .opt("fs.test.something", true) + .must("fs.test.something", true); + intercept(IllegalArgumentException.class, + () -> builder.build()); + } + + @Test + public void testOpenFileLazyFail() throws Throwable { + describe("openFile fails on a missing file in the get() and not before"); + FutureDataInputStreamBuilder builder = + fc.openFile(path("testOpenFileUnknownOption")) + .opt("fs.test.something", true); + interceptFuture(FileNotFoundException.class, "", builder.build()); + } + + @Test + public void testOpenFileApplyRead() throws Throwable { + describe("use the apply sequence"); + Path path = path("testOpenFileApplyRead"); + createFile(path); + CompletableFuture readAllBytes = fc.openFile(path) + .build() + .thenApply(ContractTestUtils::readStream); + assertEquals("Wrong number of bytes read from stream", + data.length, + (long)readAllBytes.get()); + } + + @Test + public void testOpenFileApplyAsyncRead() throws Throwable { + describe("verify that async accept callbacks are evaluated"); + Path path = path("testOpenFileApplyAsyncRead"); + createFile(path); + CompletableFuture future = fc.openFile(path).build(); + AtomicBoolean accepted = new AtomicBoolean(false); + future.thenAcceptAsync(i -> accepted.set(true)).get(); + assertTrue("async accept operation not invoked", + accepted.get()); + } + } diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestHarFileSystem.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestHarFileSystem.java index b3021a9be7..57798c2c8b 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestHarFileSystem.java +++ 
b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestHarFileSystem.java @@ -40,6 +40,8 @@ import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Set; +import java.util.concurrent.CompletableFuture; import static org.apache.hadoop.fs.Options.ChecksumOpt; import static org.apache.hadoop.fs.Options.CreateOpts; @@ -230,6 +232,24 @@ public Collection getAllStoragePolicies() public Collection getTrashRoots(boolean allUsers) throws IOException; StorageStatistics getStorageStatistics(); + + FutureDataInputStreamBuilder openFile(Path path) + throws IOException, UnsupportedOperationException; + + FutureDataInputStreamBuilder openFile(PathHandle pathHandle) + throws IOException, UnsupportedOperationException; + + CompletableFuture openFileWithOptions( + PathHandle pathHandle, + Set mandatoryKeys, + Configuration options, + int bufferSize) throws IOException; + + CompletableFuture openFileWithOptions( + Path path, + Set mandatoryKeys, + Configuration options, + int bufferSize) throws IOException; } @Test diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestLocalFileSystem.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestLocalFileSystem.java index d5622af085..fae3db83cf 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestLocalFileSystem.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestLocalFileSystem.java @@ -729,7 +729,7 @@ private static class BuilderWithSupportedKeys } @Override - protected BuilderWithSupportedKeys getThisBuilder() { + public BuilderWithSupportedKeys getThisBuilder() { return this; } diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractOpenTest.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractOpenTest.java index d475c6e643..c9283dcd52 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractOpenTest.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractOpenTest.java @@ -19,22 +19,30 @@ import java.io.FileNotFoundException; import java.io.IOException; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.CommonConfigurationKeysPublic; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.FutureDataInputStreamBuilder; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.impl.FutureIOSupport; import org.apache.hadoop.io.IOUtils; import static org.apache.hadoop.fs.contract.ContractTestUtils.createFile; import static org.apache.hadoop.fs.contract.ContractTestUtils.dataset; import static org.apache.hadoop.fs.contract.ContractTestUtils.touch; +import static org.apache.hadoop.test.LambdaTestUtils.intercept; +import static org.apache.hadoop.test.LambdaTestUtils.interceptFuture; import org.junit.Test; /** - * Test Seek operations + * Test Open operations. 
*/ public abstract class AbstractContractOpenTest extends AbstractFSContractTestBase { @@ -63,8 +71,7 @@ public void testOpenReadZeroByteFile() throws Throwable { instream = getFileSystem().open(path); assertEquals(0, instream.getPos()); //expect initial read to fail - int result = instream.read(); - assertMinusOne("initial byte read", result); + assertMinusOne("initial byte read", instream.read()); } @Test @@ -163,4 +170,126 @@ public void testSequentialRead() throws Throwable { instream.close(); } + @Test + public void testOpenFileReadZeroByte() throws Throwable { + describe("create & read a 0 byte file through the builders"); + Path path = path("zero.txt"); + FileSystem fs = getFileSystem(); + fs.createFile(path).overwrite(true).build().close(); + try (FSDataInputStream is = fs.openFile(path) + .opt("fs.test.something", true) + .opt("fs.test.something2", 3) + .opt("fs.test.something3", "3") + .build().get()) { + assertMinusOne("initial byte read", is.read()); + } + } + + @Test + public void testOpenFileUnknownOption() throws Throwable { + describe("calling openFile fails when a 'must()' option is unknown"); + FutureDataInputStreamBuilder builder = + getFileSystem().openFile(path("testOpenFileUnknownOption")) + .opt("fs.test.something", true) + .must("fs.test.something", true); + intercept(IllegalArgumentException.class, + () -> builder.build()); + } + + @Test + public void testOpenFileLazyFail() throws Throwable { + describe("openFile fails on a missing file in the get() and not before"); + FutureDataInputStreamBuilder builder = + getFileSystem().openFile(path("testOpenFileLazyFail")) + .opt("fs.test.something", true); + interceptFuture(FileNotFoundException.class, "", builder.build()); + } + + @Test + public void testOpenFileFailExceptionally() throws Throwable { + describe("openFile missing file chains into exceptionally()"); + FutureDataInputStreamBuilder builder = + getFileSystem().openFile(path("testOpenFileFailExceptionally")) + .opt("fs.test.something", true); + assertNull("exceptional uprating", + builder.build().exceptionally(ex -> null).get()); + } + + @Test + public void testAwaitFutureFailToFNFE() throws Throwable { + describe("Verify that FutureIOSupport.awaitFuture extracts IOExceptions"); + FutureDataInputStreamBuilder builder = + getFileSystem().openFile(path("testAwaitFutureFailToFNFE")) + .opt("fs.test.something", true); + intercept(FileNotFoundException.class, + () -> FutureIOSupport.awaitFuture(builder.build())); + } + + @Test + public void testAwaitFutureTimeoutFailToFNFE() throws Throwable { + describe("Verify that FutureIOSupport.awaitFuture with a timeout works"); + FutureDataInputStreamBuilder builder = + getFileSystem().openFile(path("testAwaitFutureFailToFNFE")) + .opt("fs.test.something", true); + intercept(FileNotFoundException.class, + () -> FutureIOSupport.awaitFuture(builder.build(), + 10, TimeUnit.DAYS)); + } + + @Test + public void testOpenFileExceptionallyTranslating() throws Throwable { + describe("openFile missing file chains into exceptionally()"); + CompletableFuture f = getFileSystem() + .openFile(path("testOpenFileUnknownOption")).build(); + interceptFuture(RuntimeException.class, + "exceptionally", + f.exceptionally(ex -> { + throw new RuntimeException("exceptionally", ex); + })); + } + + @Test + public void testChainedFailureAwaitFuture() throws Throwable { + describe("await Future handles chained failures"); + CompletableFuture f = getFileSystem() + .openFile(path("testOpenFileUnknownOption")) + .build(); + intercept(RuntimeException.class, + 
"exceptionally", + () -> FutureIOSupport.awaitFuture( + f.exceptionally(ex -> { + throw new RuntimeException("exceptionally", ex); + }))); + } + + @Test + public void testOpenFileApplyRead() throws Throwable { + describe("use the apply sequence to read a whole file"); + Path path = path("testOpenFileApplyRead"); + FileSystem fs = getFileSystem(); + int len = 4096; + createFile(fs, path, true, + dataset(len, 0x40, 0x80)); + CompletableFuture readAllBytes = fs.openFile(path) + .build() + .thenApply(ContractTestUtils::readStream); + assertEquals("Wrong number of bytes read value", + len, + (long) readAllBytes.get()); + } + + @Test + public void testOpenFileApplyAsyncRead() throws Throwable { + describe("verify that async accept callbacks are evaluated"); + Path path = path("testOpenFileApplyAsyncRead"); + FileSystem fs = getFileSystem(); + createFile(fs, path, true, + dataset(4, 0x40, 0x80)); + CompletableFuture future = fs.openFile(path).build(); + AtomicBoolean accepted = new AtomicBoolean(false); + future.thenAcceptAsync(i -> accepted.set(true)).get(); + assertTrue("async accept operation not invoked", + accepted.get()); + } + } diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractPathHandleTest.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractPathHandleTest.java index 36cfa6ccda..17043dca93 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractPathHandleTest.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/AbstractContractPathHandleTest.java @@ -17,16 +17,19 @@ */ package org.apache.hadoop.fs.contract; +import java.io.FileNotFoundException; import java.io.IOException; import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; +import java.util.concurrent.CompletableFuture; import java.util.stream.Collectors; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.InvalidPathHandleException; import org.apache.hadoop.fs.Options.HandleOpt; import org.apache.hadoop.fs.Path; @@ -38,6 +41,7 @@ import static org.apache.hadoop.fs.contract.ContractTestUtils.verifyRead; import static org.apache.hadoop.fs.contract.ContractTestUtils.verifyFileContents; import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_FILE_BUFFER_SIZE_KEY; +import static org.apache.hadoop.test.LambdaTestUtils.interceptFuture; import org.apache.hadoop.fs.RawPathHandle; import org.junit.Test; @@ -249,4 +253,61 @@ protected PathHandle getHandleOrSkip(FileStatus stat) { // unreachable return null; } + + + @Test + public void testOpenFileApplyRead() throws Throwable { + describe("use the apply sequence to read a whole file"); + CompletableFuture readAllBytes = getFileSystem() + .openFile( + getHandleOrSkip( + testFile(B1))) + .build() + .thenApply(ContractTestUtils::readStream); + assertEquals("Wrong number of bytes read value", + TEST_FILE_LEN, + (long) readAllBytes.get()); + } + + @Test + public void testOpenFileDelete() throws Throwable { + describe("use the apply sequence to read a whole file"); + FileStatus testFile = testFile(B1); + PathHandle handle = getHandleOrSkip(testFile); + // delete that file + FileSystem fs = getFileSystem(); + fs.delete(testFile.getPath(), false); + // now construct the builder. 
+ // even if the open happens in the build operation, + // the failure must not surface until later. + CompletableFuture builder = + fs.openFile(handle) + .opt("fs.test.something", true) + .build(); + IOException ioe = interceptFuture(IOException.class, "", builder); + if (!(ioe instanceof FileNotFoundException) + && !(ioe instanceof InvalidPathHandleException)) { + // support both FileNotFoundException + // and InvalidPathHandleException as different implementations + // support either -and with non-atomic open sequences, possibly + // both + throw ioe; + } + } + + @Test + public void testOpenFileLazyFail() throws Throwable { + describe("openFile fails on a misssng file in the get() and not before"); + FileStatus stat = testFile(B1); + CompletableFuture readAllBytes = getFileSystem() + .openFile( + getHandleOrSkip( + stat)) + .build() + .thenApply(ContractTestUtils::readStream); + assertEquals("Wrong number of bytes read value", + TEST_FILE_LEN, + (long) readAllBytes.get()); + } + } diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/ContractTestUtils.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/ContractTestUtils.java index ba1204848a..b4db3a5803 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/ContractTestUtils.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/contract/ContractTestUtils.java @@ -1482,6 +1482,37 @@ public static void assertCapabilities( } } + /** + * Function which calls {@code InputStream.read()} and + * downgrades an IOE to a runtime exception. + * @param in input + * @return the read value + * @throws AssertionError on any IOException + */ + public static int read(InputStream in) { + try { + return in.read(); + } catch (IOException ex) { + throw new AssertionError(ex); + } + } + + /** + * Read a whole stream; downgrades an IOE to a runtime exception. + * @param in input + * @return the number of bytes read. + * @throws AssertionError on any IOException + */ + public static long readStream(InputStream in) { + long count = 0; + + while (read(in) >= 0) { + count++; + } + return count; + } + + /** * Results of recursive directory creation/scan operations. */ diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/test/LambdaTestUtils.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/test/LambdaTestUtils.java index 67df6daa1a..c1b6cc4081 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/test/LambdaTestUtils.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/test/LambdaTestUtils.java @@ -30,6 +30,10 @@ import java.security.PrivilegedExceptionAction; import java.util.Optional; import java.util.concurrent.Callable; +import java.util.concurrent.CancellationException; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; /** @@ -690,6 +694,132 @@ public static void doAs(UserGroupInformation user, VoidCallable eval) user.doAs(new PrivilegedVoidOperation(eval)); } + /** + * Expect a future to raise a specific exception class when evaluated, + * looking inside the raised {@code ExecutionException} for it. + * @param clazz class of exception; the nested exception must be this class + * or a subclass. + * + * This is simply an unwrapping of the outcome of the future. 
+ * + * If an exception is not raised, the return value of the {@code get()} + * call is included in the exception string. + * + * If the nested cause of the raised ExecutionException is not an + * Exception (i.e its an error), then the outer ExecutionException is + * rethrown. + * This keeps the operation signatures in sync. + * + * @param contained string which must be in the {@code toString()} value + * of the exception + * @param future future to get + * @param return type of expression + * @param exception class + * @return the caught exception if it was of the expected type and contents + * @throws AssertionError if the evaluation call didn't raise an exception. + * The error includes the {@code toString()} value of the result, if this + * can be determined. + * @throws CancellationException if the computation was cancelled + * @throws ExecutionException if the raised exception didn't contain an + * exception. + * @throws InterruptedException if the current thread was interrupted + * @throws TimeoutException if the wait timed out + * @throws Exception if the wrong exception was raised, or there was + * a text mismatch. + */ + public static E interceptFuture( + Class clazz, + String contained, + Future future) throws Exception { + return intercept(clazz, + contained, + () -> { + try { + return future.get(); + } catch (ExecutionException e) { + Throwable cause = e.getCause(); + if (cause instanceof Exception) { + throw (Exception) cause; + } else { + throw e; + } + } + }); + } + + /** + * Expect a future to raise a specific exception class when evaluated, + * looking inside the raised {@code ExecutionException} for it. + * @param clazz class of exception; the nested exception must be this class + * or a subclass. + * + * This is simply an unwrapping of the outcome of the future. + * + * If an exception is not raised, the return value of the {@code get()} + * call is included in the exception string. + * + * If the nested cause of the raised ExecutionException is not an + * Exception (i.e its an error), then the outer ExecutionException is + * rethrown. + * This keeps the operation signatures in sync. + * + * @param contained string which must be in the {@code toString()} value + * of the exception + * @param future future to get + * @param return type of expression + * @param exception class + * @return the caught exception if it was of the expected type and contents + * @throws AssertionError if the evaluation call didn't raise an exception. + * The error includes the {@code toString()} value of the result, if this + * can be determined. + * @throws CancellationException if the computation was cancelled + * @throws ExecutionException if the raised exception didn't contain an + * exception. + * @throws InterruptedException if the current thread was interrupted + * @throws TimeoutException if the wait timed out + * @throws Exception if the wrong exception was raised, or there was + * a text mismatch. + */ + public static E interceptFuture( + final Class clazz, + final String contained, + final long timeout, + final TimeUnit tu, + final Future future) throws Exception { + return intercept(clazz, + contained, + () -> { + try { + return future.get(timeout, tu); + } catch (ExecutionException e) { + Throwable cause = e.getCause(); + if (cause instanceof Exception) { + throw (Exception) cause; + } else { + throw e; + } + } + }); + } + + /** + * Verify that the cause of an exception is of the given type. 
+ * @param exception class + * @param caught caught exception + * @return the extracted exception if it is of the expect type. + * @throws Exception the outer exception if there is no inner/wrong type + */ + public static E verifyCause( + Class clazz, + final Throwable caught) throws Throwable { + Throwable cause = caught.getCause(); + if (cause == null || !clazz.isAssignableFrom(cause.getClass())) { + throw caught; + } else { + return (E) caught; + } + } + /** * Returns {@code TimeoutException} on a timeout. If * there was a inner class passed in, includes it as the diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/test/TestLambdaTestUtils.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/test/TestLambdaTestUtils.java index 694fe73724..479dd35b0a 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/test/TestLambdaTestUtils.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/test/TestLambdaTestUtils.java @@ -24,6 +24,10 @@ import java.io.FileNotFoundException; import java.io.IOException; import java.util.concurrent.Callable; +import java.util.concurrent.CancellationException; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; import java.util.concurrent.atomic.AtomicInteger; @@ -516,17 +520,105 @@ public void testEvalDoesntWrapRTEs() throws Throwable { */ @Test public void testEvalDoesWrapIOEs() throws Throwable { - AssertionError ex = intercept(AssertionError.class, "ioe", - () -> eval(() -> { - throw new IOException("ioe"); - })); - Throwable cause = ex.getCause(); - if (cause == null) { - throw ex; - } - if (!(cause instanceof IOException)) { - throw cause; - } + verifyCause(IOException.class, + intercept(AssertionError.class, "ioe", + () -> eval(() -> { + throw new IOException("ioe"); + }))); + } + + @Test + public void testInterceptFutureUnwrapped() throws Throwable { + CompletableFuture future = new CompletableFuture<>(); + future.completeExceptionally(new IOException("oops")); + interceptFuture(IOException.class, "oops", future); + } + + @Test + public void testInterceptFutureWrongException() throws Throwable { + CompletableFuture future = new CompletableFuture<>(); + future.completeExceptionally(new RuntimeException("oops")); + intercept(RuntimeException.class, + "oops", + () -> interceptFuture(IOException.class, "", future)); + } + + @Test + public void testInterceptFutureNotAnException() throws Throwable { + CompletableFuture future = new CompletableFuture<>(); + future.completeExceptionally(new Error("oops")); + verifyCause(Error.class, + intercept(ExecutionException.class, + "oops", + () -> interceptFuture(IOException.class, "", future))); + } + + /** + * Variant for exception catching. + */ + @Test + public void testInterceptFutureNotAnException2() throws Throwable { + CompletableFuture future = new CompletableFuture<>(); + future.completeExceptionally(new Error("oops")); + verifyCause(Error.class, + interceptFuture(ExecutionException.class, "", future)); + } + + @Test + public void testInterceptFutureNoFailures() throws Throwable { + CompletableFuture future = new CompletableFuture<>(); + future.complete("happy"); + intercept(AssertionError.class, + "happy", + () -> interceptFuture(IOException.class, "oops", future)); + } + + /** + * This will timeout immediately and raise a TimeoutException. 
+ */ + @Test + public void testInterceptFutureTimeout() throws Throwable { + CompletableFuture future = new CompletableFuture<>(); + intercept(TimeoutException.class, + "", + () -> interceptFuture(IOException.class, "oops", + 1, TimeUnit.NANOSECONDS, + future)); + } + + /** + * This will timeout immediately and raise a TimeoutException. + */ + @Test + public void testInterceptFutureTimeout2() throws Throwable { + CompletableFuture future = new CompletableFuture<>(); + interceptFuture(TimeoutException.class, "", + 1, TimeUnit.NANOSECONDS, + future); + } + + /** + * This will timeout immediately and raise a TimeoutException. + */ + @Test + public void testInterceptFutureTimeoutSuccess() throws Throwable { + CompletableFuture future = new CompletableFuture<>(); + future.completeExceptionally(new IOException("oops")); + interceptFuture(IOException.class, "oops", + 1, TimeUnit.NANOSECONDS, + future); + } + + /** + * This will timeout immediately and raise a TimeoutException. + */ + @Test + public void testInterceptFutureCancelled() throws Throwable { + CompletableFuture future = new CompletableFuture<>(); + future.cancel(false); + interceptFuture(CancellationException.class, "", + 1, TimeUnit.NANOSECONDS, + future); } } diff --git a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DistributedFileSystem.java b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DistributedFileSystem.java index 48d50e5e38..ed37f1dcdf 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DistributedFileSystem.java +++ b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DistributedFileSystem.java @@ -3195,7 +3195,7 @@ private HdfsDataOutputStreamBuilder(DistributedFileSystem dfs, Path path) { } @Override - protected HdfsDataOutputStreamBuilder getThisBuilder() { + public HdfsDataOutputStreamBuilder getThisBuilder() { return this; } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/fs/contract/hdfs/TestHDFSContractOpen.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/fs/contract/hdfs/TestHDFSContractOpen.java index 125e8eec93..0d9e810320 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/fs/contract/hdfs/TestHDFSContractOpen.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/fs/contract/hdfs/TestHDFSContractOpen.java @@ -27,7 +27,7 @@ import java.io.IOException; /** - * Test dir operations on a the local FS. + * Test Open operations on HDFS. 
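The new `interceptFuture()` probes pair naturally with the asynchronous `openFile()` call introduced by this patch. Below is a minimal sketch (not part of the patch) of how a contract-style test might use it against a missing path; the class name, method name and path are hypothetical, and whether the failure surfaces in `build()` or only when the future is evaluated depends on the filesystem.

```java
import java.io.FileNotFoundException;
import java.util.concurrent.CompletableFuture;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import static org.apache.hadoop.test.LambdaTestUtils.interceptFuture;

public class OpenFileFailureSketch {

  /** Assumes the FS raises FileNotFoundException lazily, in the future. */
  public void expectOpenFileToFail(FileSystem fs) throws Exception {
    CompletableFuture<FSDataInputStream> future =
        fs.openFile(new Path("/no/such/file")).build();
    // unwraps the ExecutionException and verifies the nested cause
    interceptFuture(FileNotFoundException.class, "", future);
  }
}
```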
*/ public class TestHDFSContractOpen extends AbstractContractOpenTest { diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/LineRecordReader.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/LineRecordReader.java index bfc6c0e855..1fcb118a10 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/LineRecordReader.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapred/LineRecordReader.java @@ -25,9 +25,10 @@ import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; -import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.FutureDataInputStreamBuilder; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Seekable; +import org.apache.hadoop.fs.impl.FutureIOSupport; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.compress.CodecPool; @@ -36,6 +37,7 @@ import org.apache.hadoop.io.compress.Decompressor; import org.apache.hadoop.io.compress.SplitCompressionInputStream; import org.apache.hadoop.io.compress.SplittableCompressionCodec; +import org.apache.hadoop.mapreduce.MRJobConfig; import org.apache.hadoop.mapreduce.lib.input.CompressedSplitLineReader; import org.apache.hadoop.mapreduce.lib.input.SplitLineReader; import org.apache.hadoop.mapreduce.lib.input.UncompressedSplitLineReader; @@ -105,8 +107,12 @@ public LineRecordReader(Configuration job, FileSplit split, codec = compressionCodecs.getCodec(file); // open the file and seek to the start of the split - final FileSystem fs = file.getFileSystem(job); - fileIn = fs.open(file); + final FutureDataInputStreamBuilder builder = + file.getFileSystem(job).openFile(file); + FutureIOSupport.propagateOptions(builder, job, + MRJobConfig.INPUT_FILE_OPTION_PREFIX, + MRJobConfig.INPUT_FILE_MANDATORY_PREFIX); + fileIn = FutureIOSupport.awaitFuture(builder.build()); if (isCompressedInput()) { decompressor = CodecPool.getDecompressor(codec); if (codec instanceof SplittableCompressionCodec) { diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/MRJobConfig.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/MRJobConfig.java index b36b5ce5c6..fb346983c7 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/MRJobConfig.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/MRJobConfig.java @@ -1226,4 +1226,18 @@ public interface MRJobConfig { MR_AM_STAGING_DIR + ".erasurecoding.enabled"; boolean DEFAULT_MR_AM_STAGING_ERASURECODING_ENABLED = false; + + /** + * Prefix for options which are passed in to the filesystem + * after converting the subsequent dotted element to the schema. + */ + @Unstable + String INPUT_FILE_OPTION_PREFIX = "mapreduce.job.input.file.option."; + + /** + * Prefix for mandatory options which are passed in to the filesystem + * after converting the subsequent dotted element to the schema. 
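As a concrete illustration of these prefixes (a sketch, not part of the patch): a job can tune the input streams its record readers open by setting prefixed keys, which `FutureIOSupport.propagateOptions()` strips of the prefix and forwards to the `openFile()` builder. The S3A option names below are taken from the S3A configuration and the readahead value is arbitrary; treat both as assumptions for illustration.

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.MRJobConfig;

public final class S3AInputTuning {
  private S3AInputTuning() {
  }

  /** Attach S3A-specific read options to a job configuration. */
  public static void tuneS3AInput(Configuration jobConf) {
    // optional hint: forwarded via builder.opt(), so filesystems which
    // do not recognise the key simply ignore it
    jobConf.set(MRJobConfig.INPUT_FILE_OPTION_PREFIX
        + "fs.s3a.experimental.input.fadvise", "sequential");
    // mandatory option: forwarded via builder.must(); a store which does
    // not recognise the key will reject the open
    jobConf.set(MRJobConfig.INPUT_FILE_MANDATORY_PREFIX
        + "fs.s3a.readahead.range", "524288");
  }
}
```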
+ */ + @Unstable + String INPUT_FILE_MANDATORY_PREFIX = "mapreduce.job.input.file.must."; } diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/FixedLengthRecordReader.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/FixedLengthRecordReader.java index 71b2b79bea..c0ae9a5cda 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/FixedLengthRecordReader.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/FixedLengthRecordReader.java @@ -25,9 +25,10 @@ import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; -import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.FutureDataInputStreamBuilder; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Seekable; +import org.apache.hadoop.fs.impl.FutureIOSupport; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.compress.CodecPool; @@ -36,6 +37,7 @@ import org.apache.hadoop.io.compress.CompressionInputStream; import org.apache.hadoop.io.compress.Decompressor; import org.apache.hadoop.mapreduce.InputSplit; +import org.apache.hadoop.mapreduce.MRJobConfig; import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.slf4j.Logger; @@ -89,9 +91,13 @@ public void initialize(Configuration job, long splitStart, long splitLength, numBytesToSkip = recordLength - partialRecordLength; } - // open the file and seek to the start of the split - final FileSystem fs = file.getFileSystem(job); - fileIn = fs.open(file); + // open the file + final FutureDataInputStreamBuilder builder = + file.getFileSystem(job).openFile(file); + FutureIOSupport.propagateOptions(builder, job, + MRJobConfig.INPUT_FILE_OPTION_PREFIX, + MRJobConfig.INPUT_FILE_MANDATORY_PREFIX); + fileIn = FutureIOSupport.awaitFuture(builder.build()); CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file); if (null != codec) { diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/LineRecordReader.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/LineRecordReader.java index ca85982e0a..160c763565 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/LineRecordReader.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/LineRecordReader.java @@ -24,9 +24,10 @@ import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; -import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.FutureDataInputStreamBuilder; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Seekable; +import org.apache.hadoop.fs.impl.FutureIOSupport; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.compress.CodecPool; @@ -36,6 +37,7 @@ 
import org.apache.hadoop.io.compress.CompressionCodecFactory; import org.apache.hadoop.io.compress.Decompressor; import org.apache.hadoop.mapreduce.InputSplit; +import org.apache.hadoop.mapreduce.MRJobConfig; import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.slf4j.Logger; @@ -82,8 +84,12 @@ public void initialize(InputSplit genericSplit, final Path file = split.getPath(); // open the file and seek to the start of the split - final FileSystem fs = file.getFileSystem(job); - fileIn = fs.open(file); + final FutureDataInputStreamBuilder builder = + file.getFileSystem(job).openFile(file); + FutureIOSupport.propagateOptions(builder, job, + MRJobConfig.INPUT_FILE_OPTION_PREFIX, + MRJobConfig.INPUT_FILE_MANDATORY_PREFIX); + fileIn = FutureIOSupport.awaitFuture(builder.build()); CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file); if (null!=codec) { diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/NLineInputFormat.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/NLineInputFormat.java index 758996165f..dfff9ad0d2 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/NLineInputFormat.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/input/NLineInputFormat.java @@ -27,13 +27,15 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.FutureDataInputStreamBuilder; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.impl.FutureIOSupport; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.JobContext; +import org.apache.hadoop.mapreduce.MRJobConfig; import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.util.LineReader; @@ -93,10 +95,14 @@ public static List getSplitsForFile(FileStatus status, if (status.isDirectory()) { throw new IOException("Not a file: " + fileName); } - FileSystem fs = fileName.getFileSystem(conf); LineReader lr = null; try { - FSDataInputStream in = fs.open(fileName); + final FutureDataInputStreamBuilder builder = + fileName.getFileSystem(conf).openFile(fileName); + FutureIOSupport.propagateOptions(builder, conf, + MRJobConfig.INPUT_FILE_OPTION_PREFIX, + MRJobConfig.INPUT_FILE_MANDATORY_PREFIX); + FSDataInputStream in = FutureIOSupport.awaitFuture(builder.build()); lr = new LineReader(in, conf); Text line = new Text(); int numLines = 0; diff --git a/hadoop-tools/hadoop-aws/dev-support/findbugs-exclude.xml b/hadoop-tools/hadoop-aws/dev-support/findbugs-exclude.xml index 855aac974c..bb6808f0f6 100644 --- a/hadoop-tools/hadoop-aws/dev-support/findbugs-exclude.xml +++ b/hadoop-tools/hadoop-aws/dev-support/findbugs-exclude.xml @@ -63,5 +63,10 @@ + + + + + diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/InternalConstants.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/InternalConstants.java new file mode 100644 index 
0000000000..bcf894f96b --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/InternalConstants.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a; + +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; + +/** + * Constants for internal use in the org.apache.hadoop.fs.s3a module itself. + * Please don't refer to these outside of this module & its tests. + * If you find you need to then either the code is doing something it + * should not, or these constants need to be uprated to being + * public and stable entries. + */ +@InterfaceAudience.Private +public final class InternalConstants { + + private InternalConstants() { + } + + /** + * The known keys used in a standard openFile call. + * if there's a select marker in there then the keyset + * used becomes that of the select operation. 
+ */ + @InterfaceStability.Unstable + public static final Set STANDARD_OPENFILE_KEYS = + Collections.unmodifiableSet( + new HashSet<>( + Arrays.asList(Constants.INPUT_FADVISE, + Constants.READAHEAD_RANGE))); +} diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java index eb055dc6bc..031a80be1d 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java @@ -39,6 +39,7 @@ import java.util.Optional; import java.util.Set; import java.util.Objects; +import java.util.concurrent.CompletableFuture; import java.util.concurrent.ExecutorService; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.ThreadPoolExecutor; @@ -87,6 +88,8 @@ import org.apache.hadoop.fs.CreateFlag; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.s3a.select.InternalSelectConstants; +import org.apache.hadoop.util.LambdaUtils; import org.apache.hadoop.fs.FileAlreadyExistsException; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; @@ -111,6 +114,8 @@ import org.apache.hadoop.fs.s3a.commit.CommitConstants; import org.apache.hadoop.fs.s3a.commit.PutTracker; import org.apache.hadoop.fs.s3a.commit.MagicCommitIntegration; +import org.apache.hadoop.fs.s3a.select.SelectBinding; +import org.apache.hadoop.fs.s3a.select.SelectConstants; import org.apache.hadoop.fs.s3a.s3guard.DirListingMetadata; import org.apache.hadoop.fs.s3a.s3guard.MetadataStoreListFilesIterator; import org.apache.hadoop.fs.s3a.s3guard.MetadataStore; @@ -126,6 +131,7 @@ import org.apache.hadoop.util.ReflectionUtils; import org.apache.hadoop.util.SemaphoredDelegatingExecutor; +import static org.apache.hadoop.fs.impl.AbstractFSBuilderImpl.rejectUnknownMandatoryKeys; import static org.apache.hadoop.fs.s3a.Constants.*; import static org.apache.hadoop.fs.s3a.Invoker.*; import static org.apache.hadoop.fs.s3a.S3AUtils.*; @@ -168,6 +174,7 @@ public class S3AFileSystem extends FileSystem implements StreamCapabilities, * retryable results in files being deleted. */ public static final boolean DELETE_CONSIDERED_IDEMPOTENT = true; + private URI uri; private Path workingDir; private String username; @@ -224,6 +231,7 @@ public class S3AFileSystem extends FileSystem implements StreamCapabilities, private S3ADataBlocks.BlockFactory blockFactory; private int blockOutputActiveBlocks; private WriteOperationHelper writeHelper; + private SelectBinding selectBinding; private boolean useListV1; private MagicCommitIntegration committerIntegration; @@ -361,6 +369,9 @@ public void initialize(URI name, Configuration originalConf) committerIntegration = new MagicCommitIntegration( this, magicCommitterEnabled); + // instantiate S3 Select support + selectBinding = new SelectBinding(writeHelper); + boolean blockUploadEnabled = conf.getBoolean(FAST_UPLOAD, true); if (!blockUploadEnabled) { @@ -830,31 +841,87 @@ protected URI canonicalizeUri(URI rawUri) { * @param f the file name to open * @param bufferSize the size of the buffer to be used. */ + @Retries.RetryTranslated public FSDataInputStream open(Path f, int bufferSize) throws IOException { + return open(f, Optional.empty()); + } + + /** + * Opens an FSDataInputStream at the indicated Path. 
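For reference, here is a client-side sketch (not part of the patch) of a plain, non-select `openFile()` call which exercises exactly the two standard keys listed above, the seek policy and the readahead range. The bucket and object path are made up for illustration.

```java
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.impl.FutureIOSupport;

public final class OpenFileSketch {
  private OpenFileSketch() {
  }

  /** Open a file for random IO with a 1MB readahead. */
  public static FSDataInputStream openForRandomIO(Configuration conf)
      throws IOException {
    Path path = new Path("s3a://example-bucket/data/records.orc");
    FileSystem fs = path.getFileSystem(conf);
    return FutureIOSupport.awaitFuture(
        fs.openFile(path)
            .opt("fs.s3a.experimental.input.fadvise", "random")
            .opt("fs.s3a.readahead.range", "1048576")
            .build());
  }
}
```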
+ * @param path the file to open + * @param options configuration options if opened with the builder API. + * @throws IOException IO failure. + */ + @Retries.RetryTranslated + private FSDataInputStream open( + final Path path, + final Optional options) + throws IOException { + entryPoint(INVOCATION_OPEN); - LOG.debug("Opening '{}' for reading; input policy = {}", f, inputPolicy); - final FileStatus fileStatus = getFileStatus(f); + final FileStatus fileStatus = getFileStatus(path); if (fileStatus.isDirectory()) { - throw new FileNotFoundException("Can't open " + f + throw new FileNotFoundException("Can't open " + path + " because it is a directory"); } + S3AReadOpContext readContext; + if (options.isPresent()) { + Configuration o = options.get(); + // normal path. Open the file with the chosen seek policy, if different + // from the normal one. + // and readahead. + S3AInputPolicy policy = S3AInputPolicy.getPolicy( + o.get(INPUT_FADVISE, inputPolicy.toString())); + long readAheadRange2 = o.getLong(READAHEAD_RANGE, readAhead); + readContext = createReadContext(fileStatus, policy, readAheadRange2); + } else { + readContext = createReadContext(fileStatus, inputPolicy, readAhead); + } + LOG.debug("Opening '{}'", readContext); + return new FSDataInputStream( - new S3AInputStream(new S3AReadOpContext(hasMetadataStore(), - invoker, - s3guardInvoker, - statistics, - instrumentation, - fileStatus), - new S3ObjectAttributes(bucket, - pathToKey(f), - getServerSideEncryptionAlgorithm(), - encryptionSecrets.getEncryptionKey()), + new S3AInputStream( + readContext, + createObjectAttributes(path), fileStatus.getLen(), - s3, - readAhead, - inputPolicy)); + s3)); + } + + /** + * Create the read context for reading from the referenced file, + * using FS state as well as the status. + * @param fileStatus file status. + * @param seekPolicy input policy for this operation + * @param readAheadRange readahead value. + * @return a context for read and select operations. + */ + private S3AReadOpContext createReadContext( + final FileStatus fileStatus, + final S3AInputPolicy seekPolicy, + final long readAheadRange) { + return new S3AReadOpContext(fileStatus.getPath(), + hasMetadataStore(), + invoker, + s3guardInvoker, + statistics, + instrumentation, + fileStatus, + seekPolicy, + readAheadRange); + } + + /** + * Create the attributes of an object for a get/select request. + * @param f path path of the request. + * @return attributes to use when building the query. + */ + private S3ObjectAttributes createObjectAttributes(final Path f) { + return new S3ObjectAttributes(bucket, + pathToKey(f), + getServerSideEncryptionAlgorithm(), + encryptionSecrets.getEncryptionKey()); } /** @@ -3549,6 +3616,10 @@ public boolean hasCapability(String capability) { // capability depends on FS configuration return isMagicCommitEnabled(); + case SelectConstants.S3_SELECT_CAPABILITY: + // select is only supported if enabled + return selectBinding.isEnabled(); + default: return false; } @@ -3576,4 +3647,104 @@ protected S3Guard.ITtlTimeProvider getTtlTimeProvider() { protected void setTtlTimeProvider(S3Guard.ITtlTimeProvider ttlTimeProvider) { this.ttlTimeProvider = ttlTimeProvider; } + + /** + * This is a proof of concept of a select API. + * Once a proper factory mechanism for opening files is added to the + * FileSystem APIs, this will be deleted without any warning. + * @param source path to source data + * @param expression select expression + * @param options request configuration from the builder. 
+ * @return the stream of the results + * @throws IOException IO failure + */ + @Retries.RetryTranslated + private FSDataInputStream select(final Path source, + final String expression, + final Configuration options) + throws IOException { + entryPoint(OBJECT_SELECT_REQUESTS); + requireSelectSupport(source); + final Path path = makeQualified(source); + // call getFileStatus(), which will look at S3Guard first, + // so the operation will fail if it is not there or S3Guard believes it has + // been deleted. + // validation of the file status are delegated to the binding. + final FileStatus fileStatus = getFileStatus(path); + + // readahead range can be dynamically set + long ra = options.getLong(READAHEAD_RANGE, readAhead); + // build and execute the request + return selectBinding.select( + createReadContext(fileStatus, inputPolicy, ra), + expression, + options, + generateSSECustomerKey(), + createObjectAttributes(path)); + } + + /** + * Verify the FS supports S3 Select. + * @param source source file. + * @throws UnsupportedOperationException if not. + */ + private void requireSelectSupport(final Path source) throws + UnsupportedOperationException { + if (!selectBinding.isEnabled()) { + throw new UnsupportedOperationException( + SelectConstants.SELECT_UNSUPPORTED); + } + } + + /** + * Initiate the open or select operation. + * This is invoked from both the FileSystem and FileContext APIs + * @param path path to the file + * @param mandatoryKeys set of options declared as mandatory. + * @param options options set during the build sequence. + * @return a future which will evaluate to the opened/selected file. + * @throws IOException failure to resolve the link. + * @throws PathIOException operation is a select request but S3 select is + * disabled + * @throws IllegalArgumentException unknown mandatory key + */ + @Override + @Retries.RetryTranslated + public CompletableFuture openFileWithOptions( + final Path path, + final Set mandatoryKeys, + final Configuration options, + final int bufferSize) throws IOException { + String sql = options.get(SelectConstants.SELECT_SQL, null); + boolean isSelect = sql != null; + // choice of keys depends on open type + if (isSelect) { + rejectUnknownMandatoryKeys( + mandatoryKeys, + InternalSelectConstants.SELECT_OPTIONS, + "for " + path + " in S3 Select operation"); + } else { + rejectUnknownMandatoryKeys( + mandatoryKeys, + InternalConstants.STANDARD_OPENFILE_KEYS, + "for " + path + " in non-select file I/O"); + } + CompletableFuture result = new CompletableFuture<>(); + if (!isSelect) { + // normal path. + unboundedThreadPool.submit(() -> + LambdaUtils.eval(result, + () -> open(path, Optional.of(options)))); + } else { + // it is a select statement. 
+ // fail fast if the method is not present + requireSelectSupport(path); + // submit the query + unboundedThreadPool.submit(() -> + LambdaUtils.eval(result, + () -> select(path, sql, options))); + } + return result; + } + } diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInputStream.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInputStream.java index 68f98e4abe..ccc86d03a5 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInputStream.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInputStream.java @@ -18,6 +18,8 @@ package org.apache.hadoop.fs.s3a; +import javax.annotation.Nullable; + import com.amazonaws.services.s3.AmazonS3; import com.amazonaws.services.s3.model.GetObjectRequest; import com.amazonaws.services.s3.model.S3Object; @@ -60,6 +62,10 @@ @InterfaceAudience.Private @InterfaceStability.Evolving public class S3AInputStream extends FSInputStream implements CanSetReadahead { + + public static final String E_NEGATIVE_READAHEAD_VALUE + = "Negative readahead value"; + /** * This is the public position; the one set in {@link #seek(long)} * and returned in {@link #getPos()}. @@ -112,12 +118,11 @@ public class S3AInputStream extends FSInputStream implements CanSetReadahead { * @param s3Attributes object attributes from a HEAD request * @param contentLength length of content * @param client S3 client to use - * @param readahead readahead bytes - * @param inputPolicy IO policy */ - public S3AInputStream(S3AReadOpContext ctx, S3ObjectAttributes s3Attributes, - long contentLength, AmazonS3 client, long readahead, - S3AInputPolicy inputPolicy) { + public S3AInputStream(S3AReadOpContext ctx, + S3ObjectAttributes s3Attributes, + long contentLength, + AmazonS3 client) { Preconditions.checkArgument(isNotEmpty(s3Attributes.getBucket()), "No Bucket"); Preconditions.checkArgument(isNotEmpty(s3Attributes.getKey()), "No Key"); @@ -133,8 +138,8 @@ public S3AInputStream(S3AReadOpContext ctx, S3ObjectAttributes s3Attributes, this.serverSideEncryptionAlgorithm = s3Attributes.getServerSideEncryptionAlgorithm(); this.serverSideEncryptionKey = s3Attributes.getServerSideEncryptionKey(); - setInputPolicy(inputPolicy); - setReadahead(readahead); + setInputPolicy(ctx.getInputPolicy()); + setReadahead(ctx.getReadahead()); } /** @@ -179,7 +184,7 @@ private synchronized void reopen(String reason, long targetPos, long length, } String text = String.format("Failed to %s %s at %d", (opencount == 0 ? "open" : "re-open"), uri, targetPos); - S3Object object = context.getReadInvoker().once(text, uri, + S3Object object = Invoker.once(text, uri, () -> client.getObject(request)); wrappedStream = object.getObjectContent(); contentRangeStart = targetPos; @@ -722,12 +727,7 @@ public S3AInstrumentation.InputStreamStatistics getS3AStreamStatistics() { @Override public synchronized void setReadahead(Long readahead) { - if (readahead == null) { - this.readahead = Constants.DEFAULT_READAHEAD_RANGE; - } else { - Preconditions.checkArgument(readahead >= 0, "Negative readahead value"); - this.readahead = readahead; - } + this.readahead = validateReadahead(readahead); } /** @@ -780,4 +780,19 @@ static long calculateRequestLimit( return rangeLimit; } + /** + * from a possibly null Long value, return a valid + * readahead. + * @param readahead new readahead + * @return a natural number. + * @throws IllegalArgumentException if the range is invalid. 
+ */ + public static long validateReadahead(@Nullable Long readahead) { + if (readahead == null) { + return Constants.DEFAULT_READAHEAD_RANGE; + } else { + Preconditions.checkArgument(readahead >= 0, E_NEGATIVE_READAHEAD_VALUE); + return readahead; + } + } } diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInstrumentation.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInstrumentation.java index 78ba47d10b..17c5aff9af 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInstrumentation.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AInstrumentation.java @@ -160,6 +160,7 @@ public class S3AInstrumentation implements Closeable, MetricsSource { OBJECT_PUT_BYTES, OBJECT_PUT_REQUESTS, OBJECT_PUT_REQUESTS_COMPLETED, + OBJECT_SELECT_REQUESTS, STREAM_WRITE_FAILURES, STREAM_WRITE_BLOCK_UPLOADS, STREAM_WRITE_BLOCK_UPLOADS_COMMITTED, @@ -550,7 +551,7 @@ public void decrementGauge(Statistic op, long count) { * Create a stream input statistics instance. * @return the new instance */ - InputStreamStatistics newInputStreamStatistics() { + public InputStreamStatistics newInputStreamStatistics() { return new InputStreamStatistics(); } diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AOpContext.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AOpContext.java index fba39b9a5f..553d02fb76 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AOpContext.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AOpContext.java @@ -84,4 +84,29 @@ public S3AOpContext(boolean isS3GuardEnabled, Invoker invoker, dstFileStatus); } + public boolean isS3GuardEnabled() { + return isS3GuardEnabled; + } + + public Invoker getInvoker() { + return invoker; + } + + @Nullable + public FileSystem.Statistics getStats() { + return stats; + } + + public S3AInstrumentation getInstrumentation() { + return instrumentation; + } + + @Nullable + public Invoker getS3guardInvoker() { + return s3guardInvoker; + } + + public FileStatus getDstFileStatus() { + return dstFileStatus; + } } diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AReadOpContext.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AReadOpContext.java index 220cd0d8a0..73c219498f 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AReadOpContext.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AReadOpContext.java @@ -20,29 +20,69 @@ import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; import javax.annotation.Nullable; +import com.google.common.base.Preconditions; + +import static com.google.common.base.Preconditions.checkNotNull; + /** * Read-specific operation context struct. */ public class S3AReadOpContext extends S3AOpContext { - public S3AReadOpContext(boolean isS3GuardEnabled, Invoker invoker, - Invoker s3guardInvoker, @Nullable FileSystem.Statistics stats, - S3AInstrumentation instrumentation, FileStatus dstFileStatus) { + + /** + * Path of read. + */ + private final Path path; + + /** + * Initial input policy of the stream. + */ + private final S3AInputPolicy inputPolicy; + + /** + * Readahead for GET operations/skip, etc. + */ + private final long readahead; + + /** + * Instantiate. + * @param path path of read + * @param isS3GuardEnabled true iff S3Guard is enabled. 
+ * @param invoker invoker for normal retries. + * @param s3guardInvoker S3Guard-specific retry invoker. + * @param stats statistics (may be null) + * @param instrumentation FS instrumentation + * @param dstFileStatus target file status + * @param inputPolicy the input policy + * @param readahead readahead for GET operations/skip, etc. + */ + public S3AReadOpContext( + final Path path, + boolean isS3GuardEnabled, + Invoker invoker, + Invoker s3guardInvoker, + @Nullable FileSystem.Statistics stats, + S3AInstrumentation instrumentation, + FileStatus dstFileStatus, + S3AInputPolicy inputPolicy, + final long readahead) { super(isS3GuardEnabled, invoker, s3guardInvoker, stats, instrumentation, dstFileStatus); - } - - public S3AReadOpContext(boolean isS3GuardEnabled, Invoker invoker, - @Nullable FileSystem.Statistics stats, S3AInstrumentation instrumentation, - FileStatus dstFileStatus) { - super(isS3GuardEnabled, invoker, stats, instrumentation, dstFileStatus); + this.path = checkNotNull(path); + Preconditions.checkArgument(readahead >= 0, + "invalid readahead %d", readahead); + this.inputPolicy = checkNotNull(inputPolicy); + this.readahead = readahead; } /** - * Get invoker to use for read operations. When S3Guard is enabled we use - * the S3Guard invoker, which deals with things like FileNotFoundException + * Get invoker to use for read operations. + * When S3Guard is enabled we use the S3Guard invoker, + * which deals with things like FileNotFoundException * differently. * @return invoker to use for read codepaths */ @@ -53,4 +93,39 @@ public Invoker getReadInvoker() { return invoker; } } + + /** + * Get the path of this read. + * @return path. + */ + public Path getPath() { + return path; + } + + /** + * Get the IO policy. + * @return the initial input policy. + */ + public S3AInputPolicy getInputPolicy() { + return inputPolicy; + } + + /** + * Get the readahead for this operation. + * @return a value >= 0 + */ + public long getReadahead() { + return readahead; + } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder( + "S3AReadOpContext{"); + sb.append("path=").append(path); + sb.append(", inputPolicy=").append(inputPolicy); + sb.append(", readahead=").append(readahead); + sb.append('}'); + return sb.toString(); + } } diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AUtils.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AUtils.java index 6182b43cea..8428831bea 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AUtils.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AUtils.java @@ -83,6 +83,7 @@ import java.util.Set; import java.util.concurrent.ExecutionException; +import static org.apache.commons.lang3.StringUtils.isEmpty; import static org.apache.hadoop.fs.s3a.Constants.*; /** @@ -250,6 +251,12 @@ public static IOException translateException(@Nullable String operation, ioe.initCause(ase); break; + // method not allowed; seen on S3 Select. + // treated as a bad request + case 405: + ioe = new AWSBadRequestException(message, s3Exception); + break; + // out of range. This may happen if an object is overwritten with // a shorter one while it is being read. case 416: @@ -943,7 +950,7 @@ private static String getPassword(Configuration conf, String key, String val, String defVal) throws IOException { - return StringUtils.isEmpty(val) + return isEmpty(val) ? 
lookupPassword(conf, key, defVal) : val; } diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ObjectAttributes.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ObjectAttributes.java index d1bff8a054..d67e3e1e8c 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ObjectAttributes.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3ObjectAttributes.java @@ -18,19 +18,24 @@ package org.apache.hadoop.fs.s3a; +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; + /** * This class is only a holder for bucket, key, SSE Algorithm and SSE key - * attributes. It is only used in {@link S3AInputStream} + * attributes. It is used in {@link S3AInputStream} and the select equivalent. * as a way to reduce parameters being passed * to the constructor of such class. */ -class S3ObjectAttributes { - private String bucket; - private String key; - private S3AEncryptionMethods serverSideEncryptionAlgorithm; - private String serverSideEncryptionKey; +@InterfaceAudience.Private +@InterfaceStability.Evolving +public class S3ObjectAttributes { + private final String bucket; + private final String key; + private final S3AEncryptionMethods serverSideEncryptionAlgorithm; + private final String serverSideEncryptionKey; - S3ObjectAttributes( + public S3ObjectAttributes( String bucket, String key, S3AEncryptionMethods serverSideEncryptionAlgorithm, @@ -41,19 +46,19 @@ class S3ObjectAttributes { this.serverSideEncryptionKey = serverSideEncryptionKey; } - String getBucket() { + public String getBucket() { return bucket; } - String getKey() { + public String getKey() { return key; } - S3AEncryptionMethods getServerSideEncryptionAlgorithm() { + public S3AEncryptionMethods getServerSideEncryptionAlgorithm() { return serverSideEncryptionAlgorithm; } - String getServerSideEncryptionKey() { + public String getServerSideEncryptionKey() { return serverSideEncryptionKey; } } diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Statistic.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Statistic.java index 10ae1db0d8..6f792860d6 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Statistic.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Statistic.java @@ -102,6 +102,8 @@ public enum Statistic { OBJECT_PUT_BYTES("object_put_bytes", "number of bytes uploaded"), OBJECT_PUT_BYTES_PENDING("object_put_bytes_pending", "number of bytes queued for upload/being actively uploaded"), + OBJECT_SELECT_REQUESTS("object_select_requests", + "Count of S3 Select requests issued"), STREAM_ABORTED("stream_aborted", "Count of times the TCP stream was aborted"), STREAM_BACKWARD_SEEK_OPERATIONS("stream_backward_seek_operations", diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/WriteOperationHelper.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/WriteOperationHelper.java index a5f68179d9..fcc16a16b7 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/WriteOperationHelper.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/WriteOperationHelper.java @@ -26,6 +26,7 @@ import java.util.List; import java.util.concurrent.atomic.AtomicInteger; +import com.amazonaws.services.s3.model.AmazonS3Exception; import com.amazonaws.services.s3.model.CompleteMultipartUploadRequest; import 
com.amazonaws.services.s3.model.CompleteMultipartUploadResult; import com.amazonaws.services.s3.model.InitiateMultipartUploadRequest; @@ -34,6 +35,8 @@ import com.amazonaws.services.s3.model.PartETag; import com.amazonaws.services.s3.model.PutObjectRequest; import com.amazonaws.services.s3.model.PutObjectResult; +import com.amazonaws.services.s3.model.SelectObjectContentRequest; +import com.amazonaws.services.s3.model.SelectObjectContentResult; import com.amazonaws.services.s3.model.UploadPartRequest; import com.amazonaws.services.s3.model.UploadPartResult; import com.amazonaws.services.s3.transfer.model.UploadResult; @@ -45,17 +48,19 @@ import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.s3a.commit.DurationInfo; +import org.apache.hadoop.fs.s3a.select.SelectBinding; import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Preconditions.checkNotNull; import static org.apache.hadoop.fs.s3a.Invoker.*; /** - * Helper for low-level operations against an S3 Bucket for writing data - * and creating and committing pending writes. + * Helper for low-level operations against an S3 Bucket for writing data, + * creating and committing pending writes, and other S3-layer operations. *

 * It hides direct access to the S3 API
- * and is a location where the object upload process can be evolved/enhanced.
+ * and is a location where the object operations can be evolved/enhanced.
 * <p>
 * Features
 * <ul>
@@ -65,8 +70,10 @@
 *   errors.</li>
 *   <li>Callbacks to let the FS know of events in the output stream
 *   upload process.</li>
+ *   <li>Other low-level access to S3 functions, for private use.</li>
 *   <li>Failure handling, including converting exceptions to IOEs.</li>
 *   <li>Integration with instrumentation and S3Guard.</li>
+ *   <li>Evolution to add more low-level operations, such as S3 select.</li>
 * </ul>
    * * This API is for internal use only. @@ -76,9 +83,24 @@ public class WriteOperationHelper { private static final Logger LOG = LoggerFactory.getLogger(WriteOperationHelper.class); + + /** + * Owning filesystem. + */ private final S3AFileSystem owner; + + /** + * Invoker for operations; uses the S3A retry policy and calls int + * {@link #operationRetried(String, Exception, int, boolean)} on retries. + */ private final Invoker invoker; + /** Configuration of the owner. This is a reference, not a copy. */ + private final Configuration conf; + + /** Bucket of the owner FS. */ + private final String bucket; + /** * Constructor. * @param owner owner FS creating the helper @@ -89,6 +111,8 @@ protected WriteOperationHelper(S3AFileSystem owner, Configuration conf) { this.owner = owner; this.invoker = new Invoker(new S3ARetryPolicy(conf), this::operationRetried); + this.conf = conf; + bucket = owner.getBucket(); } /** @@ -189,7 +213,7 @@ public ObjectMetadata newObjectMetadata(long length) { public String initiateMultiPartUpload(String destKey) throws IOException { LOG.debug("Initiating Multipart upload to {}", destKey); final InitiateMultipartUploadRequest initiateMPURequest = - new InitiateMultipartUploadRequest(owner.getBucket(), + new InitiateMultipartUploadRequest(bucket, destKey, newObjectMetadata(-1)); initiateMPURequest.setCannedACL(owner.getCannedACL()); @@ -231,7 +255,7 @@ private CompleteMultipartUploadResult finalizeMultipartUpload( // attempt to sort an unmodifiable list. CompleteMultipartUploadResult result = owner.getAmazonS3Client().completeMultipartUpload( - new CompleteMultipartUploadRequest(owner.getBucket(), + new CompleteMultipartUploadRequest(bucket, destKey, uploadId, new ArrayList<>(partETags))); @@ -381,7 +405,7 @@ public UploadPartRequest newUploadPartRequest( LOG.debug("Creating part upload request for {} #{} size {}", uploadId, partNumber, size); UploadPartRequest request = new UploadPartRequest() - .withBucketName(owner.getBucket()) + .withBucketName(bucket) .withKey(destKey) .withUploadId(uploadId) .withPartNumber(partNumber) @@ -409,7 +433,7 @@ public UploadPartRequest newUploadPartRequest( @Override public String toString() { final StringBuilder sb = new StringBuilder( - "WriteOperationHelper {bucket=").append(owner.getBucket()); + "WriteOperationHelper {bucket=").append(bucket); sb.append('}'); return sb.toString(); } @@ -478,4 +502,71 @@ public UploadPartResult uploadPart(UploadPartRequest request) () -> owner.uploadPart(request)); } + /** + * Get the configuration of this instance; essentially the owning + * filesystem configuration. + * @return the configuration. + */ + public Configuration getConf() { + return conf; + } + + /** + * Create a S3 Select request for the destination path. + * This does not build the query. + * @param path pre-qualified path for query + * @return the request + */ + public SelectObjectContentRequest newSelectRequest(Path path) { + SelectObjectContentRequest request = new SelectObjectContentRequest(); + request.setBucketName(bucket); + request.setKey(owner.pathToKey(path)); + return request; + } + + /** + * Execute an S3 Select operation. + * On a failure, the request is only logged at debug to avoid the + * select exception being printed. + * @param source source for selection + * @param request Select request to issue. 
+ * @param action the action for use in exception creation + * @return response + * @throws IOException failure + */ + @Retries.RetryTranslated + public SelectObjectContentResult select( + final Path source, + final SelectObjectContentRequest request, + final String action) + throws IOException { + String bucketName = request.getBucketName(); + Preconditions.checkArgument(bucket.equals(bucketName), + "wrong bucket: %s", bucketName); + if (LOG.isDebugEnabled()) { + LOG.debug("Initiating select call {} {}", + source, request.getExpression()); + LOG.debug(SelectBinding.toString(request)); + } + return invoker.retry( + action, + source.toString(), + true, + () -> { + try (DurationInfo ignored = + new DurationInfo(LOG, "S3 Select operation")) { + try { + return owner.getAmazonS3Client().selectObjectContent(request); + } catch (AmazonS3Exception e) { + LOG.error("Failure of S3 Select request against {}", + source); + LOG.debug("S3 Select request against {}:\n{}", + source, + SelectBinding.toString(request), + e); + throw e; + } + } + }); + } } diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardTool.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardTool.java index 3751fdad50..e4fd06436a 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardTool.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardTool.java @@ -53,6 +53,7 @@ import org.apache.hadoop.fs.s3a.S3AUtils; import org.apache.hadoop.fs.s3a.auth.delegation.S3ADelegationTokens; import org.apache.hadoop.fs.s3a.commit.CommitConstants; +import org.apache.hadoop.fs.s3a.select.SelectTool; import org.apache.hadoop.fs.shell.CommandFormat; import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.util.ExitUtil; @@ -90,14 +91,15 @@ public abstract class S3GuardTool extends Configured implements Tool { "\t" + Uploads.NAME + " - " + Uploads.PURPOSE + "\n" + "\t" + Diff.NAME + " - " + Diff.PURPOSE + "\n" + "\t" + Prune.NAME + " - " + Prune.PURPOSE + "\n" + - "\t" + SetCapacity.NAME + " - " +SetCapacity.PURPOSE + "\n"; + "\t" + SetCapacity.NAME + " - " + SetCapacity.PURPOSE + "\n" + + "\t" + SelectTool.NAME + " - " + SelectTool.PURPOSE + "\n"; private static final String DATA_IN_S3_IS_PRESERVED = "(all data in S3 is preserved)"; public static final String E_NO_METASTORE_OR_FILESYSTEM = "No metastore or filesystem specified"; - abstract public String getUsage(); + public abstract String getUsage(); // Exit codes static final int SUCCESS = EXIT_SUCCESS; @@ -144,19 +146,19 @@ protected S3GuardTool(Configuration conf, String...opts) { /** * Return sub-command name. */ - abstract String getName(); + public abstract String getName(); /** * Parse DynamoDB region from either -m option or a S3 path. * - * This function should only be called from {@link Init} or - * {@link Destroy}. + * This function should only be called from {@link S3GuardTool.Init} or + * {@link S3GuardTool.Destroy}. * * @param paths remaining parameters from CLI. * @throws IOException on I/O errors. 
* @throws ExitUtil.ExitException on validation errors */ - void parseDynamoDBRegion(List paths) throws IOException { + protected void parseDynamoDBRegion(List paths) throws IOException { Configuration conf = getConf(); String fromCli = getCommandFormat().getOptValue(REGION_FLAG); String fromConf = conf.get(S3GUARD_DDB_REGION_KEY); @@ -269,7 +271,8 @@ protected void checkBucketNameOrDDBTableNameProvided(List paths) { * @param forceCreate override the auto-creation setting to true. * @return a initialized metadata store. */ - MetadataStore initMetadataStore(boolean forceCreate) throws IOException { + protected MetadataStore initMetadataStore(boolean forceCreate) + throws IOException { if (getStore() != null) { return getStore(); } @@ -334,7 +337,7 @@ MetadataStore initMetadataStore(boolean forceCreate) throws IOException { * @throws IOException failure to init filesystem * @throws ExitUtil.ExitException if the FS is not an S3A FS */ - void initS3AFileSystem(String path) throws IOException { + protected void initS3AFileSystem(String path) throws IOException { URI uri = toUri(path); // Make sure that S3AFileSystem does not hold an actual MetadataStore // implementation. @@ -367,7 +370,7 @@ void initS3AFileSystem(String path) throws IOException { * @param args command line arguments. * @return the position arguments from CLI. */ - List parseArgs(String[] args) { + protected List parseArgs(String[] args) { return getCommandFormat().parse(args, 1); } @@ -404,16 +407,16 @@ public final int run(String[] args) throws Exception { * * As well as returning an exit code, the implementations can choose to * throw an instance of {@link ExitUtil.ExitException} with their exit - * code set to the desired exit value. The exit code of auch an exception + * code set to the desired exit value. The exit code of such an exception * is used for the tool's exit code, and the stack trace only logged at * debug. * @param args argument list * @param out output stream * @return the exit code to return. * @throws Exception on any failure - * @throws ExitUtil.ExitException for an alternative clean exit */ - public abstract int run(String[] args, PrintStream out) throws Exception; + public abstract int run(String[] args, PrintStream out) throws Exception, + ExitUtil.ExitException; /** * Create the metadata store. 
@@ -448,7 +451,7 @@ static class Init extends S3GuardTool { } @Override - String getName() { + public String getName() { return NAME; } @@ -541,7 +544,7 @@ static class SetCapacity extends S3GuardTool { } @Override - String getName() { + public String getName() { return NAME; } @@ -613,7 +616,7 @@ static class Destroy extends S3GuardTool { } @Override - String getName() { + public String getName() { return NAME; } @@ -678,7 +681,7 @@ static class Import extends S3GuardTool { } @Override - String getName() { + public String getName() { return NAME; } @@ -810,7 +813,7 @@ static class Diff extends S3GuardTool { } @Override - String getName() { + public String getName() { return NAME; } @@ -1018,7 +1021,7 @@ void setMetadataStore(MetadataStore ms) { } @Override - String getName() { + public String getName() { return NAME; } @@ -1108,7 +1111,7 @@ static class BucketInfo extends S3GuardTool { } @Override - String getName() { + public String getName() { return NAME; } @@ -1290,7 +1293,7 @@ private enum Mode { LIST, EXPECT, ABORT }; } @Override - String getName() { + public String getName() { return NAME; } @@ -1457,7 +1460,7 @@ protected static URI toUri(String s3Path) { return uri; } - private static void printHelp(S3GuardTool tool) { + protected static void printHelp(S3GuardTool tool) { if (tool == null) { errorln("Usage: hadoop " + USAGE); errorln("\tperform S3Guard metadata store " + @@ -1469,11 +1472,11 @@ private static void printHelp(S3GuardTool tool) { errorln(COMMON_USAGE); } - private static void errorln() { + protected static void errorln() { System.err.println(); } - private static void errorln(String x) { + protected static void errorln(String x) { System.err.println(x); } @@ -1483,7 +1486,9 @@ private static void errorln(String x) { * @param format format string * @param args optional arguments */ - private static void println(PrintStream out, String format, Object... args) { + protected static void println(PrintStream out, + String format, + Object... args) { out.println(String.format(format, args)); } @@ -1523,8 +1528,7 @@ protected static ExitUtil.ExitException storeNotFound( */ protected static ExitUtil.ExitException invalidArgs( String format, Object...args) { - return new ExitUtil.ExitException(INVALID_ARGUMENT, - String.format(format, args)); + return exitException(INVALID_ARGUMENT, format, args); } /** @@ -1535,8 +1539,8 @@ protected static ExitUtil.ExitException invalidArgs( */ protected static ExitUtil.ExitException badState( String format, Object...args) { - return new ExitUtil.ExitException(E_BAD_STATE, - String.format(format, args)); + int exitCode = E_BAD_STATE; + return exitException(exitCode, format, args); } /** @@ -1547,7 +1551,22 @@ protected static ExitUtil.ExitException badState( */ protected static ExitUtil.ExitException userAborted( String format, Object...args) { - return new ExitUtil.ExitException(ERROR, String.format(format, args)); + return exitException(ERROR, format, args); + } + + /** + * Build a exception to throw with a formatted message. + * @param exitCode exit code to use + * @param format string format + * @param args optional arguments for the string + * @return a new exception to throw + */ + protected static ExitUtil.ExitException exitException( + final int exitCode, + final String format, + final Object... 
args) { + return new ExitUtil.ExitException(exitCode, + String.format(format, args)); } @@ -1607,6 +1626,11 @@ public static int run(Configuration conf, String...args) throws case Uploads.NAME: command = new Uploads(conf); break; + case SelectTool.NAME: + // the select tool is not technically a S3Guard tool, but it's on the CLI + // because this is the defacto S3 CLI. + command = new SelectTool(conf); + break; default: printHelp(null); throw new ExitUtil.ExitException(E_USAGE, diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/InternalSelectConstants.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/InternalSelectConstants.java new file mode 100644 index 0000000000..ae3dc0816d --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/InternalSelectConstants.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.select; + +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.fs.s3a.InternalConstants; + +import static org.apache.hadoop.fs.s3a.select.SelectConstants.*; + +/** + * Constants for internal use in the org.apache.hadoop.fs.s3a module itself. + * Please don't refer to these outside of this module & its tests. + * If you find you need to then either the code is doing something it + * should not, or these constants need to be uprated to being + * public and stable entries. + */ +@InterfaceAudience.Private +public final class InternalSelectConstants { + + private InternalSelectConstants() { + } + + /** + * An unmodifiable set listing the options + * supported in {@code openFile()}. + */ + public static final Set SELECT_OPTIONS; + + /* + * Build up the options, pulling in the standard set too. + */ + static { + // when adding to this, please keep in alphabetical order after the + // common options and the SQL. 
+ HashSet options = new HashSet<>(Arrays.asList( + SELECT_SQL, + SELECT_ERRORS_INCLUDE_SQL, + SELECT_INPUT_COMPRESSION, + SELECT_INPUT_FORMAT, + SELECT_OUTPUT_FORMAT, + CSV_INPUT_COMMENT_MARKER, + CSV_INPUT_HEADER, + CSV_INPUT_INPUT_FIELD_DELIMITER, + CSV_INPUT_QUOTE_CHARACTER, + CSV_INPUT_QUOTE_ESCAPE_CHARACTER, + CSV_INPUT_RECORD_DELIMITER, + CSV_OUTPUT_FIELD_DELIMITER, + CSV_OUTPUT_QUOTE_CHARACTER, + CSV_OUTPUT_QUOTE_ESCAPE_CHARACTER, + CSV_OUTPUT_QUOTE_FIELDS, + CSV_OUTPUT_RECORD_DELIMITER + )); + options.addAll(InternalConstants.STANDARD_OPENFILE_KEYS); + SELECT_OPTIONS = Collections.unmodifiableSet(options); + } +} diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/SelectBinding.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/SelectBinding.java new file mode 100644 index 0000000000..ff39b9ad95 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/SelectBinding.java @@ -0,0 +1,431 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.select; + +import java.io.IOException; +import java.util.Locale; +import java.util.Optional; + +import com.amazonaws.services.s3.model.CSVInput; +import com.amazonaws.services.s3.model.CSVOutput; +import com.amazonaws.services.s3.model.ExpressionType; +import com.amazonaws.services.s3.model.InputSerialization; +import com.amazonaws.services.s3.model.OutputSerialization; +import com.amazonaws.services.s3.model.QuoteFields; +import com.amazonaws.services.s3.model.SSECustomerKey; +import com.amazonaws.services.s3.model.SelectObjectContentRequest; +import com.google.common.base.Preconditions; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.commons.lang3.StringUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.PathIOException; +import org.apache.hadoop.fs.s3a.Retries; +import org.apache.hadoop.fs.s3a.S3AReadOpContext; +import org.apache.hadoop.fs.s3a.S3ObjectAttributes; +import org.apache.hadoop.fs.s3a.WriteOperationHelper; + +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.commons.lang3.StringUtils.isNotEmpty; +import static org.apache.hadoop.fs.s3a.select.SelectConstants.*; + +/** + * Class to do the S3 select binding and build a select request from the + * supplied arguments/configuration. + * + * This class is intended to be instantiated by the owning S3AFileSystem + * instance to handle the construction of requests: IO is still done exclusively + * in the filesystem. 
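To show the path by which requests reach this binding, here is a hedged sketch (not part of the patch) of the user-facing call: the SQL statement arrives as a mandatory `openFile()` option, so a store without select support rejects the request instead of silently returning the whole object. The select option names, CSV layout and bucket are assumptions for illustration.

```java
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.impl.FutureIOSupport;

public final class SelectSketch {
  private SelectSketch() {
  }

  /** Run a SELECT against a CSV object and return the result stream. */
  public static FSDataInputStream selectRows(Configuration conf)
      throws IOException {
    Path csv = new Path("s3a://example-bucket/data/records.csv");
    FileSystem fs = csv.getFileSystem(conf);
    return FutureIOSupport.awaitFuture(
        fs.openFile(csv)
            // mandatory: fails fast if S3 Select is disabled/unsupported
            .must("fs.s3a.select.sql",
                "SELECT s.name FROM S3OBJECT s WHERE s.age > '40'")
            // treat the first CSV line as a header row
            .must("fs.s3a.select.input.csv.header", "use")
            .build());
  }
}
```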
+ */ +public class SelectBinding { + + static final Logger LOG = + LoggerFactory.getLogger(SelectBinding.class); + + /** Operations on the store. */ + private final WriteOperationHelper operations; + + /** Is S3 Select enabled? */ + private final boolean enabled; + private final boolean errorsIncludeSql; + + /** + * Constructor. + * @param operations the owning FS's write operation helper. + */ + public SelectBinding(final WriteOperationHelper operations) { + this.operations = checkNotNull(operations); + Configuration conf = getConf(); + this.enabled = conf.getBoolean(FS_S3A_SELECT_ENABLED, true); + this.errorsIncludeSql = conf.getBoolean(SELECT_ERRORS_INCLUDE_SQL, false); + } + + Configuration getConf() { + return operations.getConf(); + } + + /** + * Is the service supported? + * @return true iff select is enabled. + */ + public boolean isEnabled() { + return enabled; + } + + /** + * Build and execute a select request. + * @param readContext the read context, which includes the source path. + * @param expression the SQL expression. + * @param builderOptions query options + * @param sseKey optional SSE customer key + * @param objectAttributes object attributes from a HEAD request + * @return an FSDataInputStream whose wrapped stream is a SelectInputStream + * @throws IllegalArgumentException argument failure + * @throws IOException failure building, validating or executing the request. + * @throws PathIOException source path is a directory. + */ + @Retries.RetryTranslated + public FSDataInputStream select( + final S3AReadOpContext readContext, + final String expression, + final Configuration builderOptions, + final Optional<SSECustomerKey> sseKey, + final S3ObjectAttributes objectAttributes) throws IOException { + + return new FSDataInputStream( + executeSelect(readContext, + objectAttributes, + builderOptions, + buildSelectRequest( + readContext.getPath(), + expression, + builderOptions, + sseKey))); + } + + /** + * Build a select request. + * @param path source path. + * @param expression the SQL expression. + * @param builderOptions config to extract other query options from + * @param sseKey optional SSE customer key + * @return the request to serve + * @throws IllegalArgumentException argument failure + * @throws IOException problem building/validating the request + */ + public SelectObjectContentRequest buildSelectRequest( + final Path path, + final String expression, + final Configuration builderOptions, + final Optional<SSECustomerKey> sseKey) + throws IOException { + Preconditions.checkState(isEnabled(), + "S3 Select is not enabled for %s", path); + + SelectObjectContentRequest request = operations.newSelectRequest(path); + buildRequest(request, expression, builderOptions); + // optionally set an SSE key in the input + sseKey.ifPresent(request::withSSECustomerKey); + return request; + } + + /** + * Execute the select request. + * @param readContext read context + * @param objectAttributes object attributes from a HEAD request + * @param builderOptions the options which came in from the openFile builder. + * @param request the built up select request. + * @return a SelectInputStream + * @throws IOException failure + * @throws PathIOException source path is a directory.
+ */ + @Retries.RetryTranslated + private SelectInputStream executeSelect( + final S3AReadOpContext readContext, + final S3ObjectAttributes objectAttributes, + final Configuration builderOptions, + final SelectObjectContentRequest request) throws IOException { + + Path path = readContext.getPath(); + if (readContext.getDstFileStatus().isDirectory()) { + throw new PathIOException(path.toString(), + "Can't select " + path + + " because it is a directory"); + } + boolean sqlInErrors = builderOptions.getBoolean(SELECT_ERRORS_INCLUDE_SQL, + errorsIncludeSql); + String expression = request.getExpression(); + final String errorText = sqlInErrors ? expression : "Select"; + if (sqlInErrors) { + LOG.info("Issuing SQL request {}", expression); + } + return new SelectInputStream(readContext, + objectAttributes, + operations.select(path, request, errorText)); + } + + /** + * Build the select request from the configuration built up + * in {@code S3AFileSystem.openFile(Path)} and the default + * options in the cluster configuration. + * + * Options are picked up in the following order. + *
+ * <ol> + *   <li>Options in {@code openFileOptions}.</li> + *   <li>Options in the owning filesystem configuration.</li> + *   <li>The default values in {@link SelectConstants}.</li> + * </ol>
    + * + * @param request request to build up + * @param expression SQL expression + * @param builderOptions the options which came in from the openFile builder. + * @throws IllegalArgumentException if an option is somehow invalid. + * @throws IOException if an option is somehow invalid. + */ + void buildRequest( + final SelectObjectContentRequest request, + final String expression, + final Configuration builderOptions) + throws IllegalArgumentException, IOException { + Preconditions.checkArgument(StringUtils.isNotEmpty(expression), + "No expression provided in parameter " + SELECT_SQL); + + final Configuration ownerConf = operations.getConf(); + + + String inputFormat = builderOptions.get(SELECT_INPUT_FORMAT, + SELECT_FORMAT_CSV).toLowerCase(Locale.ENGLISH); + Preconditions.checkArgument(SELECT_FORMAT_CSV.equals(inputFormat), + "Unsupported input format %s", inputFormat); + String outputFormat = builderOptions.get(SELECT_OUTPUT_FORMAT, + SELECT_FORMAT_CSV) + .toLowerCase(Locale.ENGLISH); + Preconditions.checkArgument(SELECT_FORMAT_CSV.equals(outputFormat), + "Unsupported output format %s", outputFormat); + + request.setExpressionType(ExpressionType.SQL); + request.setExpression(expandBackslashChars(expression)); + + InputSerialization inputSerialization = buildCsvInputRequest(ownerConf, + builderOptions); + String compression = opt(builderOptions, + ownerConf, + SELECT_INPUT_COMPRESSION, + COMPRESSION_OPT_NONE, + true).toUpperCase(Locale.ENGLISH); + if (isNotEmpty(compression)) { + inputSerialization.setCompressionType(compression); + } + request.setInputSerialization(inputSerialization); + + request.setOutputSerialization(buildCSVOutput(ownerConf, builderOptions)); + + } + + /** + * Build the CSV input request. + * @param ownerConf FS owner configuration + * @param builderOptions options on the specific request + * @return the constructed request + * @throws IllegalArgumentException argument failure + * @throws IOException validation failure + */ + public InputSerialization buildCsvInputRequest( + final Configuration ownerConf, + final Configuration builderOptions) + throws IllegalArgumentException, IOException { + + String headerInfo = opt(builderOptions, + ownerConf, + CSV_INPUT_HEADER, + CSV_INPUT_HEADER_OPT_DEFAULT, + true).toUpperCase(Locale.ENGLISH); + String commentMarker = xopt(builderOptions, + ownerConf, + CSV_INPUT_COMMENT_MARKER, + CSV_INPUT_COMMENT_MARKER_DEFAULT); + String fieldDelimiter = xopt(builderOptions, + ownerConf, + CSV_INPUT_INPUT_FIELD_DELIMITER, + CSV_INPUT_FIELD_DELIMITER_DEFAULT); + String recordDelimiter = xopt(builderOptions, + ownerConf, + CSV_INPUT_RECORD_DELIMITER, + CSV_INPUT_RECORD_DELIMITER_DEFAULT); + String quoteCharacter = xopt(builderOptions, + ownerConf, + CSV_INPUT_QUOTE_CHARACTER, + CSV_INPUT_QUOTE_CHARACTER_DEFAULT); + String quoteEscapeCharacter = xopt(builderOptions, + ownerConf, + CSV_INPUT_QUOTE_ESCAPE_CHARACTER, + CSV_INPUT_QUOTE_ESCAPE_CHARACTER_DEFAULT); + + // CSV input + CSVInput csv = new CSVInput(); + csv.setFieldDelimiter(fieldDelimiter); + csv.setRecordDelimiter(recordDelimiter); + csv.setComments(commentMarker); + csv.setQuoteCharacter(quoteCharacter); + if (StringUtils.isNotEmpty(quoteEscapeCharacter)) { + csv.setQuoteEscapeCharacter(quoteEscapeCharacter); + } + csv.setFileHeaderInfo(headerInfo); + + InputSerialization inputSerialization = new InputSerialization(); + inputSerialization.setCsv(csv); + + return inputSerialization; + + } + + /** + * Build CSV output for a request. 
+ * @param ownerConf FS owner configuration + * @param builderOptions options on the specific request + * @return the constructed request + * @throws IllegalArgumentException argument failure + * @throws IOException validation failure + */ + public OutputSerialization buildCSVOutput( + final Configuration ownerConf, + final Configuration builderOptions) + throws IllegalArgumentException, IOException { + String fieldDelimiter = xopt(builderOptions, + ownerConf, + CSV_OUTPUT_FIELD_DELIMITER, + CSV_OUTPUT_FIELD_DELIMITER_DEFAULT); + String recordDelimiter = xopt(builderOptions, + ownerConf, + CSV_OUTPUT_RECORD_DELIMITER, + CSV_OUTPUT_RECORD_DELIMITER_DEFAULT); + String quoteCharacter = xopt(builderOptions, + ownerConf, + CSV_OUTPUT_QUOTE_CHARACTER, + CSV_OUTPUT_QUOTE_CHARACTER_DEFAULT); + String quoteEscapeCharacter = xopt(builderOptions, + ownerConf, + CSV_OUTPUT_QUOTE_ESCAPE_CHARACTER, + CSV_OUTPUT_QUOTE_ESCAPE_CHARACTER_DEFAULT); + String quoteFields = xopt(builderOptions, + ownerConf, + CSV_OUTPUT_QUOTE_FIELDS, + CSV_OUTPUT_QUOTE_FIELDS_ALWAYS).toUpperCase(Locale.ENGLISH); + + // output is CSV, always + OutputSerialization outputSerialization + = new OutputSerialization(); + CSVOutput csvOut = new CSVOutput(); + csvOut.setQuoteCharacter(quoteCharacter); + csvOut.setQuoteFields( + QuoteFields.fromValue(quoteFields)); + csvOut.setFieldDelimiter(fieldDelimiter); + csvOut.setRecordDelimiter(recordDelimiter); + if (!quoteEscapeCharacter.isEmpty()) { + csvOut.setQuoteEscapeCharacter(quoteEscapeCharacter); + } + + outputSerialization.setCsv(csvOut); + return outputSerialization; + } + + /** + * Stringify the given SelectObjectContentRequest, as its + * toString() operator doesn't. + * @param request request to convert to a string + * @return a string to print. Does not contain secrets. + */ + public static String toString(final SelectObjectContentRequest request) { + StringBuilder sb = new StringBuilder(); + sb.append("SelectObjectContentRequest{") + .append("bucket name=").append(request.getBucketName()) + .append("; key=").append(request.getKey()) + .append("; expressionType=").append(request.getExpressionType()) + .append("; expression=").append(request.getExpression()); + InputSerialization input = request.getInputSerialization(); + if (input != null) { + sb.append("; Input") + .append(input.toString()); + } else { + sb.append("; Input Serialization: none"); + } + OutputSerialization out = request.getOutputSerialization(); + if (out != null) { + sb.append("; Output") + .append(out.toString()); + } else { + sb.append("; Output Serialization: none"); + } + return sb.append("}").toString(); + } + + /** + * Resolve an option. + * @param builderOptions the options which came in from the openFile builder. + * @param fsConf configuration of the owning FS. + * @param base base option (no s3a: prefix) + * @param defVal default value. Must not be null. + * @param trim should the result be trimmed. + * @return the possibly trimmed value. + */ + static String opt(Configuration builderOptions, + Configuration fsConf, + String base, + String defVal, + boolean trim) { + String r = builderOptions.get(base, fsConf.get(base, defVal)); + return trim ? r.trim() : r; + } + + /** + * Get an option with backslash arguments transformed. + * These are not trimmed, so whitespace is significant. 
+ * @param selectOpts options in the select call + * @param fsConf filesystem conf + * @param base base option name + * @param defVal default value + * @return the transformed value + */ + static String xopt(Configuration selectOpts, + Configuration fsConf, + String base, + String defVal) { + return expandBackslashChars( + opt(selectOpts, fsConf, base, defVal, false)); + } + + /** + * Perform escaping. + * @param src source string. + * @return the replaced value + */ + static String expandBackslashChars(String src) { + return src.replace("\\n", "\n") + .replace("\\\"", "\"") + .replace("\\t", "\t") + .replace("\\r", "\r") + // backslash substitution must come last + .replace("\\\\", "\\"); + } + +} diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/SelectConstants.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/SelectConstants.java new file mode 100644 index 0000000000..d74411d2f9 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/SelectConstants.java @@ -0,0 +1,296 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.select; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; + +/** + * Options related to S3 Select. + * + * These options are set for the entire filesystem unless overridden + * as an option in the openFile() call. + */ +@InterfaceAudience.Public +@InterfaceStability.Unstable +public final class SelectConstants { + + public static final String SELECT_UNSUPPORTED = "S3 Select is not supported"; + + private SelectConstants() { + } + + public static final String FS_S3A_SELECT = "fs.s3a.select."; + + + /** + * This is the big SQL expression: {@value}. + * When used in an open() call, switch to a select operation. + * This is only used in the open call, never in a filesystem configuration. + */ + public static final String SELECT_SQL = FS_S3A_SELECT + "sql"; + + /** + * Does the FS Support S3 Select? + * Value: {@value}. + */ + public static final String S3_SELECT_CAPABILITY = "s3a:fs.s3a.select.sql"; + + /** + * Flag: is S3 select enabled? + * Value: {@value}. + */ + public static final String FS_S3A_SELECT_ENABLED = FS_S3A_SELECT + + "enabled"; + + /** + * Input format for data. + * Value: {@value}. + */ + public static final String SELECT_INPUT_FORMAT = + "fs.s3a.select.input.format"; + + /** + * Output format for data - that is, what the results are generated + * as. + * Value: {@value}. + */ + public static final String SELECT_OUTPUT_FORMAT = + "fs.s3a.select.output.format"; + + /** + * CSV as an input or output format: {@value}.
+ */ + public static final String SELECT_FORMAT_CSV = "csv"; + + /** + * JSON as an input or output format: {@value}. + */ + public static final String SELECT_FORMAT_JSON = "json"; + + /** + * Should Select errors include the SQL statement? + * It is easier to debug but a security risk if the exceptions + * ever get printed/logged and the query contains secrets. + */ + public static final String SELECT_ERRORS_INCLUDE_SQL = + FS_S3A_SELECT + "errors.include.sql"; + + /** + * How is the input compressed? This applies to all formats. + * Value: {@value}. + */ + public static final String SELECT_INPUT_COMPRESSION = FS_S3A_SELECT + + "input.compression"; + + /** + * No compression. + * Value: {@value}. + */ + public static final String COMPRESSION_OPT_NONE = "none"; + + /** + * Gzipped. + * Value: {@value}. + */ + public static final String COMPRESSION_OPT_GZIP = "gzip"; + + /** + * Prefix for all CSV input options. + * Value: {@value}. + */ + public static final String FS_S3A_SELECT_INPUT_CSV = + "fs.s3a.select.input.csv."; + + /** + * Prefix for all CSV output options. + * Value: {@value}. + */ + public static final String FS_S3A_SELECT_OUTPUT_CSV = + "fs.s3a.select.output.csv."; + + /** + * String which indicates the row is actually a comment. + * Value: {@value}. + */ + public static final String CSV_INPUT_COMMENT_MARKER = + FS_S3A_SELECT_INPUT_CSV + "comment.marker"; + + /** + * Default marker. + * Value: {@value}. + */ + public static final String CSV_INPUT_COMMENT_MARKER_DEFAULT = "#"; + + /** + * Record delimiter. CR, LF, etc. + * Value: {@value}. + */ + public static final String CSV_INPUT_RECORD_DELIMITER = + FS_S3A_SELECT_INPUT_CSV + "record.delimiter"; + + /** + * Default delimiter + * Value: {@value}. + */ + public static final String CSV_INPUT_RECORD_DELIMITER_DEFAULT = "\n"; + + /** + * Field delimiter. + * Value: {@value}. + */ + public static final String CSV_INPUT_INPUT_FIELD_DELIMITER = + FS_S3A_SELECT_INPUT_CSV + "field.delimiter"; + + /** + * Default field delimiter. + * Value: {@value}. + */ + public static final String CSV_INPUT_FIELD_DELIMITER_DEFAULT = ","; + + /** + * Quote Character. + * Value: {@value}. + */ + public static final String CSV_INPUT_QUOTE_CHARACTER = + FS_S3A_SELECT_INPUT_CSV + "quote.character"; + + /** + * Default Quote Character. + * Value: {@value}. + */ + public static final String CSV_INPUT_QUOTE_CHARACTER_DEFAULT = "\""; + + /** + * Character to escape quotes. + * If empty: no escaping. + * Value: {@value}. + */ + public static final String CSV_INPUT_QUOTE_ESCAPE_CHARACTER = + FS_S3A_SELECT_INPUT_CSV + "quote.escape.character"; + + /** + * Default quote escape character. + * Value: {@value}. + */ + public static final String CSV_INPUT_QUOTE_ESCAPE_CHARACTER_DEFAULT = "\\"; + + /** + * How should headers be used? + * Value: {@value}. + */ + public static final String CSV_INPUT_HEADER = + FS_S3A_SELECT_INPUT_CSV + "header"; + + /** + * No header: first row is data. + * Value: {@value}. + */ + public static final String CSV_HEADER_OPT_NONE = "none"; + + /** + * Ignore the header. + * Value: {@value}. + */ + public static final String CSV_HEADER_OPT_IGNORE = "ignore"; + + /** + * Use the header. + * Value: {@value}. + */ + public static final String CSV_HEADER_OPT_USE = "use"; + + /** + * Default header mode: {@value}. + */ + public static final String CSV_INPUT_HEADER_OPT_DEFAULT = + CSV_HEADER_OPT_IGNORE; + + /** + * Record delimiter. CR, LF, etc. + * Value: {@value}. 
+ */ + public static final String CSV_OUTPUT_RECORD_DELIMITER = + FS_S3A_SELECT_OUTPUT_CSV + "record.delimiter"; + + /** + * Default delimiter. + * Value: {@value}. + */ + public static final String CSV_OUTPUT_RECORD_DELIMITER_DEFAULT = "\n"; + + /** + * Field delimiter. + * Value: {@value}. + */ + public static final String CSV_OUTPUT_FIELD_DELIMITER = + FS_S3A_SELECT_OUTPUT_CSV + "field.delimiter"; + + /** + * Default field delimiter. + * Value: {@value}. + */ + public static final String CSV_OUTPUT_FIELD_DELIMITER_DEFAULT = ","; + + /** + * Quote Character. + * Value: {@value}. + */ + public static final String CSV_OUTPUT_QUOTE_CHARACTER = + FS_S3A_SELECT_OUTPUT_CSV + "quote.character"; + + /** + * Default Quote Character. + * Value: {@value}. + */ + public static final String CSV_OUTPUT_QUOTE_CHARACTER_DEFAULT = "\""; + + /** + * Should CSV fields be quoted? + * One of: ALWAYS, ASNEEDED. + * Value: {@value}. + */ + public static final String CSV_OUTPUT_QUOTE_FIELDS = + FS_S3A_SELECT_OUTPUT_CSV + "quote.fields"; + + /** + * Output quotation policy (default): {@value}. + */ + public static final String CSV_OUTPUT_QUOTE_FIELDS_ALWAYS = "always"; + + /** + * Output quotation policy: {@value}. + */ + public static final String CSV_OUTPUT_QUOTE_FIELDS_AS_NEEDED = "asneeded"; + + /** + * Character to escape quotes. + * If empty: no escaping. + * Value: {@value}. + */ + public static final String CSV_OUTPUT_QUOTE_ESCAPE_CHARACTER = + FS_S3A_SELECT_OUTPUT_CSV + "quote.escape.character"; + + /** + * Default quote escape character. + * Value: {@value}. + */ + public static final String CSV_OUTPUT_QUOTE_ESCAPE_CHARACTER_DEFAULT = ""; + +} diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/SelectInputStream.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/SelectInputStream.java new file mode 100644 index 0000000000..f4bd8d1170 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/SelectInputStream.java @@ -0,0 +1,457 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +package org.apache.hadoop.fs.s3a.select; + +import java.io.EOFException; +import java.io.IOException; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicLong; + +import com.amazonaws.AbortedException; +import com.amazonaws.services.s3.model.SelectObjectContentEvent; +import com.amazonaws.services.s3.model.SelectObjectContentEventVisitor; +import com.amazonaws.services.s3.model.SelectObjectContentResult; +import com.amazonaws.services.s3.model.SelectRecordsInputStream; +import com.google.common.base.Preconditions; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.fs.CanSetReadahead; +import org.apache.hadoop.fs.FSExceptionMessages; +import org.apache.hadoop.fs.FSInputStream; +import org.apache.hadoop.fs.PathIOException; +import org.apache.hadoop.fs.s3a.Retries; +import org.apache.hadoop.fs.s3a.S3AInstrumentation; +import org.apache.hadoop.fs.s3a.S3AReadOpContext; +import org.apache.hadoop.fs.s3a.S3ObjectAttributes; +import org.apache.hadoop.io.IOUtils; + +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.commons.lang3.StringUtils.isNotEmpty; +import static org.apache.hadoop.fs.s3a.Invoker.once; +import static org.apache.hadoop.fs.s3a.S3AInputStream.validateReadahead; + +/** + * An input stream for S3 Select return values. + * This is simply an end-to-end GET request, without any + * form of seek or recovery from connectivity failures. + * + * Currently only seek and positioned read operations on the current + * location are supported. + * + * The normal S3 input counters are updated by this stream. + */ +@InterfaceAudience.Private +@InterfaceStability.Unstable +public class SelectInputStream extends FSInputStream implements + CanSetReadahead { + + private static final Logger LOG = + LoggerFactory.getLogger(SelectInputStream.class); + + public static final String SEEK_UNSUPPORTED = "seek()"; + + /** + * Same set of arguments as for an S3AInputStream. + */ + private final S3ObjectAttributes objectAttributes; + + /** + * Tracks the current position. + */ + private AtomicLong pos = new AtomicLong(0); + + /** + * Closed flag. + */ + private final AtomicBoolean closed = new AtomicBoolean(false); + + /** + * Did the read complete successfully? + */ + private final AtomicBoolean completedSuccessfully = new AtomicBoolean(false); + + /** + * Abortable response stream. + * This is guaranteed to never be null. + */ + private final SelectRecordsInputStream wrappedStream; + + private final String bucket; + + private final String key; + + private final String uri; + + private final S3AReadOpContext readContext; + + private final S3AInstrumentation.InputStreamStatistics streamStatistics; + + private long readahead; + + /** + * Create the stream. + * The read attempt is initiated immediately. 
+ * @param readContext read context + * @param objectAttributes object attributes from a HEAD request + * @param selectResponse response from the already executed call + * @throws IOException failure + */ + @Retries.OnceTranslated + public SelectInputStream( + final S3AReadOpContext readContext, + final S3ObjectAttributes objectAttributes, + final SelectObjectContentResult selectResponse) throws IOException { + Preconditions.checkArgument(isNotEmpty(objectAttributes.getBucket()), + "No Bucket"); + Preconditions.checkArgument(isNotEmpty(objectAttributes.getKey()), + "No Key"); + this.objectAttributes = objectAttributes; + this.bucket = objectAttributes.getBucket(); + this.key = objectAttributes.getKey(); + this.uri = "s3a://" + this.bucket + "/" + this.key; + this.readContext = readContext; + this.readahead = readContext.getReadahead(); + this.streamStatistics = readContext.getInstrumentation() + .newInputStreamStatistics(); + SelectRecordsInputStream stream = once( + "S3 Select", + uri, + () -> selectResponse.getPayload() + .getRecordsInputStream(new SelectObjectContentEventVisitor() { + @Override + public void visit(final SelectObjectContentEvent.EndEvent event) { + LOG.debug("Completed successful S3 select read from {}", uri); + completedSuccessfully.set(true); + } + })); + this.wrappedStream = checkNotNull(stream); + // this stream is already opened, so mark as such in the statistics. + streamStatistics.streamOpened(); + } + + @Override + public void close() throws IOException { + long skipped = 0; + boolean aborted = false; + if (!closed.getAndSet(true)) { + try { + // set up for aborts. + // if we know the available amount > readahead. Abort. + // + boolean shouldAbort = wrappedStream.available() > readahead; + if (!shouldAbort) { + // read our readahead range worth of data + skipped = wrappedStream.skip(readahead); + shouldAbort = wrappedStream.read() >= 0; + } + // now, either there is data left or not. + if (shouldAbort) { + // yes, more data. Abort and add this fact to the stream stats + aborted = true; + wrappedStream.abort(); + } + } catch (IOException | AbortedException e) { + LOG.debug("While closing stream", e); + } finally { + IOUtils.cleanupWithLogger(LOG, wrappedStream); + streamStatistics.streamClose(aborted, skipped); + streamStatistics.close(); + super.close(); + } + } + } + + /** + * Verify that the input stream is open. Non blocking; this gives + * the last state of the atomic {@link #closed} field. + * @throws PathIOException if the connection is closed. + */ + private void checkNotClosed() throws IOException { + if (closed.get()) { + throw new PathIOException(uri, FSExceptionMessages.STREAM_IS_CLOSED); + } + } + + @Override + public int available() throws IOException { + checkNotClosed(); + return wrappedStream.available(); + } + + @Override + @Retries.OnceTranslated + public synchronized long skip(final long n) throws IOException { + checkNotClosed(); + long skipped = once("skip", uri, () -> wrappedStream.skip(n)); + pos.addAndGet(skipped); + // treat as a forward skip for stats + streamStatistics.seekForwards(skipped); + return skipped; + } + + @Override + public long getPos() { + return pos.get(); + } + + /** + * Set the readahead. + * @param readahead The readahead to use. null means to use the default. + */ + @Override + public void setReadahead(Long readahead) { + this.readahead = validateReadahead(readahead); + } + + /** + * Get the current readahead value. 
+ * @return the readahead + */ + public long getReadahead() { + return readahead; + } + + /** + * Read a byte. There's no attempt to recover, but AWS-SDK exceptions + * such as {@code SelectObjectContentEventException} are translated into + * IOExceptions. + * @return a byte read or -1 for an end of file. + * @throws IOException failure. + */ + @Override + @Retries.OnceTranslated + public synchronized int read() throws IOException { + checkNotClosed(); + int byteRead; + try { + byteRead = once("read()", uri, () -> wrappedStream.read()); + } catch (EOFException e) { + // this could be one of: end of file, some IO failure + if (completedSuccessfully.get()) { + // read was successful + return -1; + } else { + // the stream closed prematurely + LOG.info("Reading of S3 Select data from {} failed before all results" + + " were generated.", uri); + streamStatistics.readException(); + throw new PathIOException(uri, + "Read of S3 Select data did not complete"); + } + } + + if (byteRead >= 0) { + incrementBytesRead(1); + } + return byteRead; + } + + @SuppressWarnings("NullableProblems") + @Override + @Retries.OnceTranslated + public synchronized int read(final byte[] buf, final int off, final int len) + throws IOException { + checkNotClosed(); + validatePositionedReadArgs(pos.get(), buf, off, len); + if (len == 0) { + return 0; + } + + int bytesRead; + try { + streamStatistics.readOperationStarted(pos.get(), len); + bytesRead = wrappedStream.read(buf, off, len); + } catch (EOFException e) { + streamStatistics.readException(); + // the base implementation swallows EOFs. + return -1; + } + + incrementBytesRead(bytesRead); + streamStatistics.readOperationCompleted(len, bytesRead); + return bytesRead; + } + + /** + * Forward seeks are supported, but not backwards ones. + * Forward seeks are implemented using read, which + * means that long-distance seeks will be (literally) expensive. + * + * @param newPos new seek position. + * @throws PathIOException Backwards seek attempted. + * @throws EOFException attempt to seek past the end of the stream. + * @throws IOException IO failure while skipping bytes + */ + @Override + @Retries.OnceTranslated + public synchronized void seek(long newPos) throws IOException { + long current = getPos(); + long distance = newPos - current; + if (distance < 0) { + throw unsupported(SEEK_UNSUPPORTED + + " backwards from " + current + " to " + newPos); + } + if (distance == 0) { + LOG.debug("ignoring seek to current position."); + } else { + // the complicated one: Forward seeking. Useful for split files. + LOG.debug("Forward seek by reading {} bytes", distance); + long bytesSkipped = 0; + // read byte-by-byte, hoping that buffering will compensate for this. + // doing it this way ensures that the seek stops at exactly the right + // place. skip(len) can return a smaller value, at which point + // it's not clear what to do. + while (distance > 0) { + int r = read(); + if (r == -1) { + // reached an EOF too early + throw new EOFException("Seek to " + newPos + + " reached End of File at offset " + getPos()); + } + distance--; + bytesSkipped++; + } + // read has finished. + streamStatistics.seekForwards(bytesSkipped); + } + } + + /** + * Build an exception to raise when an operation is not supported here. + * @param action action which is unsupported. + * @return an exception to throw.
+ */ + protected PathIOException unsupported(final String action) { + return new PathIOException( + String.format("s3a://%s/%s", bucket, key), + action + " not supported"); + } + + @Override + public boolean seekToNewSource(long targetPos) throws IOException { + return false; + } + + // Not supported. + @Override + public boolean markSupported() { + return false; + } + + @SuppressWarnings("NonSynchronizedMethodOverridesSynchronizedMethod") + @Override + public void mark(int readLimit) { + // Do nothing + } + + @SuppressWarnings("NonSynchronizedMethodOverridesSynchronizedMethod") + @Override + public void reset() throws IOException { + throw unsupported("Mark"); + } + + /** + * Aborts the IO. + */ + public void abort() { + if (!closed.get()) { + LOG.debug("Aborting"); + wrappedStream.abort(); + } + } + + /** + * Read at a specific position. + * Reads at a position earlier than the current {@link #getPos()} position + * will fail with a {@link PathIOException}. See {@link #seek(long)}. + * Unlike the base implementation and the requirements of the filesystem + * specification, this updates the stream position as returned in + * {@link #getPos()}. + * @param position offset in the stream. + * @param buffer buffer to read in to. + * @param offset offset within the buffer + * @param length amount of data to read. + * @return the result. + * @throws PathIOException Backwards seek attempted. + * @throws EOFException attempt to seek past the end of the stream. + * @throws IOException IO failure while seeking in the stream or reading data. + */ + @Override + public int read(final long position, + final byte[] buffer, + final int offset, + final int length) + throws IOException { + // maybe seek forwards to the position. + seek(position); + return read(buffer, offset, length); + } + + /** + * Increment the bytes read counter if there is a stats instance + * and the number of bytes read is more than zero. + * This also updates the {@link #pos} marker by the same value. + * @param bytesRead number of bytes read + */ + private void incrementBytesRead(long bytesRead) { + if (bytesRead > 0) { + pos.addAndGet(bytesRead); + } + streamStatistics.bytesRead(bytesRead); + if (readContext.getStats() != null && bytesRead > 0) { + readContext.getStats().incrementBytesRead(bytesRead); + } + } + + /** + * Get the Stream statistics. + * @return the statistics for this stream. + */ + @InterfaceAudience.Private + @InterfaceStability.Unstable + public S3AInstrumentation.InputStreamStatistics getS3AStreamStatistics() { + return streamStatistics; + } + + /** + * String value includes statistics as well as stream state. + * Important: there are no guarantees as to the stability + * of this value. + * @return a string value for printing in logs/diagnostics + */ + @Override + @InterfaceStability.Unstable + public String toString() { + String s = streamStatistics.toString(); + synchronized (this) { + final StringBuilder sb = new StringBuilder( + "SelectInputStream{"); + sb.append(uri); + sb.append("; state ").append(!closed.get() ?
"open" : "closed"); + sb.append("; pos=").append(getPos()); + sb.append("; readahead=").append(readahead); + sb.append('\n').append(s); + sb.append('}'); + return sb.toString(); + } + } +} diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/SelectTool.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/SelectTool.java new file mode 100644 index 0000000000..8c87694570 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/select/SelectTool.java @@ -0,0 +1,355 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.select; + +import java.io.BufferedReader; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.OutputStream; +import java.io.PrintStream; +import java.nio.charset.StandardCharsets; +import java.util.List; +import java.util.Locale; +import java.util.Optional; +import java.util.Scanner; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.FutureDataInputStreamBuilder; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.impl.FutureIOSupport; +import org.apache.hadoop.fs.s3a.S3AFileSystem; +import org.apache.hadoop.fs.s3a.commit.Duration; +import org.apache.hadoop.fs.s3a.commit.DurationInfo; +import org.apache.hadoop.fs.s3a.s3guard.S3GuardTool; +import org.apache.hadoop.fs.shell.CommandFormat; +import org.apache.hadoop.util.ExitUtil; + +import static org.apache.commons.lang3.StringUtils.isNotEmpty; +import static org.apache.hadoop.io.IOUtils.cleanupWithLogger; +import static org.apache.hadoop.service.launcher.LauncherExitCodes.*; +import static org.apache.hadoop.fs.s3a.select.SelectConstants.*; + +/** + * This is a CLI tool for the select operation, which is available + * through the S3Guard command. + * + * Usage: + *
+ * <pre> + *   hadoop s3guard select [options] Path Statement + * </pre>
+ */ +public class SelectTool extends S3GuardTool { + + private static final Logger LOG = + LoggerFactory.getLogger(SelectTool.class); + + public static final String NAME = "select"; + + public static final String PURPOSE = "make an S3 Select call"; + + private static final String USAGE = NAME + + " [OPTIONS]" + + " [-limit rows]" + + " [-header (use|none|ignore)]" + + " [-out path]" + + " [-expected rows]" + + " [-compression (gzip|bzip2|none)]" + + " [-inputformat csv]" + + " [-outputformat csv]" + " +``` + +The output is printed, followed by some summary statistics, unless the `-out` +option is used to declare a destination file. In this mode +status will be logged to the console, but the output of the query will be +saved directly to the output file. + +### Example 1 + +Read the first 100 rows of the landsat dataset where cloud cover is zero: + +```bash +hadoop s3guard select -header use -compression gzip -limit 100 \ + s3a://landsat-pds/scene_list.gz \ + "SELECT * FROM S3OBJECT s WHERE s.cloudCover = '0.0'" +``` + +### Example 2 + +Return the `entityId` column for all rows in the dataset where the cloud +cover was "0.0", and save it to the file `output.csv`: + +```bash +hadoop s3guard select -header use -out s3a://mybucket/output.csv \ + -compression gzip \ + s3a://landsat-pds/scene_list.gz \ + "SELECT s.entityId from S3OBJECT s WHERE s.cloudCover = '0.0'" +``` + +This file will: + +1. Be UTF-8 encoded. +1. Have quotes on all columns returned. +1. Use commas as a separator. +1. Not have any header. + +The output can be saved to a file with the `-out` option. Note also that +`-D key=value` settings can be used to control the operation, if placed after +the `s3guard` command and before `select`. + + +```bash +hadoop s3guard \ + -D fs.s3a.select.output.csv.quote.fields=asneeded \ + select \ + -header use \ + -compression gzip \ + -limit 500 \ + -inputformat csv \ + -outputformat csv \ + -out s3a://hwdev-steve-new/output.csv \ + s3a://landsat-pds/scene_list.gz \ + "SELECT s.entityId from S3OBJECT s WHERE s.cloudCover = '0.0'" +``` + + +## Use in MR/Analytics queries: Work in Progress + +S3 Select support in analytics queries is a work in progress. It does +not work reliably with large source files where the work is split up. + +As a proof of concept *only*, S3 Select queries can be made through +MapReduce jobs which use any Hadoop `RecordReader` +class which uses the new `openFile()` API. + +Currently this consists of the following MRv2 readers. + +``` +org.apache.hadoop.mapreduce.lib.input.LineRecordReader +org.apache.hadoop.mapreduce.lib.input.FixedLengthRecordReader +``` + +And a limited number of the MRv1 record readers: + +``` +org.apache.hadoop.mapred.LineRecordReader +``` + +All of these readers use the new API and can have their optional/mandatory +options set via the `JobConf` used when creating/configuring the reader. + +These readers are instantiated within input formats; the following +formats therefore support S3 Select.
+ +``` +org.apache.hadoop.mapreduce.lib.input.FixedLengthInputFormat +org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat +org.apache.hadoop.mapreduce.lib.input.NLineInputFormat +org.apache.hadoop.mapreduce.lib.input.TextInputFormat +org.apache.hadoop.mapred.KeyValueTextInputFormat +org.apache.hadoop.mapred.TextInputFormat +org.apache.hadoop.mapred.lib.NLineInputFormat +``` + +All `JobConf` options which begin with the prefix `mapreduce.job.input.file.option.` +will have that prefix stripped and the remainder used as the name for an option +when opening the file. + +All `JobConf` options which begin with the prefix `mapreduce.job.input.file.must.` +will be converted into mandatory options. + +To use an S3 Select call, set the following options: + +``` +mapreduce.job.input.file.must.fs.s3a.select.sql = <SQL statement> +mapreduce.job.input.file.must.fs.s3a.select.input.format = CSV +mapreduce.job.input.file.must.fs.s3a.select.output.format = CSV +``` + +Further options may be set to tune the behaviour, for example: + +```java +jobConf.set("mapreduce.job.input.file.must.fs.s3a.select.input.csv.header", "use"); +``` + +*Note*: How to tell if a reader has migrated to the new `openFile()` builder +API: + +Set a mandatory option which is not known; if the job does not fail then +an old reader is being used. + +```java +jobConf.set("mapreduce.job.input.file.must.unknown.option", "anything"); +``` + + +### Querying Compressed objects + +S3 Select queries can be made against gzipped source files; the S3A input +stream receives the output in text format, rather than as a (re)compressed +stream. + +To read a gzip file, set `fs.s3a.select.input.compression` to `gzip`. + +```java +jobConf.set("mapreduce.job.input.file.must.fs.s3a.select.input.compression", + "gzip"); +``` + + +Most of the Hadoop RecordReader classes automatically choose a decompressor +based on the extension of the source file. This causes problems when +reading `.gz` files, because S3 Select is automatically decompressing and +returning CSV-formatted text. + +By default, a query across gzipped files will fail with the error +"IOException: not a gzip file". + +To avoid this problem, declare that the job should switch to the +"Passthrough Codec" for all files with a ".gz" extension: + +```java +jobConf.set("io.compression.codecs", + "org.apache.hadoop.io.compress.PassthroughCodec"); +jobConf.set("io.compress.passthrough.extension", ".gz"); +``` + +Obviously, this breaks normal `.gz` decompression: only set it on S3 Select +jobs. + +## S3 Select configuration options + +Consult the javadocs for `org.apache.hadoop.fs.s3a.select.SelectConstants`. + +The listed options can be set in `core-site.xml`, supported by S3A per-bucket +configuration, and can be set programmatically on the `Configuration` object +used to configure a new filesystem instance. + +Any of these options can be set in the builder returned by the `openFile()` call; +simply set them through a chain of `builder.must()` operations, as in the sketch +after the listing below. + +```xml +<property> + <name>fs.s3a.select.input.format</name> + <value>csv</value> + <description>Input format</description> +</property> + +<property> + <name>fs.s3a.select.output.format</name> + <value>csv</value> + <description>Output format</description> +</property> + +<property> + <name>fs.s3a.select.input.csv.comment.marker</name> + <value>#</value> + <description>In S3 Select queries: the marker for comment lines in CSV files</description> +</property> + +<property> + <name>fs.s3a.select.input.csv.record.delimiter</name> + <value>\n</value> + <description>In S3 Select queries over CSV files: the record delimiter. + \t is remapped to the TAB character, \r to CR \n to newline. \\ to \ + and \" to " + </description> +</property> + +<property> + <name>fs.s3a.select.input.csv.field.delimiter</name> + <value>,</value> + <description>In S3 Select queries over CSV files: the field delimiter. + \t is remapped to the TAB character, \r to CR \n to newline. \\ to \ + and \" to " + </description> +</property> + +<property> + <name>fs.s3a.select.input.csv.quote.character</name> + <value>"</value> + <description>In S3 Select queries over CSV files: quote character. + \t is remapped to the TAB character, \r to CR \n to newline. \\ to \ + and \" to " + </description> +</property> + +<property> + <name>fs.s3a.select.input.csv.quote.escape.character</name> + <value>\\</value> + <description>In S3 Select queries over CSV files: quote escape character. + \t is remapped to the TAB character, \r to CR \n to newline. \\ to \ + and \" to " + </description> +</property> + +<property> + <name>fs.s3a.select.input.csv.header</name> + <value>none</value> + <description>In S3 Select queries over CSV files: what is the role of the header? One of "none", "ignore" and "use"</description> +</property> + +<property> + <name>fs.s3a.select.input.compression</name> + <value>none</value> + <description>In S3 Select queries, the source compression + algorithm. One of: "none" and "gzip"</description> +</property> + +<property> + <name>fs.s3a.select.output.csv.quote.fields</name> + <value>always</value> + <description> + In S3 Select queries: should fields in generated CSV Files be quoted? + One of: "always", "asneeded". + </description> +</property> + +<property> + <name>fs.s3a.select.output.csv.quote.character</name> + <value>"</value> + <description> + In S3 Select queries: the quote character for generated CSV Files. + </description> +</property> + +<property> + <name>fs.s3a.select.output.csv.quote.escape.character</name> + <value>\\</value> + <description> + In S3 Select queries: the quote escape character for generated CSV Files. + </description> +</property> + +<property> + <name>fs.s3a.select.output.csv.record.delimiter</name> + <value>\n</value> + <description> + In S3 Select queries: the record delimiter for generated CSV Files. + </description> +</property> + +<property> + <name>fs.s3a.select.output.csv.field.delimiter</name> + <value>,</value> + <description> + In S3 Select queries: the field delimiter for generated CSV Files. + </description> +</property> + +<property> + <name>fs.s3a.select.errors.include.sql</name> + <value>false</value> + <description> + Include the SQL statement in errors: this is useful for development but + may leak security and Personally Identifying Information in production, + so must be disabled there. + </description> +</property> +```
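+As a worked sketch only: the path and query here are illustrative, checked
+exceptions are elided, and the only APIs assumed are `openFile()`,
+`must()`/`opt()` and the `FutureIOSupport` helper added in this patch:
+
+```java
+// Open a landsat-style CSV object with a select query attached.
+FutureDataInputStreamBuilder builder = fs.openFile(csvPath)
+    .must("fs.s3a.select.sql",
+        "SELECT s.entityId FROM S3OBJECT s WHERE s.\"cloudCover\" = '0.0'")
+    .must("fs.s3a.select.input.format", "csv")
+    .must("fs.s3a.select.output.format", "csv")
+    .opt("fs.s3a.select.input.csv.header", "use");
+// build() is asynchronous: other work may be done before awaiting the stream.
+CompletableFuture<FSDataInputStream> future = builder.build();
+try (FSDataInputStream select = FutureIOSupport.awaitFuture(future)) {
+  // the stream contains the query results, not the raw object.
+}
+```
+
+Because `build()` returns a `CompletableFuture`, awaiting the result can be
+deferred until the stream is actually needed.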
+ +## Security and Privacy + +SQL Injection attacks are the classic attack on data. +Because S3 Select is a read-only API, the classic ["Bobby Tables"](https://xkcd.com/327/) +attack to gain write access isn't going to work. Even so: sanitize your inputs. + +CSV does have security issues of its own, specifically: + +*Excel and other spreadsheets may interpret some fields beginning with special +characters as formulas, and execute them* + +S3 Select does not appear vulnerable to this, but in workflows where untrusted +data eventually ends up in a spreadsheet (including Google Document spreadsheets), +the data should be sanitized/audited first. There is no support for +such sanitization in S3 Select or in the S3A connector. + +Logging Select statements may expose secrets if they are in the statement. +Even if they are just logged, this may potentially leak Personally Identifying +Information as covered in the EU GDPR legislation and equivalents. + +For both privacy and security reasons, SQL statements are not included +in exception strings by default, nor logged at INFO level. + +To enable them, set `fs.s3a.select.errors.include.sql` to `true`, either in the +site/application configuration, or as an option in the builder for a +single request. When set, the request will also be logged at +the INFO level of the log `org.apache.hadoop.fs.s3a.select.SelectBinding`.
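+For a single request this can be done in the `openFile()` builder; a minimal
+sketch, reusing the hypothetical `csvPath` and `sql` from the earlier example
+(`opt()` rather than `must()`, as the setting is merely a hint):
+
+```java
+CompletableFuture<FSDataInputStream> future = fs.openFile(csvPath)
+    .must("fs.s3a.select.sql", sql)
+    .opt("fs.s3a.select.errors.include.sql", "true")
+    .build();
+```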
+ +Personally Identifiable Information is not printed in the AWS S3 logs. +Those logs contain only the SQL keywords from the query planner. +All column names and literals are masked. Following is a sample log example: + +*Query:* + +```sql +SELECT * FROM S3OBJECT s; +``` + +*Log:* + +```sql +select (project (list (project_all))) (from (as str0 (id str1 case_insensitive))) +``` + +Note also that: + +1. Debug-level Hadoop logs for the module `org.apache.hadoop.fs.s3a` and other +components' debug logs may also log the SQL statements (e.g. aws-sdk HTTP logs). + +The best practice here is: only enable SQL in exceptions while developing +SQL queries, especially in an application/notebook where the exception +text is a lot easier to see than the application logs. + +In production: don't log or report. If you do, all logs and output must be +considered sensitive from security and privacy perspectives. + +The `hadoop s3guard select` command does enable the logging, so it +can be used as an initial place to experiment with the SQL syntax. +Rationale: if you are constructing SQL queries on the command line, +your shell history is already tainted with the query. + +### Links + +* [CVE-2014-3524](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2014-3524). +* [The Absurdly Underestimated Dangers of CSV Injection](http://georgemauer.net/2017/10/07/csv-injection.html). +* [Comma Separated Vulnerabilities](https://www.contextis.com/blog/comma-separated-vulnerabilities). + +### SQL Syntax + +The SQL Syntax directly supported by the AWS S3 Select API is [documented by +Amazon](https://docs.aws.amazon.com/AmazonS3/latest/dev/s3-glacier-select-sql-reference.html). + +* Use single quotes for all constants, not double quotes. +* All CSV column values are strings unless cast to a type. +* Simple `SELECT` calls, no `JOIN`. + +### CSV formats + +"CSV" is less a format, more "a term meaning the data is in some nonstandard +line-by-line" text file, and there are even "multiline CSV files". + +S3 Select only supports a subset of the loose "CSV" concept, as covered in +the AWS documentation. There are also limits on how many columns and how +large a single line may be. + +The specific quotation character, field and record delimiters, comments and escape +characters can be configured in the Hadoop configuration. + +### Consistency, Concurrency and Error handling + +**Consistency** + +* Assume the usual S3 consistency model applies. + +* When enabled, S3Guard's DynamoDB table will declare whether or not +a newly deleted file is visible: if it is marked as deleted, the +select request will be rejected with a `FileNotFoundException`. + +* When an existing S3-hosted object is changed, the S3 select operation +may return the results of a SELECT call as applied to either the old +or new version. + +* We don't know whether you can get partially consistent reads, or whether +an extended read ever picks up a later value. + +* The AWS S3 load balancers can briefly cache 404/Not-Found entries +from a failed HEAD/GET request against a nonexistent file; this cached +entry can briefly create inconsistency, despite the +AWS "Create is consistent" model. There is no attempt to detect or recover from +this. + +**Concurrency** + +The outcome of overwriting the source file while the results of a select call +are still being read is undefined. + +The input stream returned by the operation is *NOT THREAD SAFE*. + +**Error Handling** + +If an attempt to issue an S3 select call fails, the S3A connector will +reissue the request if-and-only-if it believes a retry may succeed. +That is: it considers the operation to be idempotent, and retries if the failure is +considered to be a recoverable connectivity problem or a server-side rejection +which can be retried (500, 503).
+ +If an attempt to read data from an S3 select stream (`org.apache.hadoop.fs.s3a.select.SelectInputStream`) fails partway through the read, *no attempt is made to retry the operation*. + +In contrast, the normal S3A input stream tries to recover from (possibly transient) +failures by attempting to reopen the file. + + +## Performance + +The select operation is best when the least amount of data is returned by +the query, as this reduces the amount of data downloaded. + +* Limit the number of columns projected to only those needed. +* Use `LIMIT` to set an upper limit on the rows read, rather than implementing +a row counter in application code and closing the stream when reached. +This avoids having to abort the HTTPS connection and negotiate a new one +on the next S3 request. + +The select call itself can be slow, especially when the source is a multi-MB +compressed file with aggressive filtering in the `WHERE` clause. +Assumption: the select query starts at row 1 and scans through each row, +and does not return data until it has matched one or more rows. + +If the asynchronous nature of the `openFile().build().get()` sequence +can be taken advantage of, by performing other work before or in parallel +to the `get()` call: do it. + +## Troubleshooting + +Getting S3 Select code to work is hard, though those knowledgeable in SQL +will find it easier. + +Problems can be split into: + +1. Basic configuration of the client to issue the query. +1. Bad SQL select syntax and grammar. +1. Datatype casting issues. +1. Bad records/data in source files. +1. Failure to configure MR jobs to work correctly. +1. Failure of MR jobs due to codec misconfiguration or unsupported stream +operations (covered in the troubleshooting entries below). + +The exceptions here are all based on experience from writing the tests; +more may surface with broader use. + +All failures other than network errors on request initialization are considered +unrecoverable and will not be reattempted. + +As parse-time errors always state the line and column of an error, you can +simplify debugging by breaking a SQL statement across lines, e.g. + +```java +String sql = "SELECT\n" + + "s.entityId \n" + + "FROM " + "S3OBJECT s WHERE\n" + + "s.\"cloudCover\" = '100.0'\n" + + " LIMIT 100"; +``` +Now if the error is declared as "line 4", it will be on the select conditions; +the column offset will begin from the first character on that row. + +The SQL Statements issued are only included in exceptions if `fs.s3a.select.errors.include.sql` +is explicitly set to true. This can be done in an application during development, +or as an `openFile()` option parameter. This should only be done during development, +to reduce the risk of logging security or privacy information. + + +### "mid-query" failures on large datasets + +S3 Select returns paged results; the source file is _not_ filtered in +one go in the initial request. + +This means that errors related to the content of the data (type casting, etc) +may only surface partway through the read. The errors reported in such a +case may be different from those raised on reading the first page of data, +and will surface later in the read process.
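+During development, one way to surface such mid-read errors early is to drain
+the stream completely before trusting a query; a hedged sketch with a
+hypothetical helper (only `java.io` is assumed):
+
+```java
+// Read a select stream to the end, so that data errors which only
+// surface partway through the results are raised here, not later.
+static long drain(InputStream in) throws IOException {
+  byte[] buffer = new byte[8192];
+  long total = 0;
+  int bytesRead;
+  while ((bytesRead = in.read(buffer)) >= 0) {
+    total += bytesRead;
+  }
+  return total;
+}
+```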
+ +### External Resources for troubleshooting + +See: + +* [SELECT Command Reference](https://docs.aws.amazon.com/AmazonS3/latest/dev/s3-glacier-select-sql-reference-select.html) +* [SELECT Object Content](https://docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectSELECTContent.html) + +### IOException: "not a gzip file" + +This surfaces when trying to read in data from a `.gz` source file through an MR +or other analytics query, and the gzip codec has tried to parse it. + +``` +java.io.IOException: not a gzip file +at org.apache.hadoop.io.compress.zlib.BuiltInGzipDecompressor.processBasicHeader(BuiltInGzipDecompressor.java:496) +at org.apache.hadoop.io.compress.zlib.BuiltInGzipDecompressor.executeHeaderState(BuiltInGzipDecompressor.java:257) +at org.apache.hadoop.io.compress.zlib.BuiltInGzipDecompressor.decompress(BuiltInGzipDecompressor.java:186) +at org.apache.hadoop.io.compress.DecompressorStream.decompress(DecompressorStream.java:111) +at org.apache.hadoop.io.compress.DecompressorStream.read(DecompressorStream.java:105) +at java.io.InputStream.read(InputStream.java:101) +at org.apache.hadoop.util.LineReader.fillBuffer(LineReader.java:182) +at org.apache.hadoop.util.LineReader.readCustomLine(LineReader.java:306) +at org.apache.hadoop.util.LineReader.readLine(LineReader.java:174) +at org.apache.hadoop.mapreduce.lib.input.LineRecordReader.skipUtfByteOrderMark(LineRecordReader.java:158) +at org.apache.hadoop.mapreduce.lib.input.LineRecordReader.nextKeyValue(LineRecordReader.java:198) +``` + +The underlying problem is that the gzip decompressor is automatically enabled +when the source file ends with the ".gz" extension. Because S3 Select +returns decompressed data, the codec fails. + +The workaround here is to declare that the job should add the "Passthrough Codec" +to its list of known decompressors, and that this codec should declare the +file format it supports to be ".gz". + +``` +io.compression.codecs = org.apache.hadoop.io.compress.PassthroughCodec +io.compress.passthrough.extension = .gz +``` + +### AWSBadRequestException `InvalidColumnIndex` + + +Your SQL is wrong and the element at fault is considered an unknown column +name. + +``` +org.apache.hadoop.fs.s3a.AWSBadRequestException: + Select: SELECT * FROM S3OBJECT WHERE odd = true on test/testSelectOddLines.csv: + com.amazonaws.services.s3.model.AmazonS3Exception: + The column index at line 1, column 30 is invalid. + Please check the service documentation and try again. + (Service: Amazon S3; Status Code: 400; Error Code: InvalidColumnIndex; +``` + +Here it's the first line of the query, column 30. Paste the query +into an editor and position yourself on the line and column at fault. + +```sql +SELECT * FROM S3OBJECT WHERE odd = true + ^ HERE +``` + +Another example: + +``` +org.apache.hadoop.fs.s3a.AWSBadRequestException: Select: +SELECT * FROM S3OBJECT s WHERE s._1 = "true" on test/testSelectOddLines.csv: + com.amazonaws.services.s3.model.AmazonS3Exception: + The column index at line 1, column 39 is invalid. + Please check the service documentation and try again. + (Service: Amazon S3; Status Code: 400; + Error Code: InvalidColumnIndex; +``` + +Here it is because strings must be single quoted, not double quoted. + +```sql +SELECT * FROM S3OBJECT s WHERE s._1 = "true" + ^ HERE +``` + +S3 select uses double quotes to wrap column names, interprets the string +as column "true", and fails with a non-intuitive message.
+ +*Tip*: look for the element at fault and treat the `InvalidColumnIndex` +message as a parse-time message, rather than the definitive root +cause of the problem. + +### AWSBadRequestException `ParseInvalidPathComponent` + +Your SQL is wrong. + +``` +org.apache.hadoop.fs.s3a.AWSBadRequestException: +Select: SELECT * FROM S3OBJECT s WHERE s.'odd' is "true" on test/testSelectOddLines.csv +: com.amazonaws.services.s3.model.AmazonS3Exception: Invalid Path component, + expecting either an IDENTIFIER or STAR, got: LITERAL,at line 1, column 34. + (Service: Amazon S3; Status Code: 400; Error Code: ParseInvalidPathComponent; + +``` + +``` +SELECT * FROM S3OBJECT s WHERE s.'odd' is "true" on test/testSelectOddLines.csv + ^ HERE +``` + + +### AWSBadRequestException `ParseExpectedTypeName` + +Your SQL is still wrong. + +``` + +org.apache.hadoop.fs.s3a.AWSBadRequestException: + Select: SELECT * FROM S3OBJECT s WHERE s.odd = "true" +on test/testSelectOddLines.csv: +com.amazonaws.services.s3.model.AmazonS3Exception +: Expected type name, found QUOTED_IDENTIFIER:'true' at line 1, column 41. +(Service: Amazon S3; Status Code: 400; Error Code: ParseExpectedTypeName; +``` + +### `ParseUnexpectedToken` + +Your SQL is broken. + +``` +org.apache.hadoop.fs.s3a.AWSBadRequestException: +Select: SELECT * FROM S3OBJECT s WHERE s.5 = `true` on test/testSelectOddLines.csv: +com.amazonaws.services.s3.model.AmazonS3Exception: +Unexpected token found LITERAL:5d-1 at line 1, column 33. +(Service: Amazon S3; Status Code: 400; Error Code: ParseUnexpectedToken; +``` +### `ParseUnexpectedOperator` + +Your SQL is broken. + +``` +com.amazonaws.services.s3.model.AmazonS3Exception: Unexpected operator OPERATOR:'%' at line 1, column 45. +(Service: Amazon S3; Status Code: 400; +Error Code: ParseUnexpectedOperator; Request ID: E87F30C57436B459; +S3 Extended Request ID: UBFOIgkQxBBL+bcBFPaZaPBsjdnd8NRz3NFWAgcctqm3n6f7ib9FMOpR+Eu1Cy6cNMYHCpJbYEY + =:ParseUnexpectedOperator: Unexpected operator OPERATOR:'%' at line 1, column 45. +at java.util.concurrent.CompletableFuture.reportGet(CompletableFuture.java:357) +at java.util.concurrent.CompletableFuture.get(CompletableFuture.java:1895) +``` + +### `MissingHeaders` + +``` +org.apache.hadoop.fs.s3a.AWSBadRequestException: +Select: SELECT * FROM S3OBJECT s WHERE s."odd" = `true` on test/testSelectOddLines.csv: +com.amazonaws.services.s3.model.AmazonS3Exception: +Some headers in the query are missing from the file. +Please check the file and try again. +(Service: Amazon S3; Status Code: 400; Error Code: MissingHeaders; +``` + +1. There's a header used in the query which doesn't match any in the document +itself. +1. The header option for the select query is set to "none" or "ignore", and +you are trying to use a header named there. + +This can happen if you are trying to use double quotes for constants in the +SQL expression. + +``` +SELECT * FROM S3OBJECT s WHERE s."odd" = "true" on test/testSelectOddLines.csv: + ^ HERE +``` + +Double quotes (") may only be used when naming columns; for constants +single quotes are required. + +### Method not allowed + +``` +org.apache.hadoop.fs.s3a.AWSS3IOException: Select on test/testSelectWholeFile: +com.amazonaws.services.s3.model.AmazonS3Exception: The specified method is not +allowed against this resource. (Service: Amazon S3; Status Code: 405; +Error Code: MethodNotAllowed; +``` + +You are trying to use S3 Select to read data which for some reason +you are not allowed to. 
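+When a store rejects select calls outright, it is also worth checking that the
+client side believes select is available at all. A sketch, assuming the S3A
+filesystem's `hasCapability()` probe recognizes the `S3_SELECT_CAPABILITY`
+constant from `SelectConstants` (it returns false when select is disabled):
+
+```java
+FileSystem fs = path.getFileSystem(conf);
+boolean selectAvailable = fs instanceof S3AFileSystem
+    && ((S3AFileSystem) fs).hasCapability(
+        SelectConstants.S3_SELECT_CAPABILITY);
+```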
+
+### AWSBadRequestException `InvalidTextEncoding`
+
+The file couldn't be parsed. This can happen if you try to read a `.gz` file
+and forget to set the compression in the select request.
+
+That can be done through the `fs.s3a.select.compression` option.
+
+```
+org.apache.hadoop.fs.s3a.AWSBadRequestException:
+ Select: '" SELECT * FROM S3OBJECT s WHERE endstation_name = 'Bayswater Road: Hyde Park' "
+ on s3a://example/dataset.csv.gz:
+ com.amazonaws.services.s3.model.AmazonS3Exception:
+ UTF-8 encoding is required. The text encoding error was found near byte 8,192.
+ (Service: Amazon S3; Status Code: 400; Error Code: InvalidTextEncoding
+```
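+
+A minimal sketch of a select call which declares the source compression,
+assuming a filesystem instance `fs`, a gzipped CSV source `path`, and "gzip"
+as the value for the compression option named above:
+
+```java
+// declare the SQL and the source compression, then await the async open
+FutureDataInputStreamBuilder builder = fs.openFile(path)
+    .must("fs.s3a.select.sql", "SELECT * FROM S3OBJECT s")
+    .must("fs.s3a.select.compression", "gzip");
+FSDataInputStream stream = FutureIOSupport.awaitFuture(builder.build());
+```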
+
+### AWSBadRequestException `InvalidCompressionFormat` "GZIP is not applicable to the queried object"
+
+A SELECT call has been made using a compression setting which doesn't match
+that of the source object, such as when the source is a plain text file.
+
+```
+org.apache.hadoop.fs.s3a.AWSBadRequestException: Select:
+ '" SELECT * FROM S3OBJECT s WHERE endstation_name = 'Bayswater Road: Hyde Park' "
+ on s3a://example/dataset.csv:
+ com.amazonaws.services.s3.model.AmazonS3Exception:
+ GZIP is not applicable to the queried object. Please correct the request and try again.
+ (Service: Amazon S3; Status Code: 400; Error Code: InvalidCompressionFormat;
+ at org.apache.hadoop.fs.s3a.S3AUtils.translateException(S3AUtils.java:212)
+ at org.apache.hadoop.fs.s3a.Invoker.once(Invoker.java:111)
+...
+Caused by: com.amazonaws.services.s3.model.AmazonS3Exception: GZIP is not applicable to the queried object.
+ Please correct the request and try again.
+ Service: Amazon S3; Status Code: 400; Error Code: InvalidCompressionFormat;
+ at com.amazonaws.http.AmazonHttpClient$RequestExecutor.handleErrorResponse
+ ...
+```
+
+### `PathIOException`: "seek() not supported"
+
+The input stream returned by the select call does not support seeking
+backwards in the stream.
+
+Similarly, `PositionedReadable` operations will fail when used to read
+data at any offset other than that of `getPos()`.
+
+```
+org.apache.hadoop.fs.PathIOException: `s3a://landsat-pds/landsat.csv.gz': seek() not supported
+
+ at org.apache.hadoop.fs.s3a.select.SelectInputStream.unsupported(SelectInputStream.java:254)
+ at org.apache.hadoop.fs.s3a.select.SelectInputStream.seek(SelectInputStream.java:243)
+ at org.apache.hadoop.fs.FSDataInputStream.seek(FSDataInputStream.java:66)
+```
+
+There is no fix for this. You can move forward in a file using `skip(offset)`;
+bear in mind that the return value indicates how many bytes were actually
+skipped, which may be less than expected.
+
+### `IllegalArgumentException`: Unknown mandatory key "fs.s3a.select.sql"
+
+The filesystem being opened is not an S3A filesystem, so the mandatory
+`fs.s3a.select.sql` option is not recognized.
+
+```
+java.lang.IllegalArgumentException: Unknown mandatory key "fs.s3a.select.sql"
+at com.google.common.base.Preconditions.checkArgument(Preconditions.java:88)
+at org.apache.hadoop.fs.AbstractFSBuilder.lambda$rejectUnknownMandatoryKeys$0(AbstractFSBuilder.java:331)
+at java.lang.Iterable.forEach(Iterable.java:75)
+at java.util.Collections$UnmodifiableCollection.forEach(Collections.java:1080)
+at org.apache.hadoop.fs.AbstractFSBuilder.rejectUnknownMandatoryKeys(AbstractFSBuilder.java:330)
+at org.apache.hadoop.fs.filesystem.openFileWithOptions(FileSystem.java:3541)
+at org.apache.hadoop.fs.FileSystem$FSDataInputStreamBuilder.build(FileSystem.java:4442)
+```
+
+* Verify that the URL has an "s3a:" prefix.
+* If it does, there may be a non-standard S3A implementation, or a
+filtering/relaying class has been placed in front of the S3AFilesystem.
+
+### `IllegalArgumentException`: "Unknown mandatory key in non-select file I/O"
+
+The file options to tune an S3 select call are only valid when a SQL expression
+is set in the `fs.s3a.select.sql` option. If not, any such option added as a `must()` value
+will fail.
+
+```
+java.lang.IllegalArgumentException: Unknown mandatory key for s3a://example/test/testSelectOptionsOnlyOnSelectCalls.csv in non-select file I/O "fs.s3a.select.input.csv.header"
+
+ at com.google.common.base.Preconditions.checkArgument(Preconditions.java:115)
+ at org.apache.hadoop.fs.impl.AbstractFSBuilderImpl.lambda$rejectUnknownMandatoryKeys$0(AbstractFSBuilderImpl.java:352)
+ at java.lang.Iterable.forEach(Iterable.java:75)
+ at java.util.Collections$UnmodifiableCollection.forEach(Collections.java:1080)
+ at org.apache.hadoop.fs.impl.AbstractFSBuilderImpl.rejectUnknownMandatoryKeys(AbstractFSBuilderImpl.java:351)
+ at org.apache.hadoop.fs.s3a.S3AFileSystem.openFileWithOptions(S3AFileSystem.java:3736)
+ at org.apache.hadoop.fs.FileSystem$FSDataInputStreamBuilder.build(FileSystem.java:4471)
+```
+
+Requiring these options without providing a SQL query is invariably an error.
+Fix: add the SQL statement, or use `opt()` calls to set the option.
+
+If the `fs.s3a.select.sql` option is set, and still a key is rejected, then
+either the spelling of the key is wrong, it has leading or trailing spaces,
+or it is an option not supported in that specific release of Hadoop.
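+
+The difference, as a minimal sketch (assuming a filesystem instance `fs` and
+a source `path`; the header option is the one from the stack trace above, and
+its "use" value is illustrative):
+
+```java
+// must(): the open fails fast if a key is unknown or unsupported;
+// select options are only valid alongside a SQL expression.
+CompletableFuture<FSDataInputStream> selectRead = fs.openFile(path)
+    .must("fs.s3a.select.sql", "SELECT * FROM S3OBJECT s")
+    .must("fs.s3a.select.input.csv.header", "use")
+    .build();
+
+// opt(): a hint; filesystems which do not recognize the key ignore it.
+CompletableFuture<FSDataInputStream> plainRead = fs.openFile(path)
+    .opt("fs.s3a.select.input.csv.header", "use")
+    .build();
+```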
+
+### `PathIOException`: "seek() backwards ... not supported"
+
+Backwards seeks in an S3 Select `SelectInputStream` are not supported.
+
+```
+org.apache.hadoop.fs.PathIOException: `s3a://landsat-pds/scene_list.gz':
+ seek() backwards from 16387 to 0 not supported
+
+ at org.apache.hadoop.fs.s3a.select.SelectInputStream.unsupported(SelectInputStream.java:288)
+ at org.apache.hadoop.fs.s3a.select.SelectInputStream.seek(SelectInputStream.java:253)
+ at org.apache.hadoop.fs.FSDataInputStream.seek(FSDataInputStream.java:66)
+```
+
+### InvalidTableAlias
+
+The SELECT refers to the name of a column or alias which is not recognized. Either:
+
+* the name of a column is wrong, here `s.oddf`.
+* headers are not enabled for the CSV source file. Fix: enable them.
+* a generated alias is used, e.g. `s._1`, but headers have been enabled.
+Fix: disable headers, or use the header name.
+
+```
+org.apache.hadoop.fs.s3a.AWSBadRequestException:
+ SELECT * FROM S3OBJECT WHERE s."oddf" = 'true'
+ on s3a://example/test/testParseBrokenCSVFile:
+ com.amazonaws.services.s3.model.AmazonS3Exception:
+ Invalid table alias is specified at line 1, column 30.
+ Please check the file and try again. (Service: Amazon S3; Status Code: 400; Error Code: InvalidTableAlias;
+ Invalid table alias is specified at line 1, column 30. Please check the file and try again.
+ (Service: Amazon S3; Status Code: 400;
+ Error Code: InvalidTableAlias;
+ Request ID: 8693B86A52CFB91C;
+ at org.apache.hadoop.fs.s3a.S3AUtils.translateException(S3AUtils.java:225)
+ at org.apache.hadoop.fs.s3a.Invoker.once(Invoker.java:111)
+ at org.apache.hadoop.fs.s3a.Invoker.lambda$retry$3(Invoker.java:265)
+ ...
+Caused by: com.amazonaws.services.s3.model.AmazonS3Exception:
+ Invalid table alias is specified at line 1, column 30.
+ Please check the file and try again.
+ (Service: Amazon S3; Status Code: 400; Error Code: InvalidTableAlias; Request ID: 8693B86A52CFB91C;
+ at com.amazonaws.http.AmazonHttpClient$RequestExecutor.handleErrorResponse(AmazonHttpClient.java:1640)
+ at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeOneRequest(AmazonHttpClient.java:1304)
+ at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeHelper(AmazonHttpClient.java:1058)
+ at com.amazonaws.http.AmazonHttpClient$RequestExecutor.doExecute(AmazonHttpClient.java:743)
+ at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeWithTimer(AmazonHttpClient.java:717)
+ at com.amazonaws.http.AmazonHttpClient$RequestExecutor.execute(AmazonHttpClient.java:699)
+ at com.amazonaws.http.AmazonHttpClient$RequestExecutor.access$500(AmazonHttpClient.java:667)
+ at com.amazonaws.http.AmazonHttpClient$RequestExecutionBuilderImpl.execute(AmazonHttpClient.java:649)
+```
+
+### `AWSBadRequestException` "Attempt to convert from one data type to another failed: cast from STRING to TIMESTAMP."
+
+A string field could not be converted to a timestamp because one or more of
+its entries could not be parsed as timestamps.
+
+An example, from a CSV file where the "date" column normally holds a
+well-formatted timestamp, but in one row it is just "Tuesday":
+
+```sql
+SELECT CAST(s.date AS TIMESTAMP) FROM S3OBJECT s
+```
+
+```
+org.apache.hadoop.fs.s3a.AWSBadRequestException: Select on s3a://example/test/testParseBrokenCSVFile:
+com.amazonaws.services.s3.model.AmazonS3Exception:
+Attempt to convert from one data type to another failed: cast from STRING to TIMESTAMP.
+(Service: Amazon S3; Status Code: 400; Error Code: CastFailed;
+Request ID: E2158FE45AF2049A; S3 Extended Request ID: iM40fzGuaPt6mQo0QxDDX+AY1bAgSVD1sKErFq6Y4GDJYHIAnmc00i0EvGGnH+0MFCFhKIivIrQ=),
+S3 Extended Request ID: iM40fzGuaPt6mQo0QxDDX+AY1bAgSVD1sKErFq6Y4GDJYHIAnmc00i0EvGGnH+0MFCFhKIivIrQ=:CastFailed:
+Attempt to convert from one data type to another failed: cast from STRING to TIMESTAMP.
+(Service: Amazon S3; Status Code: 400; Error Code: CastFailed; Request ID: E2158FE45AF2049A; S3 Extended Request ID: iM40fzGuaPt6mQo0QxDDX+AY1bAgSVD1sKErFq6Y4GDJYHIAnmc00i0EvGGnH+0MFCFhKIivIrQ=)
+ at org.apache.hadoop.fs.s3a.S3AUtils.translateException(S3AUtils.java:225)
+ at org.apache.hadoop.fs.s3a.Invoker.once(Invoker.java:111)
+ at org.apache.hadoop.fs.s3a.Invoker.lambda$retry$3(Invoker.java:265)
+Caused by: com.amazonaws.services.s3.model.AmazonS3Exception:
+ Attempt to convert from one data type to another failed: cast from STRING to TIMESTAMP.
+ (Service: Amazon S3; Status Code: 400; Error Code: CastFailed;)
+```
+
+There's no way to recover from a bad record here; no option to skip invalid
+rows.
+
+*Note:* This is an example stack trace *without* the SQL being printed.
diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/testing.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/testing.md
index 34ba02919e..068269cfa5 100644
--- a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/testing.md
+++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/testing.md
@@ -478,6 +478,22 @@ the `fs.s3a.scale.test.csvfile` option set to its path.
 (yes, the space is necessary. The Hadoop `Configuration` class treats an empty
 value as "do not override the default").
+### Turning off S3 Select
+
+The S3 select tests are skipped when the S3 endpoint doesn't support S3 Select.
+
+```xml
+<property>
+  <name>fs.s3a.select.enabled</name>
+  <value>false</value>
+</property>
+```
+
+If your endpoint doesn't support that feature, this option should be in
+your `core-site.xml` file, so that trying to use S3 Select fails fast with
+a meaningful error ("S3 Select not supported") rather than a generic Bad Request
+exception.
+
 ### Testing Session Credentials
 
diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AAWSCredentialsProvider.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AAWSCredentialsProvider.java
index 267646ca25..9e8a871ef7 100644
--- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AAWSCredentialsProvider.java
+++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AAWSCredentialsProvider.java
@@ -39,6 +39,7 @@
 
 import static org.apache.hadoop.fs.s3a.Constants.*;
 import static org.apache.hadoop.fs.s3a.S3ATestConstants.*;
+import static org.apache.hadoop.fs.s3a.S3ATestUtils.getCSVTestPath;
 import static org.apache.hadoop.fs.s3a.S3AUtils.*;
 import static org.junit.Assert.*;
@@ -150,8 +151,7 @@ public void testAnonymousProvider() throws Exception {
     Configuration conf = new Configuration();
     conf.set(AWS_CREDENTIALS_PROVIDER,
         AnonymousAWSCredentialsProvider.class.getName());
-    Path testFile = new Path(
-        conf.getTrimmed(KEY_CSVTEST_FILE, DEFAULT_CSVTEST_FILE));
+    Path testFile = getCSVTestPath(conf);
     FileSystem fs = FileSystem.newInstance(testFile.toUri(), conf);
     assertNotNull(fs);
     assertTrue(fs instanceof S3AFileSystem);
diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AFailureHandling.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AFailureHandling.java
index 5cd737969b..5c2b5a399d 100644
--- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AFailureHandling.java
+++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AFailureHandling.java
@@ -25,7 +25,6 @@
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
-import org.junit.Assume;
 import org.junit.Test;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -37,6 +36,7 @@
 import java.util.List;
 
 import static org.apache.hadoop.fs.contract.ContractTestUtils.*;
+import static org.apache.hadoop.fs.s3a.S3ATestUtils.getLandsatCSVPath;
 import static org.apache.hadoop.test.LambdaTestUtils.*;
 
 /**
@@ -152,12 +152,9 @@ public void testMultiObjectDeleteSomeFiles() throws Throwable {
 
   @Test
   public void testMultiObjectDeleteNoPermissions() throws Throwable {
-    Configuration conf = getConfiguration();
-    String csvFile = conf.getTrimmed(KEY_CSVTEST_FILE, DEFAULT_CSVTEST_FILE);
-    Assume.assumeTrue("CSV test file is not the default",
-        DEFAULT_CSVTEST_FILE.equals(csvFile));
-    Path testFile = new Path(csvFile);
-    S3AFileSystem fs = (S3AFileSystem)testFile.getFileSystem(conf);
+    Path testFile = getLandsatCSVPath(getConfiguration());
+    S3AFileSystem fs = (S3AFileSystem)testFile.getFileSystem(
+        getConfiguration());
     intercept(MultiObjectDeleteException.class,
         () -> removeKeys(fs, fs.pathToKey(testFile)));
   }
diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestUtils.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestUtils.java
index 484f079e3e..e15c24aced 100644
--- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestUtils.java
+++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/S3ATestUtils.java
@@ -47,7 +47,6 @@
 import org.hamcrest.core.Is;
 import
org.junit.Assert; import org.junit.Assume; -import org.junit.internal.AssumptionViolatedException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -63,6 +62,7 @@ import java.util.concurrent.Callable; import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_SECURITY_CREDENTIAL_PROVIDER_PATH; +import static org.apache.commons.lang3.StringUtils.isNotEmpty; import static org.apache.hadoop.fs.contract.ContractTestUtils.skip; import static org.apache.hadoop.fs.s3a.FailureInjectionPolicy.*; import static org.apache.hadoop.fs.s3a.S3ATestConstants.*; @@ -144,7 +144,6 @@ public static S3AFileSystem createTestFileSystem(Configuration conf) * @param purge flag to enable Multipart purging * @return the FS * @throws IOException IO Problems - * @throws AssumptionViolatedException if the FS is not named */ public static S3AFileSystem createTestFileSystem(Configuration conf, boolean purge) @@ -158,12 +157,10 @@ public static S3AFileSystem createTestFileSystem(Configuration conf, testURI = URI.create(fsname); liveTest = testURI.getScheme().equals(Constants.FS_S3A); } - if (!liveTest) { - // This doesn't work with our JUnit 3 style test cases, so instead we'll - // make this whole class not run by default - throw new AssumptionViolatedException( - "No test filesystem in " + TEST_FS_S3A_NAME); - } + // This doesn't work with our JUnit 3 style test cases, so instead we'll + // make this whole class not run by default + Assume.assumeTrue("No test filesystem in " + TEST_FS_S3A_NAME, + liveTest); // patch in S3Guard options maybeEnableS3Guard(conf); S3AFileSystem fs1 = new S3AFileSystem(); @@ -192,7 +189,6 @@ public static void enableMultipartPurge(Configuration conf, int seconds) { * @param conf configuration * @return the FS * @throws IOException IO Problems - * @throws AssumptionViolatedException if the FS is not named */ public static FileContext createTestFileContext(Configuration conf) throws IOException { @@ -204,12 +200,10 @@ public static FileContext createTestFileContext(Configuration conf) testURI = URI.create(fsname); liveTest = testURI.getScheme().equals(Constants.FS_S3A); } - if (!liveTest) { - // This doesn't work with our JUnit 3 style test cases, so instead we'll - // make this whole class not run by default - throw new AssumptionViolatedException("No test filesystem in " - + TEST_FS_S3A_NAME); - } + // This doesn't work with our JUnit 3 style test cases, so instead we'll + // make this whole class not run by default + Assume.assumeTrue("No test filesystem in " + TEST_FS_S3A_NAME, + liveTest); // patch in S3Guard options maybeEnableS3Guard(conf); FileContext fc = FileContext.getFileContext(testURI, conf); @@ -327,10 +321,56 @@ public static String getTestProperty(Configuration conf, String defVal) { String confVal = conf != null ? conf.getTrimmed(key, defVal) : defVal; String propval = System.getProperty(key); - return StringUtils.isNotEmpty(propval) && !UNSET_PROPERTY.equals(propval) + return isNotEmpty(propval) && !UNSET_PROPERTY.equals(propval) ? propval : confVal; } + /** + * Get the test CSV file; assume() that it is not empty. + * @param conf test configuration + * @return test file. + */ + public static String getCSVTestFile(Configuration conf) { + String csvFile = conf + .getTrimmed(KEY_CSVTEST_FILE, DEFAULT_CSVTEST_FILE); + Assume.assumeTrue("CSV test file is not the default", + isNotEmpty(csvFile)); + return csvFile; + } + + /** + * Get the test CSV path; assume() that it is not empty. + * @param conf test configuration + * @return test file as a path. 
+ */ + public static Path getCSVTestPath(Configuration conf) { + return new Path(getCSVTestFile(conf)); + } + + /** + * Get the test CSV file; assume() that it is not modified (i.e. we haven't + * switched to a new storage infrastructure where the bucket is no longer + * read only). + * @return test file. + * @param conf test configuration + */ + public static String getLandsatCSVFile(Configuration conf) { + String csvFile = getCSVTestFile(conf); + Assume.assumeTrue("CSV test file is not the default", + DEFAULT_CSVTEST_FILE.equals(csvFile)); + return csvFile; + } + /** + * Get the test CSV file; assume() that it is not modified (i.e. we haven't + * switched to a new storage infrastructure where the bucket is no longer + * read only). + * @param conf test configuration + * @return test file as a path. + */ + public static Path getLandsatCSVPath(Configuration conf) { + return new Path(getLandsatCSVFile(conf)); + } + /** * Verify the class of an exception. If it is not as expected, rethrow it. * Comparison is on the exact class, not subclass-of inference as diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestS3AAWSCredentialsProvider.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestS3AAWSCredentialsProvider.java index e8467e781d..3822ee781d 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestS3AAWSCredentialsProvider.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestS3AAWSCredentialsProvider.java @@ -101,8 +101,7 @@ public void testInstantiationChain() throws Throwable { TemporaryAWSCredentialsProvider.NAME + ", \t" + SimpleAWSCredentialsProvider.NAME + " ,\n " + AnonymousAWSCredentialsProvider.NAME); - Path testFile = new Path( - conf.getTrimmed(KEY_CSVTEST_FILE, DEFAULT_CSVTEST_FILE)); + Path testFile = getCSVTestPath(conf); AWSCredentialProviderList list = createAWSCredentialProviderSet( testFile.toUri(), conf); diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/AbstractCommitITest.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/AbstractCommitITest.java index 246bf9d613..69a6ed662b 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/AbstractCommitITest.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/commit/AbstractCommitITest.java @@ -178,7 +178,7 @@ public void setup() throws Exception { * @return fork ID string in a format parseable by Jobs * @throws Exception failure */ - protected String randomJobId() throws Exception { + public static String randomJobId() throws Exception { String testUniqueForkId = System.getProperty(TEST_UNIQUE_FORK_ID, "0001"); int l = testUniqueForkId.length(); String trailingDigits = testUniqueForkId.substring(l - 4, l); diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/AbstractS3GuardToolTestBase.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/AbstractS3GuardToolTestBase.java index e10c85b965..71e9975c73 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/AbstractS3GuardToolTestBase.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/AbstractS3GuardToolTestBase.java @@ -24,7 +24,6 @@ import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStreamReader; -import java.io.PrintStream; import java.net.URI; import java.util.Collection; import java.util.HashSet; @@ -37,7 +36,6 @@ import 
org.apache.hadoop.util.StopWatch; import com.google.common.base.Preconditions; import org.apache.hadoop.fs.FileSystem; -import org.junit.Assume; import org.junit.Test; import org.apache.hadoop.conf.Configuration; @@ -64,6 +62,7 @@ import static org.apache.hadoop.fs.s3a.s3guard.S3GuardTool.E_NO_METASTORE_OR_FILESYSTEM; import static org.apache.hadoop.fs.s3a.s3guard.S3GuardTool.E_USAGE; import static org.apache.hadoop.fs.s3a.s3guard.S3GuardTool.SUCCESS; +import static org.apache.hadoop.fs.s3a.s3guard.S3GuardToolTestHelper.exec; import static org.apache.hadoop.test.LambdaTestUtils.intercept; /** @@ -88,11 +87,21 @@ protected static void expectResult(int expected, assertEquals(message, expected, tool.run(args)); } - protected static void expectSuccess( + /** + * Expect a command to succeed. + * @param message any extra text to include in the assertion error message + * @param tool tool to run + * @param args arguments to the command + * @return the output of any successful run + * @throws Exception failure + */ + protected static String expectSuccess( String message, S3GuardTool tool, String... args) throws Exception { - assertEquals(message, SUCCESS, tool.run(args)); + ByteArrayOutputStream buf = new ByteArrayOutputStream(); + exec(SUCCESS, message, tool, buf, args); + return buf.toString(); } /** @@ -450,58 +459,6 @@ public void testInitFailsIfNoBucketNameOrDDBTableSet() throws Exception { () -> run(S3GuardTool.Init.NAME)); } - /** - * Get the test CSV file; assume() that it is not modified (i.e. we haven't - * switched to a new storage infrastructure where the bucket is no longer - * read only). - * @return test file. - */ - protected String getLandsatCSVFile() { - String csvFile = getConfiguration() - .getTrimmed(KEY_CSVTEST_FILE, DEFAULT_CSVTEST_FILE); - Assume.assumeTrue("CSV test file is not the default", - DEFAULT_CSVTEST_FILE.equals(csvFile)); - return csvFile; - } - - /** - * Execute a command, returning the buffer if the command actually completes. - * If an exception is raised the output is logged instead. - * @param cmd command - * @param args argument list - * @throws Exception on any failure - */ - public String exec(S3GuardTool cmd, String...args) throws Exception { - ByteArrayOutputStream buf = new ByteArrayOutputStream(); - try { - exec(cmd, buf, args); - return buf.toString(); - } catch (AssertionError e) { - throw e; - } catch (Exception e) { - LOG.error("Command {} failed: \n{}", cmd, buf); - throw e; - } - } - - /** - * Execute a command, saving the output into the buffer. 
- * @param cmd command - * @param buf buffer to use for tool output (not SLF4J output) - * @param args argument list - * @throws Exception on any failure - */ - protected void exec(S3GuardTool cmd, ByteArrayOutputStream buf, String...args) - throws Exception { - LOG.info("exec {}", (Object) args); - int r = 0; - try(PrintStream out =new PrintStream(buf)) { - r = cmd.run(args, out); - out.flush(); - } - assertEquals("Command " + cmd + " failed\n"+ buf, 0, r); - } - @Test public void testDiffCommand() throws Exception { @@ -537,7 +494,7 @@ protected void exec(S3GuardTool cmd, ByteArrayOutputStream buf, String...args) ByteArrayOutputStream buf = new ByteArrayOutputStream(); S3GuardTool.Diff cmd = new S3GuardTool.Diff(fs.getConf()); cmd.setStore(ms); - exec(cmd, buf, "diff", "-meta", DYNAMODB_TABLE, testPath.toString()); + exec(0, "", cmd, buf, "diff", "-meta", DYNAMODB_TABLE, testPath.toString()); Set actualOnS3 = new HashSet<>(); Set actualOnMS = new HashSet<>(); diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestS3GuardToolDynamoDB.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestS3GuardToolDynamoDB.java index 97173feeda..aa88b0b118 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestS3GuardToolDynamoDB.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestS3GuardToolDynamoDB.java @@ -49,7 +49,7 @@ import static org.apache.hadoop.fs.s3a.Constants.S3GUARD_DDB_TABLE_TAG; import static org.apache.hadoop.fs.s3a.s3guard.DynamoDBMetadataStore.*; import static org.apache.hadoop.fs.s3a.s3guard.S3GuardTool.*; -import static org.apache.hadoop.test.LambdaTestUtils.intercept; +import static org.apache.hadoop.fs.s3a.s3guard.S3GuardToolTestHelper.exec; /** * Test S3Guard related CLI commands against DynamoDB. 
diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestS3GuardToolLocal.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestS3GuardToolLocal.java index 1ee3cde80d..6a4d45e9ea 100644 --- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestS3GuardToolLocal.java +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/ITestS3GuardToolLocal.java @@ -40,7 +40,9 @@ import org.apache.hadoop.fs.s3a.S3AFileSystem; import static org.apache.hadoop.fs.s3a.MultipartTestUtils.*; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.getLandsatCSVFile; import static org.apache.hadoop.fs.s3a.s3guard.S3GuardTool.*; +import static org.apache.hadoop.fs.s3a.s3guard.S3GuardToolTestHelper.exec; import static org.apache.hadoop.test.LambdaTestUtils.intercept; /** @@ -97,7 +99,7 @@ public void testImportCommand() throws Exception { public void testDestroyBucketExistsButNoTable() throws Throwable { run(Destroy.NAME, "-meta", LOCAL_METADATA, - getLandsatCSVFile()); + getLandsatCSVFile(getConfiguration())); } @Test @@ -161,7 +163,7 @@ public void testInitTwice() throws Throwable { public void testLandsatBucketUnguarded() throws Throwable { run(BucketInfo.NAME, "-" + BucketInfo.UNGUARDED_FLAG, - getLandsatCSVFile()); + getLandsatCSVFile(getConfiguration())); } @Test @@ -169,14 +171,15 @@ public void testLandsatBucketRequireGuarded() throws Throwable { runToFailure(E_BAD_STATE, BucketInfo.NAME, "-" + BucketInfo.GUARDED_FLAG, - ITestS3GuardToolLocal.this.getLandsatCSVFile()); + getLandsatCSVFile( + ITestS3GuardToolLocal.this.getConfiguration())); } @Test public void testLandsatBucketRequireUnencrypted() throws Throwable { run(BucketInfo.NAME, "-" + BucketInfo.ENCRYPTION_FLAG, "none", - getLandsatCSVFile()); + getLandsatCSVFile(getConfiguration())); } @Test @@ -184,7 +187,8 @@ public void testLandsatBucketRequireEncrypted() throws Throwable { runToFailure(E_BAD_STATE, BucketInfo.NAME, "-" + BucketInfo.ENCRYPTION_FLAG, - "AES256", ITestS3GuardToolLocal.this.getLandsatCSVFile()); + "AES256", getLandsatCSVFile( + ITestS3GuardToolLocal.this.getConfiguration())); } @Test @@ -367,7 +371,7 @@ private void uploadCommandAssertCount(S3AFileSystem fs, String options[], allOptions.add(String.valueOf(ageSeconds)); } allOptions.add(path.toString()); - exec(cmd, buf, allOptions.toArray(new String[0])); + exec(0, "", cmd, buf, allOptions.toArray(new String[0])); try (BufferedReader reader = new BufferedReader( new InputStreamReader(new ByteArrayInputStream(buf.toByteArray())))) { diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardToolTestHelper.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardToolTestHelper.java new file mode 100644 index 0000000000..f22aa3606b --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/s3guard/S3GuardToolTestHelper.java @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a.s3guard;
+
+import java.io.ByteArrayOutputStream;
+import java.io.PrintStream;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Helper class for tests which make CLI invocations of the S3Guard tools.
+ * That's {@link AbstractS3GuardToolTestBase} and others.
+ */
+public final class S3GuardToolTestHelper {
+
+  private static final Logger LOG = LoggerFactory.getLogger(
+      S3GuardToolTestHelper.class);
+
+  private S3GuardToolTestHelper() {
+  }
+
+  /**
+   * Execute a command, returning the buffer if the command actually completes.
+   * If an exception is raised the output is logged instead.
+   * @param cmd command
+   * @param args argument list
+   * @return the command output
+   * @throws Exception on any failure
+   */
+  public static String exec(S3GuardTool cmd, String... args) throws Exception {
+    ByteArrayOutputStream buf = new ByteArrayOutputStream();
+    try {
+      exec(0, "", cmd, buf, args);
+      return buf.toString();
+    } catch (AssertionError e) {
+      throw e;
+    } catch (Exception e) {
+      LOG.error("Command {} failed: \n{}", cmd, buf);
+      throw e;
+    }
+  }
+
+  /**
+   * Execute a command, saving the output into the buffer.
+   * @param expectedResult expected result of the command.
+   * @param errorText error text to include in the assertion.
+   * @param cmd command
+   * @param buf buffer to use for tool output (not SLF4J output)
+   * @param args argument list
+   * @throws Exception on any failure
+   */
+  public static void exec(final int expectedResult,
+      final String errorText,
+      final S3GuardTool cmd,
+      final ByteArrayOutputStream buf,
+      final String... args)
+      throws Exception {
+    LOG.info("exec {}", (Object) args);
+    int r;
+    try (PrintStream out = new PrintStream(buf)) {
+      r = cmd.run(args, out);
+      out.flush();
+    }
+    if (expectedResult != r) {
+      String message = (errorText.isEmpty() ? "" : (errorText + ": "))
+          + "Command " + cmd + " failed\n" + buf;
+      assertEquals(message, expectedResult, r);
+    }
+  }
+
+}
diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/AbstractS3SelectTest.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/AbstractS3SelectTest.java
new file mode 100644
index 0000000000..18138a616b
--- /dev/null
+++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/AbstractS3SelectTest.java
@@ -0,0 +1,746 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.select; + +import java.io.BufferedReader; +import java.io.EOFException; +import java.io.IOException; +import java.io.InputStreamReader; +import java.time.Duration; +import java.time.ZonedDateTime; +import java.time.format.DateTimeFormatter; +import java.util.ArrayList; +import java.util.List; +import java.util.NoSuchElementException; +import java.util.Optional; +import java.util.Scanner; +import java.util.function.Consumer; + +import org.junit.Assume; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.CommonConfigurationKeys; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileContext; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.FutureDataInputStreamBuilder; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.StreamCapabilities; +import org.apache.hadoop.fs.s3a.AWSServiceIOException; +import org.apache.hadoop.fs.s3a.AbstractS3ATestBase; +import org.apache.hadoop.fs.s3a.S3AFileSystem; +import org.apache.hadoop.fs.s3a.commit.AbstractCommitITest; +import org.apache.hadoop.fs.s3a.commit.DurationInfo; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.compress.PassthroughCodec; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapreduce.MRJobConfig; +import org.apache.hadoop.mapreduce.RecordReader; +import org.apache.hadoop.mapreduce.TaskAttemptContext; +import org.apache.hadoop.mapreduce.TaskAttemptID; +import org.apache.hadoop.mapreduce.lib.input.FileSplit; +import org.apache.hadoop.mapreduce.lib.input.LineRecordReader; +import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl; + +import static org.apache.hadoop.fs.impl.FutureIOSupport.awaitFuture; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.getLandsatCSVPath; +import static org.apache.hadoop.fs.s3a.select.CsvFile.ALL_QUOTES; +import static org.apache.hadoop.fs.s3a.select.SelectConstants.*; +import static org.apache.hadoop.test.LambdaTestUtils.intercept; + +/** + * Superclass for S3 Select tests. + * A lot of the work here goes into creating and querying a simple CSV test + * format, with various datatypes which can be used in type-casting queries. + *
    + * 1  "ID": index of the row
    + * 2  "date": date as ISO 8601
    + * 3  "timestamp": timestamp in seconds of epoch
    + * 4  "name", entry-$row
    + * 5  "odd", odd/even as boolean. True means odd,
    + * 6  "oddint", odd/even as int : 1 for odd, 0 for even
    + * 7  "oddrange": odd/even as 1 for odd, -1 for even
    + * 
    + */ +public abstract class AbstractS3SelectTest extends AbstractS3ATestBase { + + /** + * Number of columns in the CSV file: {@value}. + */ + public static final int CSV_COLUMN_COUNT = 7; + + protected static final String TRUE = q("TRUE"); + + protected static final String FALSE = q("FALSE"); + + public static final String SELECT_EVERYTHING = "SELECT * FROM S3OBJECT s"; + + public static final String SELECT_EVEN_ROWS_NO_HEADER = + "SELECT * FROM S3OBJECT s WHERE s._5 = " + TRUE; + public static final String SELECT_ODD_ROWS + = "SELECT s.name FROM S3OBJECT s WHERE s.odd = " + TRUE; + + public static final String SELECT_ODD_ENTRIES + = "SELECT * FROM S3OBJECT s WHERE s.odd = `TRUE`"; + + public static final String SELECT_ODD_ENTRIES_BOOL + = "SELECT * FROM S3OBJECT s WHERE CAST(s.odd AS BOOL) = TRUE"; + + public static final String SELECT_ODD_ENTRIES_INT + = "SELECT * FROM S3OBJECT s WHERE CAST(s.\"oddint\" AS INT) = 1"; + + public static final String SELECT_ODD_ENTRIES_DECIMAL + = "SELECT * FROM S3OBJECT s WHERE CAST(s.\"oddint\" AS DECIMAL) = 1"; + + /** + * Playing with timestamps: {@value}. + */ + public static final String SELECT_TO_DATE + = "SELECT\n" + + "CAST(s.\"date\" AS TIMESTAMP)\n" + + "FROM S3OBJECT s"; + + + /** + * How many rows are being generated. + */ + protected static final int ALL_ROWS_COUNT = 10; + + /** + * Row count of all rows + header. + */ + protected static final int ALL_ROWS_COUNT_WITH_HEADER = ALL_ROWS_COUNT + 1; + + /** + * Number of odd rows expected: {@value}. + */ + protected static final int ODD_ROWS_COUNT = ALL_ROWS_COUNT / 2; + + /** + * Number of even rows expected: {@value}. + * This is the same as the odd row count; it's separate just to + * be consistent on tests which select even results. + */ + protected static final int EVEN_ROWS_COUNT = ODD_ROWS_COUNT; + + protected static final String ENTRY_0001 = "\"entry-0001\""; + + protected static final String ENTRY_0002 = "\"entry-0002\""; + + /** + * Path to the landsat csv.gz file. + */ + private Path landsatGZ; + + /** + * The filesystem with the landsat data. + */ + private S3AFileSystem landsatFS; + + + // A random task attempt id for testing. + private String attempt0; + + private TaskAttemptID taskAttempt0; + + private String jobId; + + /** + * Base CSV file is headers. + *
    +   * 1  "ID": index of the row
    +   * 2  "date": date as Date.toString
    +   * 3  "timestamp": timestamp in seconds of epoch
    +   * 4  "name", entry-$row
    +   * 5  "odd", odd/even as boolean
    +   * 6  "oddint", odd/even as int : 1 for odd, 0 for even
    +   * 7  "oddrange": odd/even as 1 for odd, -1 for even
    +   * 
    + * @param fs filesystem + * @param path path to write + * @param header should the standard header be printed? + * @param quoteHeaderPolicy what the header quote policy is. + * @param quoteRowPolicy what the row quote policy is. + * @param rows number of rows + * @param separator column separator + * @param eol end of line characters + * @param quote quote char + * @param footer callback to run after the main CSV file is written + * @throws IOException IO failure. + */ + public static void createStandardCsvFile( + final FileSystem fs, + final Path path, + final boolean header, + final long quoteHeaderPolicy, + final long quoteRowPolicy, + final int rows, + final String separator, + final String eol, + final String quote, + final Consumer footer) throws IOException { + try (CsvFile csv = new CsvFile(fs, + path, + true, + separator, + eol, + quote)) { + + if (header) { + writeStandardHeader(csv, quoteHeaderPolicy); + } + DateTimeFormatter formatter + = DateTimeFormatter.ISO_OFFSET_DATE_TIME; + ZonedDateTime timestamp = ZonedDateTime.now(); + Duration duration = Duration.ofHours(20); + // loop is at 1 for use in counters and flags + for (int i = 1; i <= rows; i++) { + // flip the odd flags + boolean odd = (i & 1) == 1; + // and move the timestamp back + timestamp = timestamp.minus(duration); + csv.row(quoteRowPolicy, + i, + timestamp.format(formatter), + timestamp.toEpochSecond(), + String.format("entry-%04d", i), + odd ? "TRUE" : "FALSE", + odd ? 1 : 0, + odd ? 1 : -1 + ); + } + // write the footer + footer.accept(csv); + } + } + + /** + * Write out the standard header to a CSV file. + * @param csv CSV file to use. + * @param quoteHeaderPolicy quote policy. + * @return the input file. + * @throws IOException failure to write. + */ + private static CsvFile writeStandardHeader(final CsvFile csv, + final long quoteHeaderPolicy) throws IOException { + return csv.row(quoteHeaderPolicy, + "id", + "date", + "timestamp", + "name", + "odd", + "oddint", + "oddrange"); + } + + /** + * Verify that an exception has a specific error code. + * if not: an assertion is raised containing the original value. + * @param code expected code. + * @param ex exception caught + * @throws AssertionError on a mismatch + */ + protected static AWSServiceIOException verifyErrorCode(final String code, + final AWSServiceIOException ex) { + logIntercepted(ex); + if (!code.equals(ex.getErrorCode())) { + throw new AssertionError("Expected Error code" + code + + " actual " + ex.getErrorCode(), + ex); + } + return ex; + } + + /** + * Probe for a filesystem instance supporting S3 Select. + * @param filesystem filesystem + * @return true iff the filesystem supports S3 Select. + */ + boolean isSelectAvailable(final FileSystem filesystem) { + return filesystem instanceof StreamCapabilities + && ((StreamCapabilities) filesystem) + .hasCapability(S3_SELECT_CAPABILITY); + } + + /** + * Setup: requires select to be available. 
+ */ + @Override + public void setup() throws Exception { + super.setup(); + Assume.assumeTrue("S3 Select is not enabled on " + + getFileSystem().getUri(), + isSelectAvailable(getFileSystem())); + Configuration conf = getConfiguration(); + landsatGZ = getLandsatCSVPath(conf); + landsatFS = (S3AFileSystem) landsatGZ.getFileSystem(conf); + Assume.assumeTrue("S3 Select is not enabled on " + landsatFS.getUri(), + isSelectAvailable(landsatFS)); + // create some job info + jobId = AbstractCommitITest.randomJobId(); + attempt0 = "attempt_" + jobId + "_m_000000_0"; + taskAttempt0 = TaskAttemptID.forName(attempt0); + } + + /** + * Build the SQL statement, using String.Format rules. + * @param template template + * @param args arguments for the template + * @return the template to use + */ + protected static String sql( + final String template, + final Object... args) { + return args.length > 0 ? String.format(template, args) : template; + } + + /** + * Quote a constant with the SQL quote logic. + * @param c constant + * @return quoted constant + */ + protected static String q(String c) { + return '\'' + c + '\''; + } + + /** + * Select from a source file. + * @param fileSystem FS. + * @param source source file. + * @param conf config for the select call. + * @param sql template for a formatted SQL request. + * @param args arguments for the formatted request. + * @return the input stream. + * @throws IOException failure + */ + protected FSDataInputStream select( + final FileSystem fileSystem, + final Path source, + final Configuration conf, + final String sql, + final Object... args) + throws IOException { + String expression = sql(sql, args); + describe("Execution Select call: %s", expression); + FutureDataInputStreamBuilder builder = + fileSystem.openFile(source) + .must(SELECT_SQL, expression); + // propagate all known options + for (String key : InternalSelectConstants.SELECT_OPTIONS) { + String value = conf.get(key); + if (value != null) { + builder.must(key, value); + } + } + return awaitFuture(builder.build()); + } + + /** + * Select from a source file via the file context API. + * @param fc file context + * @param source source file. + * @param conf config for the select call. + * @param sql template for a formatted SQL request. + * @param args arguments for the formatted request. + * @return the input stream. + * @throws IOException failure + */ + protected FSDataInputStream select( + final FileContext fc, + final Path source, + final Configuration conf, + final String sql, + final Object... args) + throws IOException { + String expression = sql(sql, args); + describe("Execution Select call: %s", expression); + FutureDataInputStreamBuilder builder = fc.openFile(source) + .must(SELECT_SQL, expression); + // propagate all known options + InternalSelectConstants.SELECT_OPTIONS.forEach((key) -> + Optional.ofNullable(conf.get(key)) + .map((v) -> builder.must(key, v))); + return awaitFuture(builder.build()); + } + + /** + * Parse a selection to lines; log at info. + * @param selection selection input + * @return a list of lines. + * @throws IOException if raised during the read. + */ + protected List parseToLines(final FSDataInputStream selection) + throws IOException { + return parseToLines(selection, getMaxLines()); + } + + /** + * Enable the passthrough codec for a job, with the given extension. 
+ * @param conf configuration to update + * @param extension extension to use + */ + protected void enablePassthroughCodec(final Configuration conf, + final String extension) { + conf.set(CommonConfigurationKeys.IO_COMPRESSION_CODECS_KEY, + PassthroughCodec.CLASSNAME); + conf.set(PassthroughCodec.OPT_EXTENSION, extension); + } + + /** + * Override if a test suite is likely to ever return more lines. + * @return the max number for parseToLines/1 + */ + protected int getMaxLines() { + return 100; + } + + /** + * Parse a selection to lines; log at info. + * @param selection selection input + * @param maxLines maximum number of lines. + * @return a list of lines. + * @throws IOException if raised during the read. + */ + protected List parseToLines(final FSDataInputStream selection, + int maxLines) + throws IOException { + List result = new ArrayList<>(); + String stats; + // the scanner assumes that any IOE => EOF; we don't want + // that and so will check afterwards. + try (Scanner scanner = new Scanner( + new BufferedReader(new InputStreamReader(selection)))) { + scanner.useDelimiter(CSV_INPUT_RECORD_DELIMITER_DEFAULT); + while (maxLines > 0) { + try { + String l = scanner.nextLine(); + LOG.info("{}", l); + result.add(l); + maxLines--; + } catch (NoSuchElementException e) { + // EOL or an error + break; + } + } + stats = selection.toString(); + describe("Result line count: %s\nStatistics\n%s", + result.size(), stats); + // look for any raised error. + IOException ioe = scanner.ioException(); + if (ioe != null && !(ioe instanceof EOFException)) { + throw ioe; + } + } + return result; + } + + /** + * Verify the selection count; return the original list. + * If there's a mismatch, the whole list is logged at error, then + * an assertion raised. + * @param expected expected value. + * @param expression expression -for error messages. + * @param selection selected result. + * @return the input list. + */ + protected List verifySelectionCount( + final int expected, + final String expression, + final List selection) { + return verifySelectionCount(expected, expected, expression, selection); + } + + /** + * Verify the selection count is within a given range; + * return the original list. + * If there's a mismatch, the whole list is logged at error, then + * an assertion raised. + * @param min min value (exclusive). + * @param max max value (exclusive). If -1: no maximum. + * @param expression expression -for error messages. + * @param selection selected result. + * @return the input list. + */ + protected List verifySelectionCount( + final int min, + final int max, + final String expression, + final List selection) { + int size = selection.size(); + if (size < min || (max > -1 && size > max)) { + // mismatch: log and then fail + String listing = prepareToPrint(selection); + LOG.error("\n{} => \n{}", expression, listing); + fail("row count from select call " + expression + + " is out of range " + min + " to " + max + + ": " + size + + " \n" + listing); + } + return selection; + } + + /** + * Do whatever is needed to prepare a string for logging. + * @param selection selection + * @return something printable. + */ + protected String prepareToPrint(final List selection) { + return String.join("\n", selection); + } + + /** + * Create "the standard" CSV file with the default row count. + * @param fs filesystem + * @param path path to write + * @param quoteRowPolicy what the row quote policy is. + * @throws IOException IO failure. 
+ */ + protected void createStandardCsvFile( + final FileSystem fs, + final Path path, + final long quoteRowPolicy) + throws IOException { + createStandardCsvFile( + fs, path, + true, + ALL_QUOTES, + quoteRowPolicy, + ALL_ROWS_COUNT, + ",", + "\n", + "\"", + c -> {}); + } + + /** + * Set an MR Job input option. + * @param conf configuration + * @param key key to set + * @param val value + */ + void inputOpt(Configuration conf, String key, String val) { + conf.set(MRJobConfig.INPUT_FILE_OPTION_PREFIX + key, val); + } + + /** + * Set a mandatory MR Job input option. + * @param conf configuration + * @param key key to set + * @param val value + */ + void inputMust(Configuration conf, String key, String val) { + conf.set(MRJobConfig.INPUT_FILE_MANDATORY_PREFIX + key, + val); + } + + /** + * Reads lines through a v2 RecordReader, as if it were part of a + * MRv2 job. + * @param conf job conf + * @param path path to query + * @param sql sql to add to the configuration. + * @param initialCapacity capacity of the read + * @param reader reader: this is closed after the read + * @return the selected lines. + * @throws Exception failure + */ + protected List readRecords(JobConf conf, + Path path, + String sql, + RecordReader reader, + int initialCapacity) throws Exception { + + inputMust(conf, SELECT_SQL, sql); + List lines = new ArrayList<>(initialCapacity); + try { + reader.initialize( + createSplit(conf, path), + createTaskAttemptContext(conf)); + while (reader.nextKeyValue()) { + lines.add(reader.getCurrentValue().toString()); + } + } finally { + reader.close(); + } + return lines; + } + /** + * Reads lines through a v1 RecordReader, as if it were part of a + * MRv1 job. + * @param conf job conf + * @param reader reader: this is closed after the read + * @param initialCapacity capacity of the read + * @return the selected lines. + * @throws Exception failure + */ + protected List readRecordsV1(JobConf conf, + org.apache.hadoop.mapred.RecordReader reader, + K key, + V value, + int initialCapacity) throws Exception { + List lines = new ArrayList<>(initialCapacity); + try { + while (reader.next(key, value)) { + lines.add(value.toString()); + } + } finally { + reader.close(); + } + return lines; + } + + /** + * Create a task attempt context for a job, creating a random JobID to + * do this. + * @param conf job configuration. + * @return a new task attempt context containing the job conf + * @throws Exception failure. + */ + protected TaskAttemptContext createTaskAttemptContext(final JobConf conf) + throws Exception { + String id = AbstractCommitITest.randomJobId(); + return new TaskAttemptContextImpl(conf, + TaskAttemptID.forName("attempt_" + id + "_m_000000_0")); + } + + /** + * Create an MRv2 file input split. + * @param conf job configuration + * @param path path to file + * @return the split + * @throws IOException problems reading the file. + */ + protected FileSplit createSplit(final JobConf conf, final Path path) + throws IOException { + FileSystem fs = path.getFileSystem(conf); + FileStatus status = fs.getFileStatus(path); + return new FileSplit(path, 0, status.getLen(), + new String[]{"localhost"}); + } + + /** + * Create an MRv1 file input split. + * @param conf job configuration + * @param path path to file + * @return the split + * @throws IOException problems reading the file. 
+ */ + protected org.apache.hadoop.mapred.FileSplit + createSplitV1(final JobConf conf, final Path path) + throws IOException { + FileSystem fs = path.getFileSystem(conf); + FileStatus status = fs.getFileStatus(path); + return new org.apache.hadoop.mapred.FileSplit(path, 0, status.getLen(), + new String[]{"localhost"}); + } + + /** + * Create a v2 line record reader expecting newlines as the EOL marker. + * @return a reader + */ + protected RecordReader createLineRecordReader() { + return new LineRecordReader(new byte[]{'\n'}); + } + + /** + * Create a v1 line record reader. + * @return a reader + */ + protected org.apache.hadoop.mapred.RecordReader + createLineRecordReaderV1( + final JobConf conf, + final Path path) throws IOException { + return new org.apache.hadoop.mapred.LineRecordReader( + conf, createSplitV1(conf, path)); + } + + /** + * Get the path to the landsat file. + * @return the landsat CSV.GZ path. + */ + protected Path getLandsatGZ() { + return landsatGZ; + } + + /** + * Get the filesystem for the landsat file. + * @return the landsat FS. + */ + protected S3AFileSystem getLandsatFS() { + return landsatFS; + } + + /** + * Perform a seek: log duration of the operation. + * @param stream stream to seek. + * @param target target position. + * @throws IOException on an error + */ + protected void seek(final FSDataInputStream stream, final long target) + throws IOException { + try(DurationInfo ignored = + new DurationInfo(LOG, "Seek to %d", target)) { + stream.seek(target); + } + } + + /** + * Execute a seek so far past the EOF that it will be rejected. + * If the seek did not fail, the exception raised includes the toString() + * value of the stream. + * @param seekStream stream to seek in. + * @param newpos new position + * @return the EOF Exception raised. + * @throws Exception any other exception. + */ + protected EOFException expectSeekEOF(final FSDataInputStream seekStream, + final int newpos) throws Exception { + return intercept(EOFException.class, + () -> { + seek(seekStream, newpos); + // return this for the test failure reports. + return "Stream after seek to " + newpos + ": " + seekStream; + }); + } + + public String getAttempt0() { + return attempt0; + } + + public TaskAttemptID getTaskAttempt0() { + return taskAttempt0; + } + + public String getJobId() { + return jobId; + } + + /** + * Logs intercepted exceptions. + * This generates the stack traces for the documentation. + * @param ex exception + * @return the exception passed in (for chaining) + */ + protected static T logIntercepted(T ex) { + LOG.info("Intercepted Exception is ", ex); + return ex; + } +} diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/CsvFile.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/CsvFile.java new file mode 100644 index 0000000000..06e6d2a78a --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/CsvFile.java @@ -0,0 +1,138 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a.select;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.io.PrintWriter;
+
+import com.google.common.base.Preconditions;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+/**
+ * Writer for generating test CSV files.
+ *
+ * Quotes are managed by passing in a long whose specific bits control
+ * whether or not each column is quoted, bit 0 for column 0, etc.
+ */
+class CsvFile implements Closeable {
+
+
+  /** Constant to quote all columns. */
+  public static final long ALL_QUOTES = 0x7fffffff;
+
+  /** Quote nothing: {@value}. */
+  public static final long NO_QUOTES = 0;
+
+  private final Path path;
+
+  private final PrintWriter out;
+
+  private final String separator;
+
+  private final String eol;
+
+  private final String quote;
+
+  CsvFile(final FileSystem fs,
+      final Path path,
+      boolean overwrite,
+      final String separator,
+      final String eol,
+      final String quote) throws IOException {
+    this.path = path;
+    this.separator = Preconditions.checkNotNull(separator);
+    this.eol = Preconditions.checkNotNull(eol);
+    this.quote = Preconditions.checkNotNull(quote);
+    out = new PrintWriter(fs.create(path, overwrite));
+  }
+
+
+  /**
+   * Close the file, if not already done.
+   * @throws IOException on a failure.
+   */
+  @Override
+  public synchronized void close() throws IOException {
+    if (out != null) {
+      out.close();
+    }
+  }
+
+  public Path getPath() {
+    return path;
+  }
+
+  public String getSeparator() {
+    return separator;
+  }
+
+  public String getEol() {
+    return eol;
+  }
+
+  /**
+   * Write a row.
+   * Entries are quoted if the bit for that column is true.
+   * @param quotes quote policy: every bit defines the rule for that element
+   * @param columns columns to write
+   * @return self for ease of chaining.
+   */
+  public CsvFile row(long quotes, Object... columns) {
+    for (int i = 0; i < columns.length; i++) {
+      if (i != 0) {
+        out.write(separator);
+      }
+      boolean toQuote = (quotes & 1) == 1;
+      // unsigned right shift to make next column flag @ position 0
+      quotes = quotes >>> 1;
+      if (toQuote) {
+        out.write(quote);
+      }
+      out.write(columns[i].toString());
+      if (toQuote) {
+        out.write(quote);
+      }
+    }
+    out.write(eol);
+    return this;
+  }
+
+  /**
+   * Write a line.
+   * @param line line to print
+   * @return self for ease of chaining.
+   */
+  public CsvFile line(String line) {
+    out.write(line);
+    out.write(eol);
+    return this;
+  }
+
+  /**
+   * Get the output stream.
+   * @return the stream.
+   */
+  public PrintWriter getOut() {
+    return out;
+  }
+}
diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3Select.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3Select.java
new file mode 100644
index 0000000000..5fe4e2bb67
--- /dev/null
+++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3Select.java
@@ -0,0 +1,967 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.select; + +import java.io.BufferedReader; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.List; +import java.util.concurrent.CompletableFuture; + +import org.junit.Assume; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FSExceptionMessages; +import org.apache.hadoop.fs.FileContext; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.FutureDataInputStreamBuilder; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.PathIOException; +import org.apache.hadoop.fs.Seekable; +import org.apache.hadoop.fs.contract.ContractTestUtils; +import org.apache.hadoop.fs.impl.AbstractFSBuilderImpl; +import org.apache.hadoop.fs.s3a.AWSBadRequestException; +import org.apache.hadoop.fs.s3a.AWSServiceIOException; +import org.apache.hadoop.fs.s3a.Constants; +import org.apache.hadoop.fs.s3a.S3AFileSystem; +import org.apache.hadoop.fs.s3a.S3AInputStream; +import org.apache.hadoop.fs.s3a.S3AInstrumentation; +import org.apache.hadoop.fs.s3a.S3ATestUtils; +import org.apache.hadoop.fs.s3a.Statistic; +import org.apache.hadoop.fs.s3a.commit.DurationInfo; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapreduce.InputSplit; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.JobContext; +import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; +import org.apache.hadoop.mapreduce.lib.input.LineRecordReader; +import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; +import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; +import org.apache.hadoop.mapreduce.task.JobContextImpl; + +import static org.apache.hadoop.fs.s3a.Constants.INPUT_FADVISE; +import static org.apache.hadoop.fs.s3a.Constants.INPUT_FADV_NORMAL; +import static org.apache.hadoop.fs.s3a.Constants.READAHEAD_RANGE; +import static org.apache.hadoop.fs.s3a.select.CsvFile.ALL_QUOTES; +import static org.apache.hadoop.fs.s3a.select.SelectBinding.expandBackslashChars; +import static org.apache.hadoop.fs.s3a.select.SelectConstants.*; +import static org.apache.hadoop.test.LambdaTestUtils.intercept; +import static org.apache.hadoop.test.LambdaTestUtils.interceptFuture; +import static org.hamcrest.CoreMatchers.hasItem; +import static org.hamcrest.CoreMatchers.not; +import static org.hamcrest.collection.IsCollectionWithSize.hasSize; + +/** + * Test the S3 Select feature with some basic SQL Commands. + * Executed if the destination store declares its support for the feature. 
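+ * <p>
+ * As a sketch of the API under test (using the option names exercised
+ * in the tests below):
+ * <pre>
+ *   FutureDataInputStreamBuilder builder = fs.openFile(csvPath)
+ *       .must("fs.s3a.select.sql", "SELECT * FROM S3OBJECT s");
+ *   try (FSDataInputStream in = builder.build().get()) {
+ *     // read the selected records from the stream
+ *   }
+ * </pre>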
+ */ +public class ITestS3Select extends AbstractS3SelectTest { + + private static final Logger LOG = + LoggerFactory.getLogger(ITestS3Select.class); + + public static final String E_CAST_FAILED = "CastFailed"; + + public static final String E_PARSE_INVALID_PATH_COMPONENT + = "ParseInvalidPathComponent"; + + public static final String E_INVALID_TABLE_ALIAS = "InvalidTableAlias"; + + private Configuration selectConf; + + /** well formed CSV. */ + private Path csvPath; + + /** CSV file with fewer columns than expected, all fields parse badly. */ + private Path brokenCSV; + + @Override + public void setup() throws Exception { + super.setup(); + Assume.assumeTrue("S3 Select is not enabled", + getFileSystem().hasCapability(S3_SELECT_CAPABILITY)); + csvPath = path(getMethodName() + ".csv"); + selectConf = new Configuration(false); + selectConf.setBoolean(SELECT_ERRORS_INCLUDE_SQL, true); + createStandardCsvFile(getFileSystem(), csvPath, ALL_QUOTES); + // create the broken CSV file. + brokenCSV = path("testParseBrokenCSVFile"); + createStandardCsvFile( + getFileSystem(), brokenCSV, + true, + ALL_QUOTES, + ALL_ROWS_COUNT, + ALL_ROWS_COUNT, + ",", + "\n", + "\"", + csv -> csv + .line("# comment") + .row(ALL_QUOTES, "bad", "Tuesday", 0, "entry-bad", "yes", false)); + } + + @Override + public void teardown() throws Exception { + describe("teardown"); + try { + if (csvPath != null) { + getFileSystem().delete(csvPath, false); + } + if (brokenCSV != null) { + getFileSystem().delete(brokenCSV, false); + } + } finally { + super.teardown(); + } + } + + @Test + public void testCapabilityProbe() throws Throwable { + + // this should always hold true if we get past test setup + assertTrue("Select is not available on " + getFileSystem(), + isSelectAvailable(getFileSystem())); + } + + @SuppressWarnings("NestedAssignment") + @Test + public void testReadWholeFileClassicAPI() throws Throwable { + describe("create and read the whole file. 
Verifies setup working"); + int lines; + try (BufferedReader reader = new BufferedReader( + new InputStreamReader( + getFileSystem().open(csvPath)))) { + lines = 0; + // seek to 0, which is what some input formats do + String line; + while ((line = reader.readLine()) != null) { + lines++; + LOG.info("{}", line); + } + } + assertEquals("line count", ALL_ROWS_COUNT_WITH_HEADER, lines); + } + + @Test + public void testSelectWholeFileNoHeader() throws Throwable { + describe("Select the entire file, expect all rows but the header"); + expectSelected( + ALL_ROWS_COUNT, + selectConf, + CSV_HEADER_OPT_USE, + "SELECT * FROM S3OBJECT"); + } + + @Test + public void testSelectFirstColumnNoHeader() throws Throwable { + describe("Select the entire file, expect all rows but the header"); + expectSelected( + ALL_ROWS_COUNT_WITH_HEADER, + selectConf, + CSV_HEADER_OPT_NONE, + "SELECT s._1 FROM S3OBJECT s"); + } + + @Test + public void testSelectSelfNoHeader() throws Throwable { + describe("Select the entire file, expect all rows but the header"); + expectSelected( + ALL_ROWS_COUNT_WITH_HEADER, + selectConf, + CSV_HEADER_OPT_NONE, + "SELECT s._1 FROM S3OBJECT s WHERE s._1 = s._1"); + } + + @Test + public void testSelectSelfUseHeader() throws Throwable { + describe("Select the entire file, expect all rows including the header"); + expectSelected( + ALL_ROWS_COUNT, + selectConf, + CSV_HEADER_OPT_USE, + "SELECT s.id FROM S3OBJECT s WHERE s.id = s.id"); + } + + @Test + public void testSelectID2UseHeader() throws Throwable { + describe("Select where ID=2; use the header"); + expectSelected( + 1, + selectConf, + CSV_HEADER_OPT_USE, + "SELECT s.id FROM S3OBJECT s WHERE s.id = '2'"); + } + + @Test + public void testSelectNoMatchingID() throws Throwable { + describe("Select where there is no match; expect nothing back"); + expectSelected( + 0, + selectConf, + CSV_HEADER_OPT_USE, + "SELECT s.id FROM S3OBJECT s WHERE s.id = '0x8000'"); + } + + @Test + public void testSelectId1() throws Throwable { + describe("Select the first element in the file"); + expectSelected( + 1, + selectConf, + CSV_HEADER_OPT_NONE, + "SELECT * FROM S3OBJECT s WHERE s._1 = '1'", + TRUE); + } + + @Test + public void testSelectEmptySQL() throws Throwable { + describe("An empty SQL statement fails fast"); + FutureDataInputStreamBuilder builder = getFileSystem().openFile( + csvPath) + .must(SELECT_SQL, ""); + interceptFuture(IllegalArgumentException.class, + SELECT_SQL, + builder.build()); + } + + @Test + public void testSelectEmptyFile() throws Throwable { + describe("Select everything from an empty file"); + Path path = path("testSelectEmptyFile"); + S3AFileSystem fs = getFileSystem(); + ContractTestUtils.touch(fs, path); + parseToLines(fs.openFile(path) + .must(SELECT_SQL, SELECT_EVERYTHING) + .build() + .get(), + 0); + } + + @Test + public void testSelectEmptyFileWithConditions() throws Throwable { + describe("Select everything from an empty file with a more complex SQL"); + Path path = path("testSelectEmptyFileWithConditions"); + S3AFileSystem fs = getFileSystem(); + ContractTestUtils.touch(fs, path); + String sql = "SELECT * FROM S3OBJECT s WHERE s._1 = `TRUE`"; + CompletableFuture future = fs.openFile(path) + .must(SELECT_SQL, sql).build(); + assertEquals("Not at the end of the file", -1, future.get().read()); + } + + @Test + public void testSelectSeek() throws Throwable { + describe("Verify forward seeks work, not others"); + + // start: read in the full data through the initial select + // this makes asserting that contents match possible 
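+    // the length of the select result stream is not known in advance,
+    // so the full read here captures both the selected bytes and their
+    // actual length, which the seek assertions below rely on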
+ Path path = csvPath; + S3AFileSystem fs = getFileSystem(); + int len = (int) fs.getFileStatus(path).getLen(); + byte[] fullData = new byte[len]; + int actualLen; + try (DurationInfo ignored = + new DurationInfo(LOG, "Initial read of %s", path); + FSDataInputStream sourceStream = + select(fs, path, + selectConf, + SELECT_EVERYTHING)) { + // read it in + actualLen = IOUtils.read(sourceStream, fullData); + } + int seekRange = 20; + + try (FSDataInputStream seekStream = + select(fs, path, + selectConf, + SELECT_EVERYTHING)) { + SelectInputStream sis + = (SelectInputStream) seekStream.getWrappedStream(); + S3AInstrumentation.InputStreamStatistics streamStats + = sis.getS3AStreamStatistics(); + // lazy seek doesn't raise a problem here + seekStream.seek(0); + assertEquals("first byte read", fullData[0], seekStream.read()); + + // and now the pos has moved, again, seek will be OK + seekStream.seek(1); + seekStream.seek(1); + // but trying to seek elsewhere now fails + PathIOException ex = intercept(PathIOException.class, + SelectInputStream.SEEK_UNSUPPORTED, + () -> seekStream.seek(0)); + LOG.info("Seek error is as expected", ex); + // positioned reads from the current location work. + byte[] buffer = new byte[1]; + long pos = seekStream.getPos(); + seekStream.readFully(pos, buffer); + // but positioned backwards fail. + intercept(PathIOException.class, + SelectInputStream.SEEK_UNSUPPORTED, + () -> seekStream.readFully(0, buffer)); + // the position has now moved on. + assertPosition(seekStream, pos + 1); + // so a seek to the old pos will fail + intercept(PathIOException.class, + SelectInputStream.SEEK_UNSUPPORTED, + () -> seekStream.readFully(pos, buffer)); + + // set the readahead to the default. + // This verifies it reverts to the default. + seekStream.setReadahead(null); + assertEquals("Readahead in ", + Constants.DEFAULT_READAHEAD_RANGE, sis.getReadahead()); + // forward seeks are implemented as 1+ skip + long target = seekStream.getPos() + seekRange; + seek(seekStream, target); + assertPosition(seekStream, target); + // now do a read and compare values + assertEquals("byte at seek position", + fullData[(int)seekStream.getPos()], seekStream.read()); + assertEquals("Seek bytes skipped in " + streamStats, + seekRange, streamStats.bytesSkippedOnSeek); + + // try an invalid readahead range + intercept(IllegalArgumentException.class, + S3AInputStream.E_NEGATIVE_READAHEAD_VALUE, + () -> seekStream.setReadahead(-1L)); + + // do a slightly forward offset read + int read = seekStream.read(seekStream.getPos() + 2, buffer, 0, 1); + assertEquals(1, read); + + // final fun: seek way past the EOF + logIntercepted(expectSeekEOF(seekStream, actualLen * 2)); + assertPosition(seekStream, actualLen); + assertEquals(-1, seekStream.read()); + LOG.info("Seek statistics {}", streamStats); + // this will return no, but not fail + assertFalse("Failed to seek to new source in " + seekStream, + seekStream.seekToNewSource(0)); + // and set the readahead to 0 to see that close path works + seekStream.setReadahead(0L); + // then do a manual close even though there's one in the try resource. + // which will verify that a double close is harmless + seekStream.close(); + LOG.info("Final stream state {}", sis); + } + } + + /** + * Assert that a stream is in a specific position. + * @param stream stream or other seekable. + * @param pos expected position. + * @throws IOException failure of the getPos() call. + * @throws AssertionError mismatch between expected and actual. 
+ */ + private void assertPosition(Seekable stream, long pos) + throws IOException { + assertEquals("Wrong stream position in " + stream, + pos, stream.getPos()); + } + + @Test + public void testSelectOddLinesNoHeader() throws Throwable { + describe("Select odd lines, ignoring the header"); + expectSelected( + ODD_ROWS_COUNT, + selectConf, + CSV_HEADER_OPT_IGNORE, + "SELECT * FROM S3OBJECT s WHERE s._5 = `TRUE`"); + // and do a quick check on the instrumentation + long bytesRead = getFileSystem().getInstrumentation() + .getCounterValue(Statistic.STREAM_SEEK_BYTES_READ); + assertNotEquals("No bytes read count", 0, bytesRead); + } + + @Test + public void testSelectOddLinesHeader() throws Throwable { + describe("Select the odd values"); + List selected = expectSelected( + ODD_ROWS_COUNT, + selectConf, + CSV_HEADER_OPT_USE, + SELECT_ODD_ROWS); + // the list includes odd values + assertThat(selected, hasItem(ENTRY_0001)); + // but not the evens + assertThat(selected, not(hasItem(ENTRY_0002))); + } + + @Test + public void testSelectOddLinesHeaderTSVOutput() throws Throwable { + describe("Select the odd values with tab spaced output"); + selectConf.set(CSV_OUTPUT_FIELD_DELIMITER, "\t"); + selectConf.set(CSV_OUTPUT_QUOTE_CHARACTER, "'"); + selectConf.set(CSV_OUTPUT_QUOTE_FIELDS, + CSV_OUTPUT_QUOTE_FIELDS_AS_NEEEDED); + selectConf.set(CSV_OUTPUT_RECORD_DELIMITER, "\r"); + List selected = expectSelected( + ODD_ROWS_COUNT, + selectConf, + CSV_HEADER_OPT_USE, + SELECT_ODD_ENTRIES_BOOL); + // the list includes odd values + String row1 = selected.get(0); + + // split that first line into columns: This is why TSV is better for code + // to work with than CSV + String[] columns = row1.split("\t", -1); + assertEquals("Wrong column count from tab split line <" + row1 + ">", + CSV_COLUMN_COUNT, columns.length); + assertEquals("Wrong column value from tab split line <" + row1 + ">", + "entry-0001", columns[3]); + } + + @Test + public void testSelectNotOperationHeader() throws Throwable { + describe("Select the even values with a NOT call; quote the header name"); + List selected = expectSelected( + EVEN_ROWS_COUNT, + selectConf, + CSV_HEADER_OPT_USE, + "SELECT s.name FROM S3OBJECT s WHERE NOT s.\"odd\" = %s", + TRUE); + // the list includes no odd values + assertThat(selected, not(hasItem(ENTRY_0001))); + // but has the evens + assertThat(selected, hasItem(ENTRY_0002)); + } + + @Test + public void testBackslashExpansion() throws Throwable { + assertEquals("\t\r\n", expandBackslashChars("\t\r\n")); + assertEquals("\t", expandBackslashChars("\\t")); + assertEquals("\r", expandBackslashChars("\\r")); + assertEquals("\r \n", expandBackslashChars("\\r \\n")); + assertEquals("\\", expandBackslashChars("\\\\")); + } + + /** + * This is an expanded example for the documentation. + * Also helps catch out unplanned changes to the configuration strings. 
+   */
+  @Test
+  public void testSelectFileExample() throws Throwable {
+    describe("Select odd entries: the expanded example for the documentation");
+    int len = (int) getFileSystem().getFileStatus(csvPath).getLen();
+    FutureDataInputStreamBuilder builder =
+        getFileSystem().openFile(csvPath)
+            .must("fs.s3a.select.sql",
+                SELECT_ODD_ENTRIES)
+            .must("fs.s3a.select.input.format", "CSV")
+            .must("fs.s3a.select.input.compression", "NONE")
+            .must("fs.s3a.select.input.csv.header", "use")
+            .must("fs.s3a.select.output.format", "CSV");
+
+    CompletableFuture<FSDataInputStream> future = builder.build();
+    try (FSDataInputStream select = future.get()) {
+      // process the output
+      byte[] bytes = new byte[len];
+      int actual = select.read(bytes);
+      LOG.info("file length is {}; length of selected data is {}",
+          len, actual);
+    }
+  }
+
+  /**
+   * Ask for an unsupported input format.
+   */
+  @Test
+  public void testSelectUnsupportedInputFormat() throws Throwable {
+    describe("Request an unsupported input format");
+    FutureDataInputStreamBuilder builder = getFileSystem().openFile(csvPath)
+        .must(SELECT_SQL, SELECT_ODD_ENTRIES)
+        .must(SELECT_INPUT_FORMAT, "pptx");
+    interceptFuture(IllegalArgumentException.class,
+        "pptx",
+        builder.build());
+  }
+
+  /**
+   * Ask for an invalid output format.
+   */
+  @Test
+  public void testSelectUnsupportedOutputFormat() throws Throwable {
+    describe("Request a (currently) unsupported output format");
+    FutureDataInputStreamBuilder builder = getFileSystem().openFile(csvPath)
+        .must(SELECT_SQL, SELECT_ODD_ENTRIES)
+        .must(SELECT_INPUT_FORMAT, "csv")
+        .must(SELECT_OUTPUT_FORMAT, "json");
+    interceptFuture(IllegalArgumentException.class,
+        "json",
+        builder.build());
+  }
+
+  /**
+   * Missing files fail lazily.
+   */
+  @Test
+  public void testSelectMissingFile() throws Throwable {
+
+    describe("Select a missing file, expect it to surface in the future");
+
+    Path missing = path("missing");
+
+    FutureDataInputStreamBuilder builder =
+        getFileSystem().openFile(missing)
+            .must(SELECT_SQL, SELECT_ODD_ENTRIES);
+
+    interceptFuture(FileNotFoundException.class,
+        "", builder.build());
+  }
+
+  @Test
+  public void testSelectDirectoryFails() throws Throwable {
+    describe("Verify that selecting a directory fails");
+    S3AFileSystem fs = getFileSystem();
+    Path dir = path("dir");
+    // this will be an empty dir marker
+    fs.mkdirs(dir);
+
+    FutureDataInputStreamBuilder builder =
+        getFileSystem().openFile(dir)
+            .must(SELECT_SQL, SELECT_ODD_ENTRIES);
+    interceptFuture(PathIOException.class,
+        "", builder.build());
+
+    // try the parent
+    builder = getFileSystem().openFile(dir.getParent())
+        .must(SELECT_SQL,
+            SELECT_ODD_ENTRIES);
+    interceptFuture(PathIOException.class,
+        "", builder.build());
+  }
+
+  @Test
+  public void testSelectRootFails() throws Throwable {
+    describe("verify root dir selection is rejected");
+    FutureDataInputStreamBuilder builder =
+        getFileSystem().openFile(path("/"))
+            .must(SELECT_SQL, SELECT_ODD_ENTRIES);
+    interceptFuture(PathIOException.class,
+        "", builder.build());
+  }
+
+  /**
+   * Validate the abort logic.
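+   * If the stream is closed while readahead data is still outstanding,
+   * the underlying HTTP connection is aborted rather than drained; the
+   * stream statistics asserted on below are how this is observed.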
+   */
+  @Test
+  public void testCloseWithAbort() throws Throwable {
+    describe("Close the stream with the readahead outstanding");
+    S3ATestUtils.MetricDiff readOps = new S3ATestUtils.MetricDiff(
+        getFileSystem(),
+        Statistic.STREAM_READ_OPERATIONS_INCOMPLETE);
+    selectConf.setInt(READAHEAD_RANGE, 2);
+
+    FSDataInputStream stream = select(getFileSystem(), csvPath, selectConf,
+        "SELECT * FROM S3OBJECT s");
+    SelectInputStream sis = (SelectInputStream) stream.getWrappedStream();
+    assertEquals("Readahead on " + sis, 2, sis.getReadahead());
+    stream.setReadahead(1L);
+    assertEquals("Readahead on " + sis, 1, sis.getReadahead());
+    stream.read();
+    S3AInstrumentation.InputStreamStatistics stats
+        = sis.getS3AStreamStatistics();
+    assertEquals("Read count in " + sis,
+        1, stats.bytesRead);
+    stream.close();
+    assertEquals("Abort count in " + sis,
+        1, stats.aborted);
+    readOps.assertDiffEquals("Read operations are still considered active",
+        0);
+    intercept(PathIOException.class, FSExceptionMessages.STREAM_IS_CLOSED,
+        () -> stream.read());
+  }
+
+  @Test
+  public void testCloseWithNoAbort() throws Throwable {
+    describe("Close the stream with the readahead outstanding");
+    FSDataInputStream stream = select(getFileSystem(), csvPath, selectConf,
+        "SELECT * FROM S3OBJECT s");
+    stream.setReadahead(0x1000L);
+    SelectInputStream sis = (SelectInputStream) stream.getWrappedStream();
+    S3AInstrumentation.InputStreamStatistics stats
+        = sis.getS3AStreamStatistics();
+    stream.close();
+    assertEquals("Close count in " + sis, 1, stats.closed);
+    assertEquals("Abort count in " + sis, 0, stats.aborted);
+    assertTrue("No bytes read in close of " + sis, stats.bytesReadInClose > 0);
+  }
+
+  @Test
+  public void testFileContextIntegration() throws Throwable {
+    describe("Test that select works through FileContext");
+    FileContext fc = S3ATestUtils.createTestFileContext(getConfiguration());
+    selectConf.set(CSV_INPUT_HEADER, CSV_HEADER_OPT_USE);
+
+    List<String> selected =
+        verifySelectionCount(ODD_ROWS_COUNT, SELECT_ODD_ENTRIES_INT,
+            parseToLines(
+                select(fc, csvPath, selectConf, SELECT_ODD_ROWS)));
+    // the list includes odd values
+    assertThat(selected, hasItem(ENTRY_0001));
+    // but not the evens
+    assertThat(selected, not(hasItem(ENTRY_0002)));
+  }
+
+  @Test
+  public void testSelectOptionsOnlyOnSelectCalls() throws Throwable {
+    describe("Secondary select options are only valid on select"
+        + " queries");
+    String key = CSV_INPUT_HEADER;
+    intercept(IllegalArgumentException.class, key,
+        () -> getFileSystem().openFile(csvPath)
+            .must(key, CSV_HEADER_OPT_USE).build());
+  }
+
+  @Test
+  public void testSelectMustBeEnabled() throws Throwable {
+    describe("Verify that the FS must have S3 select enabled.");
+    Configuration conf = new Configuration(getFileSystem().getConf());
+    conf.setBoolean(FS_S3A_SELECT_ENABLED, false);
+    try (FileSystem fs2 = FileSystem.newInstance(csvPath.toUri(), conf)) {
+      intercept(UnsupportedOperationException.class,
+          SELECT_UNSUPPORTED,
+          () -> {
+            assertFalse("S3 Select Capability must be disabled on " + fs2,
+                isSelectAvailable(fs2));
+            return fs2.openFile(csvPath)
+                .must(SELECT_SQL, SELECT_ODD_ROWS)
+                .build();
+          });
+    }
+  }
+
+  @Test
+  public void testSelectOptionsRejectedOnNormalOpen() throws Throwable {
+    describe("Verify that a normal open fails on select must() options");
+    intercept(IllegalArgumentException.class,
+        AbstractFSBuilderImpl.UNKNOWN_MANDATORY_KEY,
+        () -> getFileSystem().openFile(csvPath)
+            .must(CSV_INPUT_HEADER, CSV_HEADER_OPT_USE)
+            .build());
+  }
+
+  @Test
+  public void testSelectOddRecordsWithHeader()
+      throws Throwable {
+    describe("work through a record reader");
+    JobConf conf = createJobConf();
+    inputMust(conf, CSV_INPUT_HEADER, CSV_HEADER_OPT_USE);
+    expectRecordsRead(ODD_ROWS_COUNT, conf, SELECT_ODD_ENTRIES_DECIMAL);
+  }
+
+  @Test
+  public void testSelectDatestampsConverted()
+      throws Throwable {
+    describe("timestamp conversion in record I/O");
+    JobConf conf = createJobConf();
+    inputMust(conf, CSV_INPUT_HEADER, CSV_HEADER_OPT_USE);
+    inputMust(conf, CSV_OUTPUT_QUOTE_FIELDS,
+        CSV_OUTPUT_QUOTE_FIELDS_AS_NEEEDED);
+    String sql = SELECT_TO_DATE;
+    List<String> records = expectRecordsRead(ALL_ROWS_COUNT, conf, sql);
+    LOG.info("Result of {}\n{}", sql, prepareToPrint(records));
+  }
+
+  @Test
+  public void testSelectNoMatch()
+      throws Throwable {
+    describe("when there's no match to a query, 0 records are returned.");
+    JobConf conf = createJobConf();
+    inputMust(conf, CSV_INPUT_HEADER, CSV_HEADER_OPT_USE);
+    expectRecordsRead(0, conf,
+        "SELECT * FROM S3OBJECT s WHERE s.odd = " + q("maybe"));
+  }
+
+  @Test
+  public void testSelectOddRecordsIgnoreHeader()
+      throws Throwable {
+    describe("work through a record reader");
+    JobConf conf = createJobConf();
+    inputOpt(conf, CSV_INPUT_HEADER, CSV_HEADER_OPT_NONE);
+    inputMust(conf, CSV_INPUT_HEADER, CSV_HEADER_OPT_IGNORE);
+    expectRecordsRead(EVEN_ROWS_COUNT, conf,
+        SELECT_EVEN_ROWS_NO_HEADER);
+  }
+
+  @Test
+  public void testSelectRecordsUnknownMustOpt()
+      throws Throwable {
+    describe("verify reader key validation is remapped");
+    JobConf conf = createJobConf();
+    inputOpt(conf, CSV_INPUT_HEADER, CSV_HEADER_OPT_NONE);
+    inputMust(conf, CSV_INPUT_HEADER + ".something", CSV_HEADER_OPT_IGNORE);
+    intercept(IllegalArgumentException.class,
+        AbstractFSBuilderImpl.UNKNOWN_MANDATORY_KEY,
+        () -> readRecords(conf, SELECT_EVEN_ROWS_NO_HEADER));
+  }
+
+  @Test
+  public void testSelectOddRecordsWithHeaderV1()
+      throws Throwable {
+    describe("work through a V1 record reader");
+    JobConf conf = createJobConf();
+    inputMust(conf, CSV_INPUT_HEADER, CSV_HEADER_OPT_USE);
+    // using a double backslash here makes the string "\t" which will then
+    // be parsed in the SelectBinding code as if it had come in from an XML
+    // entry
+    inputMust(conf, CSV_OUTPUT_FIELD_DELIMITER, "\\t");
+    inputMust(conf, CSV_OUTPUT_QUOTE_CHARACTER, "'");
+    inputMust(conf, CSV_OUTPUT_QUOTE_FIELDS,
+        CSV_OUTPUT_QUOTE_FIELDS_AS_NEEEDED);
+    inputMust(conf, CSV_OUTPUT_RECORD_DELIMITER, "\n");
+    verifySelectionCount(ODD_ROWS_COUNT,
+        SELECT_ODD_ROWS,
+        readRecordsV1(conf, SELECT_ODD_ROWS));
+  }
+
+  /**
+   * Create a job conf for line reader tests.
+   * This patches the job with the passthrough codec for
+   * CSV files.
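+   * (The landsat suite does the same for the ".gz" suffix, so that
+   * the gzip codec does not try to decompress the select output.)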
+   * @return a job configuration
+   */
+  private JobConf createJobConf() {
+    JobConf conf = new JobConf(getConfiguration());
+    enablePassthroughCodec(conf, ".csv");
+    return conf;
+  }
+
+  @Test
+  public void testSelectOddRecordsIgnoreHeaderV1()
+      throws Throwable {
+    describe("work through a V1 record reader");
+    JobConf conf = createJobConf();
+    inputOpt(conf, CSV_INPUT_HEADER, CSV_HEADER_OPT_NONE);
+    inputMust(conf, CSV_INPUT_HEADER, CSV_HEADER_OPT_IGNORE);
+    inputMust(conf, INPUT_FADVISE, INPUT_FADV_NORMAL);
+    inputMust(conf, SELECT_ERRORS_INCLUDE_SQL, "true");
+    verifySelectionCount(EVEN_ROWS_COUNT,
+        SELECT_EVEN_ROWS_NO_HEADER,
+        readRecordsV1(conf, SELECT_EVEN_ROWS_NO_HEADER));
+  }
+
+  protected List<String> expectRecordsRead(final int expected,
+      final JobConf conf,
+      final String sql) throws Exception {
+    return verifySelectionCount(expected, sql, readRecords(conf, sql));
+  }
+
+  /**
+   * Reads lines through {@link LineRecordReader}, as if it were an MR
+   * job.
+   * @param conf job conf
+   * @param sql sql to add to the configuration.
+   * @return the selected lines.
+   * @throws Exception failure
+   */
+  private List<String> readRecords(JobConf conf, String sql) throws Exception {
+    return readRecords(conf,
+        csvPath,
+        sql,
+        createLineRecordReader(),
+        ALL_ROWS_COUNT_WITH_HEADER);
+  }
+
+  /**
+   * Reads lines through a v1 {@link org.apache.hadoop.mapred.LineRecordReader}.
+   * @param conf job conf
+   * @param sql sql to add to the configuration.
+   * @return the selected lines.
+   * @throws Exception failure
+   */
+  private List<String> readRecordsV1(JobConf conf, String sql)
+      throws Exception {
+    inputMust(conf, SELECT_SQL, sql);
+    return super.readRecordsV1(conf,
+        createLineRecordReaderV1(conf, csvPath),
+        new LongWritable(),
+        new Text(),
+        ALL_ROWS_COUNT_WITH_HEADER);
+  }
+
+  /**
+   * Issue a select call, expect the specific number of rows back.
+   * Error text will include the SQL.
+   * @param expected expected row count.
+   * @param conf config for the select call.
+   * @param header header option
+   * @param sql template for a formatted SQL request.
+   * @param args arguments for the formatted request.
+   * @return the lines selected
+   * @throws IOException failure
+   */
+  private List<String> expectSelected(
+      final int expected,
+      final Configuration conf,
+      final String header,
+      final String sql,
+      final Object...args) throws Exception {
+    conf.set(CSV_INPUT_HEADER, header);
+    return verifySelectionCount(expected, sql(sql, args),
+        selectCsvFile(conf, sql, args));
+  }
+
+  /**
+   * Select from the CSV file.
+   * @param conf config for the select call.
+   * @param sql template for a formatted SQL request.
+   * @param args arguments for the formatted request.
+   * @return the lines selected
+   * @throws IOException failure
+   */
+  private List<String> selectCsvFile(
+      final Configuration conf,
+      final String sql,
+      final Object...args)
+      throws Exception {
+
+    return parseToLines(
+        select(getFileSystem(), csvPath, conf, sql, args));
+  }
+
+  @Test
+  public void testCommentsSkipped() throws Throwable {
+    describe("Verify that comments are skipped");
+    selectConf.set(CSV_INPUT_HEADER, CSV_HEADER_OPT_USE);
+
+    List<String> lines = verifySelectionCount(
+        ALL_ROWS_COUNT_WITH_HEADER,
+        "select s.id",
+        parseToLines(
+            select(getFileSystem(), brokenCSV, selectConf,
+                "SELECT * FROM S3OBJECT s")));
+    LOG.info("\n{}", prepareToPrint(lines));
+  }
+
+  @Test
+  public void testEmptyColumnsRegenerated() throws Throwable {
+    describe("if you ask for a column but your row doesn't have it,"
+        + " an empty column is inserted");
+    selectConf.set(CSV_INPUT_HEADER, CSV_HEADER_OPT_USE);
+
+    List<String> lines = verifySelectionCount(
+        ALL_ROWS_COUNT_WITH_HEADER, "select s.oddrange",
+        parseToLines(
+            select(getFileSystem(), brokenCSV, selectConf,
+                "SELECT s.oddrange FROM S3OBJECT s")));
+    LOG.info("\n{}", prepareToPrint(lines));
+    assertEquals("Final oddrange column is not regenerated empty",
+        "\"\"", lines.get(lines.size() - 1));
+  }
+
+  @Test
+  public void testIntCastFailure() throws Throwable {
+    describe("Verify that int casts fail");
+    expectSelectFailure(E_CAST_FAILED, SELECT_ODD_ENTRIES_INT);
+  }
+
+  @Test
+  public void testSelectToDateParseFailure() throws Throwable {
+    describe("Verify date parsing failure");
+    expectSelectFailure(E_CAST_FAILED, SELECT_TO_DATE);
+  }
+
+  @Test
+  public void testParseInvalidPathComponent() throws Throwable {
+    describe("Verify bad SQL parsing");
+    expectSelectFailure(E_PARSE_INVALID_PATH_COMPONENT,
+        "SELECT * FROM S3OBJECT WHERE s.'oddf' = true");
+  }
+
+  @Test
+  public void testSelectInvalidTableAlias() throws Throwable {
+    describe("select with unknown column name");
+    expectSelectFailure(E_INVALID_TABLE_ALIAS,
+        "SELECT * FROM S3OBJECT WHERE s.\"oddf\" = 'true'");
+  }
+
+  @Test
+  public void testSelectGeneratedAliases() throws Throwable {
+    describe("select with a ._2 column when headers are enabled");
+    expectSelectFailure(E_INVALID_TABLE_ALIAS,
+        "SELECT * FROM S3OBJECT WHERE s._2 = 'true'");
+  }
+
+  /**
+   * Expect a select against the broken CSV file to fail with a specific
+   * AWS exception error code.
+   * If there is no failure, the results are included in the assertion
+   * raised.
+   * @param expectedErrorCode error code in getErrorCode()
+   * @param sql SQL to invoke
+   * @return the exception, if it is as expected.
+   * @throws Exception any other failure
+   * @throws AssertionError when an exception is raised, but its error code
+   *  is different, or when no exception was raised.
+ */ + protected AWSServiceIOException expectSelectFailure( + String expectedErrorCode, + String sql) + throws Exception { + selectConf.set(CSV_INPUT_HEADER, CSV_HEADER_OPT_USE); + return verifyErrorCode(expectedErrorCode, + intercept(AWSBadRequestException.class, + () -> + prepareToPrint( + parseToLines( + select(getFileSystem(), brokenCSV, selectConf, sql) + )))); + + } + + + @Test + public void testInputSplit() + throws Throwable { + describe("Verify that only a single file is used for splits"); + JobConf conf = new JobConf(getConfiguration()); + + + inputMust(conf, CSV_INPUT_HEADER, CSV_HEADER_OPT_USE); + final Path input = csvPath; + S3AFileSystem fs = getFileSystem(); + final Path output = path("testLandsatSelect") + .makeQualified(fs.getUri(), fs.getWorkingDirectory()); + conf.set(FileInputFormat.INPUT_DIR, input.toString()); + conf.set(FileOutputFormat.OUTDIR, output.toString()); + + final Job job = Job.getInstance(conf, "testInputSplit"); + JobContext jobCtx = new JobContextImpl(job.getConfiguration(), + getTaskAttempt0().getJobID()); + + TextInputFormat tif = new TextInputFormat(); + List splits = tif.getSplits(jobCtx); + assertThat("split count wrong", splits, hasSize(1)); + + } + +} diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3SelectCLI.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3SelectCLI.java new file mode 100644 index 0000000000..c04cf8bff7 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3SelectCLI.java @@ -0,0 +1,347 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.fs.s3a.select; + +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.nio.charset.Charset; +import java.util.List; + +import org.junit.Test; + +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.s3a.S3AFileSystem; +import org.apache.hadoop.fs.s3a.S3ATestUtils; +import org.apache.hadoop.fs.s3a.Statistic; +import org.apache.hadoop.fs.s3a.commit.Duration; +import org.apache.hadoop.fs.s3a.s3guard.S3GuardTool; +import org.apache.hadoop.util.ExitUtil; +import org.apache.hadoop.util.ToolRunner; + +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.disableFilesystemCaching; +import static org.apache.hadoop.fs.s3a.s3guard.S3GuardToolTestHelper.exec; +import static org.apache.hadoop.fs.s3a.select.ITestS3SelectLandsat.SELECT_NOTHING; +import static org.apache.hadoop.fs.s3a.select.ITestS3SelectLandsat.SELECT_SUNNY_ROWS_NO_LIMIT; +import static org.apache.hadoop.fs.s3a.select.SelectConstants.*; +import static org.apache.hadoop.fs.s3a.select.SelectTool.*; +import static org.apache.hadoop.service.launcher.LauncherExitCodes.EXIT_COMMAND_ARGUMENT_ERROR; +import static org.apache.hadoop.service.launcher.LauncherExitCodes.EXIT_NOT_FOUND; +import static org.apache.hadoop.service.launcher.LauncherExitCodes.EXIT_SERVICE_UNAVAILABLE; +import static org.apache.hadoop.service.launcher.LauncherExitCodes.EXIT_SUCCESS; +import static org.apache.hadoop.service.launcher.LauncherExitCodes.EXIT_USAGE; +import static org.apache.hadoop.test.LambdaTestUtils.intercept; + +/** + * Test the S3 Select CLI through some operations against landsat + * and files generated from it. + */ +public class ITestS3SelectCLI extends AbstractS3SelectTest { + + public static final int LINE_COUNT = 100; + + public static final String SELECT_EVERYTHING = "SELECT * FROM S3OBJECT s"; + + private SelectTool selectTool; + + private Configuration selectConf; + + public static final String D = "-D"; + + private File localFile; + + private String landsatSrc; + + @Override + public void setup() throws Exception { + super.setup(); + selectTool = new SelectTool(getConfiguration()); + selectConf = new Configuration(getConfiguration()); + localFile = getTempFilename(); + landsatSrc = getLandsatGZ().toString(); + } + + @Override + public void teardown() throws Exception { + super.teardown(); + if (localFile != null) { + localFile.delete(); + } + } + + /** + * Expect a command to succeed. + * @param message any extra text to include in the assertion error message + * @param tool tool to run + * @param args arguments to the command + * @return the output of any successful run + * @throws Exception failure + */ + protected static String expectSuccess( + String message, + S3GuardTool tool, + String... args) throws Exception { + ByteArrayOutputStream buf = new ByteArrayOutputStream(); + exec(EXIT_SUCCESS, message, tool, buf, args); + return buf.toString(); + } + + /** + * Run a S3GuardTool command from a varags list and the + * configuration returned by {@code getConfiguration()}. + * @param conf config to use + * @param args argument list + * @return the return code + * @throws Exception any exception + */ + protected int run(Configuration conf, S3GuardTool tool, + String... 
args) throws Exception { + return ToolRunner.run(conf, tool, args); + } + + /** + * Run a S3GuardTool command from a varags list, catch any raised + * ExitException and verify the status code matches that expected. + * @param status expected status code of the exception + * @param conf config to use + * @param args argument list + * @throws Exception any exception + */ + protected void runToFailure(int status, Configuration conf, + String message, + S3GuardTool tool, String... args) + throws Exception { + final ExitUtil.ExitException ex = + intercept(ExitUtil.ExitException.class, message, + () -> ToolRunner.run(conf, tool, args)); + if (ex.status != status) { + throw ex; + } + + } + + @Test + public void testLandsatToFile() throws Throwable { + describe("select part of the landsat to a file"); + int lineCount = LINE_COUNT; + S3AFileSystem landsatFS = + (S3AFileSystem) getLandsatGZ().getFileSystem(getConfiguration()); + S3ATestUtils.MetricDiff selectCount = new S3ATestUtils.MetricDiff(landsatFS, + Statistic.OBJECT_SELECT_REQUESTS); + + run(selectConf, selectTool, + D, v(CSV_OUTPUT_QUOTE_CHARACTER, "'"), + D, v(CSV_OUTPUT_QUOTE_FIELDS, CSV_OUTPUT_QUOTE_FIELDS_AS_NEEEDED), + "select", + o(OPT_HEADER), CSV_HEADER_OPT_USE, + o(OPT_COMPRESSION), COMPRESSION_OPT_GZIP, + o(OPT_LIMIT), Integer.toString(lineCount), + o(OPT_OUTPUT), localFile.toString(), + landsatSrc, + SELECT_SUNNY_ROWS_NO_LIMIT); + List lines = IOUtils.readLines(new FileInputStream(localFile), + Charset.defaultCharset()); + LOG.info("Result from select:\n{}", lines.get(0)); + assertEquals(lineCount, lines.size()); + selectCount.assertDiffEquals("select count", 1); + Duration duration = selectTool.getSelectDuration(); + assertTrue("Select duration was not measured", + duration.value() > 0); + } + + private File getTempFilename() throws IOException { + File dest = File.createTempFile("landat", ".csv"); + dest.delete(); + return dest; + } + + @Test + public void testLandsatToConsole() throws Throwable { + describe("select part of the landsat to the console"); + // this verifies the input stream was actually closed + S3ATestUtils.MetricDiff readOps = new S3ATestUtils.MetricDiff( + getFileSystem(), + Statistic.STREAM_READ_OPERATIONS_INCOMPLETE); + run(selectConf, selectTool, + D, v(CSV_OUTPUT_QUOTE_CHARACTER, "'"), + D, v(CSV_OUTPUT_QUOTE_FIELDS, CSV_OUTPUT_QUOTE_FIELDS_ALWAYS), + "select", + o(OPT_HEADER), CSV_HEADER_OPT_USE, + o(OPT_COMPRESSION), COMPRESSION_OPT_GZIP, + o(OPT_LIMIT), Integer.toString(LINE_COUNT), + landsatSrc, + SELECT_SUNNY_ROWS_NO_LIMIT); + assertEquals("Lines read and printed to console", + LINE_COUNT, selectTool.getLinesRead()); + readOps.assertDiffEquals("Read operations are still considered active", + 0); } + + @Test + public void testSelectNothing() throws Throwable { + describe("an empty select is not an error"); + run(selectConf, selectTool, + "select", + o(OPT_HEADER), CSV_HEADER_OPT_USE, + o(OPT_COMPRESSION), COMPRESSION_OPT_GZIP, + o(OPT_INPUTFORMAT), "csv", + o(OPT_OUTPUTFORMAT), "csv", + o(OPT_EXPECTED), "0", + o(OPT_LIMIT), Integer.toString(LINE_COUNT), + landsatSrc, + SELECT_NOTHING); + assertEquals("Lines read and printed to console", + 0, selectTool.getLinesRead()); + } + + @Test + public void testLandsatToRemoteFile() throws Throwable { + describe("select part of the landsat to a file"); + Path dest = path("testLandsatToRemoteFile.csv"); + run(selectConf, selectTool, + D, v(CSV_OUTPUT_QUOTE_CHARACTER, "'"), + D, v(CSV_OUTPUT_QUOTE_FIELDS, CSV_OUTPUT_QUOTE_FIELDS_ALWAYS), + "select", + o(OPT_HEADER), 
CSV_HEADER_OPT_USE, + o(OPT_COMPRESSION), COMPRESSION_OPT_GZIP, + o(OPT_LIMIT), Integer.toString(LINE_COUNT), + o(OPT_OUTPUT), dest.toString(), + landsatSrc, + SELECT_SUNNY_ROWS_NO_LIMIT); + FileStatus status = getFileSystem().getFileStatus(dest); + assertEquals( + "Mismatch between bytes selected and file len in " + status, + selectTool.getBytesRead(), status.getLen()); + assertIsFile(dest); + + // now select on that + Configuration conf = getConfiguration(); + SelectTool tool2 = new SelectTool(conf); + run(conf, tool2, + "select", + o(OPT_HEADER), CSV_HEADER_OPT_NONE, + dest.toString(), + SELECT_EVERYTHING); + } + + @Test + public void testUsage() throws Throwable { + runToFailure(EXIT_USAGE, getConfiguration(), TOO_FEW_ARGUMENTS, + selectTool, "select"); + } + + @Test + public void testRejectionOfNonS3FS() throws Throwable { + File dest = getTempFilename(); + runToFailure(EXIT_SERVICE_UNAVAILABLE, + getConfiguration(), + WRONG_FILESYSTEM, + selectTool, "select", dest.toString(), + SELECT_EVERYTHING); + } + + @Test + public void testFailMissingFile() throws Throwable { + Path dest = path("testFailMissingFile.csv"); + runToFailure(EXIT_NOT_FOUND, + getConfiguration(), + "", + selectTool, "select", dest.toString(), + SELECT_EVERYTHING); + } + + @Test + public void testS3SelectDisabled() throws Throwable { + Configuration conf = getConfiguration(); + conf.setBoolean(FS_S3A_SELECT_ENABLED, false); + disableFilesystemCaching(conf); + runToFailure(EXIT_SERVICE_UNAVAILABLE, + conf, + SELECT_IS_DISABLED, + selectTool, "select", + o(OPT_HEADER), CSV_HEADER_OPT_USE, + o(OPT_COMPRESSION), COMPRESSION_OPT_GZIP, + o(OPT_LIMIT), Integer.toString(LINE_COUNT), + landsatSrc, + SELECT_SUNNY_ROWS_NO_LIMIT); + } + + @Test + public void testSelectBadLimit() throws Throwable { + runToFailure(EXIT_USAGE, + getConfiguration(), + "", + selectTool, "select", + o(OPT_HEADER), CSV_HEADER_OPT_USE, + o(OPT_COMPRESSION), COMPRESSION_OPT_GZIP, + o(OPT_LIMIT), "-1", + landsatSrc, + SELECT_NOTHING); + } + + @Test + public void testSelectBadInputFormat() throws Throwable { + runToFailure(EXIT_COMMAND_ARGUMENT_ERROR, + getConfiguration(), + "", + selectTool, "select", + o(OPT_HEADER), CSV_HEADER_OPT_USE, + o(OPT_INPUTFORMAT), "pptx", + o(OPT_COMPRESSION), COMPRESSION_OPT_GZIP, + landsatSrc, + SELECT_NOTHING); + } + + @Test + public void testSelectBadOutputFormat() throws Throwable { + runToFailure(EXIT_COMMAND_ARGUMENT_ERROR, + getConfiguration(), + "", + selectTool, "select", + o(OPT_HEADER), CSV_HEADER_OPT_USE, + o(OPT_OUTPUTFORMAT), "pptx", + o(OPT_COMPRESSION), COMPRESSION_OPT_GZIP, + landsatSrc, + SELECT_NOTHING); + } + + /** + * Take an option and add the "-" prefix. + * @param in input option + * @return value for the tool args list. + */ + private static String o(String in) { + return "-" + in; + } + + /** + * Create the key=value bit of the -D key=value pair. + * @param key key to set + * @param value value to use + * @return a string for the tool args list. 
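+   * For example, {@code v("key", "value")} produces {@code "key=value"},
+   * passed to the tool as {@code -D key=value}.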
+ */ + private static String v(String key, String value) { + return checkNotNull(key) + "=" + checkNotNull(value); + } + +} diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3SelectLandsat.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3SelectLandsat.java new file mode 100644 index 0000000000..780040e6a4 --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3SelectLandsat.java @@ -0,0 +1,432 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.select; + +import java.io.IOException; +import java.util.List; + +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.CommonConfigurationKeys; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileContext; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.PathIOException; +import org.apache.hadoop.fs.s3a.S3AFileSystem; +import org.apache.hadoop.fs.s3a.S3AInstrumentation; +import org.apache.hadoop.fs.s3a.S3ATestUtils; +import org.apache.hadoop.fs.s3a.Statistic; +import org.apache.hadoop.fs.s3a.commit.DurationInfo; +import org.apache.hadoop.mapred.JobConf; + +import static org.apache.hadoop.fs.s3a.S3ATestUtils.assume; +import static org.apache.hadoop.fs.s3a.S3ATestUtils.getTestPropertyBool; +import static org.apache.hadoop.fs.s3a.scale.S3AScaleTestBase._1KB; +import static org.apache.hadoop.fs.s3a.scale.S3AScaleTestBase._1MB; +import static org.apache.hadoop.fs.s3a.select.SelectConstants.*; +import static org.apache.hadoop.test.LambdaTestUtils.intercept; +import static org.hamcrest.CoreMatchers.containsString; +import static org.hamcrest.CoreMatchers.not; + +/** + * Test the S3 Select feature with the Landsat dataset. + * + * This helps explore larger datasets, compression and the like. + * + * This suite is only executed if the destination store declares its support for + * the feature and the test CSV file configuration option points to the + * standard landsat GZip file. That's because these tests require the specific + * format of the landsat file. + * + * Normally working with the landsat file is a scale test. + * Here, because of the select operations, there's a lot less data + * to download. + * For this to work: write aggressive select calls: filtering, using LIMIT + * and projecting down to a few columns. 
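+ * For example, {@link #SELECT_ENTITY_ID_ALL_CLOUDS} below projects a
+ * single column, filters on cloud cover and applies a LIMIT:
+ * <pre>
+ *   SELECT s.entityId from S3OBJECT s WHERE s."cloudCover" = '100.0' LIMIT 250
+ * </pre>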
+ * + * For the structure, see + * Landsat on AWS + * + * + * entityId: String LC80101172015002LGN00 + * acquisitionDate: String 2015-01-02 15:49:05.571384 + * cloudCover: Float (possibly -ve) 80.81 + * processingLevel: String L1GT + * path: Int 10 + * row: Int 117 + * min_lat: Float -79.09923 + * min_lon: Float -139.66082 + * max_lat: Float -77.7544 + * max_lon: Float 125.09297 + * download_url: HTTPS URL https://s3-us-west-2.amazonaws.com/landsat-pds/L8/010/117/LC80101172015002LGN00/index.html + * + * Ranges + *
+ * <ol>
+ *   <li>Latitude should range in -180 <= lat <= 180.</li>
+ *   <li>Longitude in 0 <= lon <= 360.</li>
+ *   <li>Standard Greenwich Meridian (not the French one which still
+ *   surfaces).</li>
+ *   <li>Cloud cover should be 0-100, but there are some negative ones.</li>
+ * </ol>
    + * + * Head of the file: + * + entityId,acquisitionDate,cloudCover,processingLevel,path,row,min_lat,min_lon,max_lat,max_lon,download_url + * LC80101172015002LGN00,2015-01-02 15:49:05.571384,80.81,L1GT,10,117,-79.09923,-139.66082,-77.7544,-125.09297,https://s3-us-west-2.amazonaws.com/landsat-pds/L8/010/117/LC80101172015002LGN00/index.html + * LC80260392015002LGN00,2015-01-02 16:56:51.399666,90.84,L1GT,26,39,29.23106,-97.48576,31.36421,-95.16029,https://s3-us-west-2.amazonaws.com/landsat-pds/L8/026/039/LC80260392015002LGN00/index.html + * LC82270742015002LGN00,2015-01-02 13:53:02.047000,83.44,L1GT,227,74,-21.28598,-59.27736,-19.17398,-57.07423,https://s3-us-west-2.amazonaws.com/landsat-pds/L8/227/074/LC82270742015002LGN00/index.html + * LC82270732015002LGN00,2015-01-02 13:52:38.110317,52.29,L1T,227,73,-19.84365,-58.93258,-17.73324,-56.74692,https://s3-us-west-2.amazonaws.com/landsat-pds/L8/227/073/LC82270732015002LGN00/index.html + * + * + * For the Curious this is the Scala/Spark declaration of the schema. + * + * def addLandsatColumns(csv: DataFrame): DataFrame = { + * csv + * .withColumnRenamed("entityId", "id") + * .withColumn("acquisitionDate", + * csv.col("acquisitionDate").cast(TimestampType)) + * .withColumn("cloudCover", csv.col("cloudCover").cast(DoubleType)) + * .withColumn("path", csv.col("path").cast(IntegerType)) + * .withColumn("row", csv.col("row").cast(IntegerType)) + * .withColumn("min_lat", csv.col("min_lat").cast(DoubleType)) + * .withColumn("min_lon", csv.col("min_lon").cast(DoubleType)) + * .withColumn("max_lat", csv.col("max_lat").cast(DoubleType)) + * .withColumn("max_lon", csv.col("max_lon").cast(DoubleType)) + * .withColumn("year", + * year(col("acquisitionDate"))) + * .withColumn("month", + * month(col("acquisitionDate"))) + * .withColumn("day", + * month(col("acquisitionDate"))) + * } + * + */ +public class ITestS3SelectLandsat extends AbstractS3SelectTest { + + private static final Logger LOG = + LoggerFactory.getLogger(ITestS3SelectLandsat.class); + + private JobConf selectConf; + + /** + * Normal limit for select operations. + * Value: {@value}. + */ + public static final int SELECT_LIMIT = 250; + + /** + * And that select limit as a limit string. + */ + public static final String LIMITED = " LIMIT " + SELECT_LIMIT; + + /** + * Select days with 100% cloud cover, limited to {@link #SELECT_LIMIT}. + * Value: {@value}. + */ + public static final String SELECT_ENTITY_ID_ALL_CLOUDS = + "SELECT\n" + + "s.entityId from\n" + + "S3OBJECT s WHERE\n" + + "s.\"cloudCover\" = '100.0'\n" + + LIMITED; + + /** + * Select sunny days. There's no limit on the returned values, so + * set one except for a scale test. + * Value: {@value}. + */ + public static final String SELECT_SUNNY_ROWS_NO_LIMIT + = "SELECT * FROM S3OBJECT s WHERE s.cloudCover = '0.0'"; + + /** + * A Select call which returns nothing, always. + * Value: {@value}. + */ + public static final String SELECT_NOTHING + = "SELECT * FROM S3OBJECT s WHERE s.cloudCover = 'sunny'"; + + /** + * Select the processing level; no limit. + * Value: {@value}. + */ + public static final String SELECT_PROCESSING_LEVEL_NO_LIMIT = + "SELECT\n" + + "s.processingLevel from\n" + + "S3OBJECT s"; + + @Override + public void setup() throws Exception { + super.setup(); + + selectConf = new JobConf(false); + // file is compressed. 
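+    // (S3 Select decompresses gzip server-side, so the selected
+    // records come back uncompressed)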
+ selectConf.set(SELECT_INPUT_COMPRESSION, COMPRESSION_OPT_GZIP); + // and has a header + selectConf.set(CSV_INPUT_HEADER, CSV_HEADER_OPT_USE); + selectConf.setBoolean(SELECT_ERRORS_INCLUDE_SQL, true); + inputMust(selectConf, CSV_INPUT_HEADER, CSV_HEADER_OPT_USE); + inputMust(selectConf, SELECT_INPUT_FORMAT, SELECT_FORMAT_CSV); + inputMust(selectConf, SELECT_OUTPUT_FORMAT, SELECT_FORMAT_CSV); + inputMust(selectConf, SELECT_INPUT_COMPRESSION, COMPRESSION_OPT_GZIP); + // disable the gzip codec, so that the record readers do not + // get confused + enablePassthroughCodec(selectConf, ".gz"); + } + + protected int getMaxLines() { + return SELECT_LIMIT * 2; + } + + @Test + public void testSelectCloudcoverIgnoreHeader() throws Throwable { + describe("select ignoring the header"); + selectConf.set(CSV_INPUT_HEADER, CSV_HEADER_OPT_IGNORE); + String sql = "SELECT\n" + + "* from\n" + + "S3OBJECT s WHERE\n" + + "s._3 = '0.0'\n" + + LIMITED; + List list = selectLandsatFile(selectConf, sql); + LOG.info("Line count: {}", list.size()); + verifySelectionCount(1, SELECT_LIMIT, sql, list); + } + + @Test + public void testSelectCloudcoverUseHeader() throws Throwable { + describe("select 100% cover using the header, " + + "+ verify projection and incrementing select statistics"); + S3ATestUtils.MetricDiff selectCount = new S3ATestUtils.MetricDiff( + getLandsatFS(), + Statistic.OBJECT_SELECT_REQUESTS); + + List list = selectLandsatFile(selectConf, + SELECT_ENTITY_ID_ALL_CLOUDS); + LOG.info("Line count: {}", list.size()); + verifySelectionCount(1, SELECT_LIMIT, SELECT_ENTITY_ID_ALL_CLOUDS, list); + String line1 = list.get(0); + assertThat("no column filtering from " + SELECT_ENTITY_ID_ALL_CLOUDS, + line1, not(containsString("100.0"))); + selectCount.assertDiffEquals("select count", 1); + } + + @Test + public void testFileContextIntegration() throws Throwable { + describe("Test that select works through FileContext"); + FileContext fc = S3ATestUtils.createTestFileContext(getConfiguration()); + + // there's a limit on the number of rows to read; this is larger + // than the SELECT_LIMIT call to catch any failure where more than + // that is returned, newline parsing fails, etc etc. 
+ List list = parseToLines( + select(fc, getLandsatGZ(), selectConf, SELECT_ENTITY_ID_ALL_CLOUDS), + SELECT_LIMIT * 2); + LOG.info("Line count: {}", list.size()); + verifySelectionCount(1, SELECT_LIMIT, SELECT_ENTITY_ID_ALL_CLOUDS, list); + } + + @Test + public void testReadLandsatRecords() throws Throwable { + describe("Use a record reader to read the records"); + inputMust(selectConf, CSV_OUTPUT_FIELD_DELIMITER, "\\t"); + inputMust(selectConf, CSV_OUTPUT_QUOTE_CHARACTER, "'"); + inputMust(selectConf, CSV_OUTPUT_QUOTE_FIELDS, + CSV_OUTPUT_QUOTE_FIELDS_AS_NEEEDED); + inputMust(selectConf, CSV_OUTPUT_RECORD_DELIMITER, "\n"); + List records = readRecords( + selectConf, + getLandsatGZ(), + SELECT_ENTITY_ID_ALL_CLOUDS, + createLineRecordReader(), + SELECT_LIMIT); + verifySelectionCount(1, SELECT_LIMIT, SELECT_ENTITY_ID_ALL_CLOUDS, records); + } + + @Test + public void testReadLandsatRecordsNoMatch() throws Throwable { + describe("Verify the v2 record reader does not fail" + + " when there are no results"); + verifySelectionCount(0, 0, SELECT_NOTHING, + readRecords( + selectConf, + getLandsatGZ(), + SELECT_NOTHING, + createLineRecordReader(), + SELECT_LIMIT)); + } + + @Test + public void testReadLandsatRecordsGZipEnabled() throws Throwable { + describe("Verify that by default, the gzip codec is connected to .gz" + + " files, and so fails"); + // implicitly re-enable the gzip codec. + selectConf.unset(CommonConfigurationKeys.IO_COMPRESSION_CODECS_KEY); + intercept(IOException.class, "gzip", + () -> readRecords( + selectConf, + getLandsatGZ(), + SELECT_ENTITY_ID_ALL_CLOUDS, + createLineRecordReader(), + SELECT_LIMIT)); + } + + @Test + public void testReadLandsatRecordsV1() throws Throwable { + describe("Use a record reader to read the records"); + + verifySelectionCount(1, SELECT_LIMIT, SELECT_ENTITY_ID_ALL_CLOUDS, + readRecords( + selectConf, + getLandsatGZ(), + SELECT_ENTITY_ID_ALL_CLOUDS, + createLineRecordReader(), + SELECT_LIMIT)); + } + + @Test + public void testReadLandsatRecordsV1NoResults() throws Throwable { + describe("verify that a select with no results is not an error"); + + verifySelectionCount(0, 0, SELECT_NOTHING, + readRecords( + selectConf, + getLandsatGZ(), + SELECT_NOTHING, + createLineRecordReader(), + SELECT_LIMIT)); + } + + /** + * Select from the landsat file. + * @param conf config for the select call. + * @param sql template for a formatted SQL request. + * @param args arguments for the formatted request. + * @return the lines selected + * @throws IOException failure + */ + private List selectLandsatFile( + final Configuration conf, + final String sql, + final Object... args) + throws Exception { + + // there's a limit on the number of rows to read; this is larger + // than the SELECT_LIMIT call to catch any failure where more than + // that is returned, newline parsing fails, etc etc. + return parseToLines( + select(getLandsatFS(), getLandsatGZ(), conf, sql, args)); + } + + /** + * This is a larger-scale version of {@link ITestS3Select#testSelectSeek()}. 
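+   * It only runs when scale tests are enabled, as reading this much of
+   * the dataset is slow.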
+ */ + @Test + public void testSelectSeekFullLandsat() throws Throwable { + describe("Verify forward seeks work, not others"); + + boolean enabled = getTestPropertyBool( + getConfiguration(), + KEY_SCALE_TESTS_ENABLED, + DEFAULT_SCALE_TESTS_ENABLED); + assume("Scale test disabled", enabled); + + // start: read in the full data through the initial select + // this makes asserting that contents match possible + final Path path = getLandsatGZ(); + S3AFileSystem fs = getLandsatFS(); + + int len = (int) fs.getFileStatus(path).getLen(); + byte[] dataset = new byte[4 * _1MB]; + int actualLen; + try (DurationInfo ignored = + new DurationInfo(LOG, "Initial read of %s", path); + FSDataInputStream sourceStream = + select(fs, path, + selectConf, + SELECT_EVERYTHING)) { + // read it in + actualLen = IOUtils.read(sourceStream, dataset); + } + int seekRange = 16 * _1KB; + + try (FSDataInputStream seekStream = + select(fs, path, + selectConf, + SELECT_EVERYTHING)) { + SelectInputStream sis + = (SelectInputStream) seekStream.getWrappedStream(); + S3AInstrumentation.InputStreamStatistics streamStats + = sis.getS3AStreamStatistics(); + // lazy seek doesn't raise a problem here + seekStream.seek(0); + assertEquals("first byte read", dataset[0], seekStream.read()); + + // and now the pos has moved, again, seek will be OK + seekStream.seek(1); + seekStream.seek(1); + // but trying to seek elsewhere now fails + intercept(PathIOException.class, + SelectInputStream.SEEK_UNSUPPORTED, + () -> seekStream.seek(0)); + // positioned reads from the current location work. + byte[] buffer = new byte[1]; + seekStream.readFully(seekStream.getPos(), buffer); + // but positioned backwards fail. + intercept(PathIOException.class, + SelectInputStream.SEEK_UNSUPPORTED, + () -> seekStream.readFully(0, buffer)); + // forward seeks are implemented as 1+ skip + long target = seekStream.getPos() + seekRange; + seek(seekStream, target); + assertEquals("Seek position in " + seekStream, + target, seekStream.getPos()); + // now do a read and compare values + assertEquals("byte at seek position", + dataset[(int) seekStream.getPos()], seekStream.read()); + assertEquals("Seek bytes skipped in " + streamStats, + seekRange, streamStats.bytesSkippedOnSeek); + long offset; + long increment = 64 * _1KB; + + // seek forward, comparing bytes + for(offset = 32 * _1KB; offset < actualLen; offset += increment) { + seek(seekStream, offset); + assertEquals("Seek position in " + seekStream, + offset, seekStream.getPos()); + // now do a read and compare values + assertEquals("byte at seek position", + dataset[(int) seekStream.getPos()], seekStream.read()); + } + for(; offset < len; offset += _1MB) { + seek(seekStream, offset); + assertEquals("Seek position in " + seekStream, + offset, seekStream.getPos()); + } + // there's no knowledge of how much data is left, but with Gzip + // involved there can be a lot. To keep the test duration down, + // this test, unlike the simpler one, doesn't try to read past the + // EOF. Know this: it will be slow. 
+ + LOG.info("Seek statistics {}", streamStats); + } + } + +} diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3SelectMRJob.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3SelectMRJob.java new file mode 100644 index 0000000000..86d1590fce --- /dev/null +++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/select/ITestS3SelectMRJob.java @@ -0,0 +1,206 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.fs.s3a.select; + +import java.io.IOException; +import java.util.concurrent.atomic.AtomicLong; + +import org.junit.Test; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.examples.WordCount; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.impl.FutureIOSupport; +import org.apache.hadoop.fs.impl.WrappedIOException; +import org.apache.hadoop.fs.s3a.S3AFileSystem; +import org.apache.hadoop.fs.s3a.S3ATestUtils; +import org.apache.hadoop.fs.s3a.S3AUtils; +import org.apache.hadoop.fs.s3a.commit.DurationInfo; +import org.apache.hadoop.fs.s3a.commit.files.SuccessData; +import org.apache.hadoop.fs.s3a.commit.staging.StagingCommitter; +import org.apache.hadoop.io.IOUtils; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; +import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; +import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.hadoop.yarn.server.MiniYARNCluster; + +import static org.apache.hadoop.fs.s3a.commit.CommitConstants.FS_S3A_COMMITTER_NAME; +import static org.apache.hadoop.fs.s3a.commit.CommitConstants.FS_S3A_COMMITTER_STAGING_UNIQUE_FILENAMES; +import static org.apache.hadoop.fs.s3a.select.SelectConstants.*; + +/** + * Run an MR job with a select query. + * This is the effective end-to-end test which verifies: + *
+ * <ol>
+ *   <li>Passing of select parameters through an MR job conf.</li>
+ *   <li>Automatic pick-up of these parameters through TextInputFormat's
+ *   use of the mapreduce.lib.input.LineRecordReader.</li>
+ *   <li>Issuing of S3 Select queries in mapper processes.</li>
+ *   <li>Projection of columns in a select.</li>
+ *   <li>Ability to switch to the Passthrough decompressor in an MR job.</li>
+ *   <li>Saving of results through the S3A Staging committer.</li>
+ *   <li>Basic validation of results.</li>
+ * </ol>
+ * This makes it the most complex of the MR jobs in the hadoop-aws test suite.
+ *
+ * The query used is
+ * {@link ITestS3SelectLandsat#SELECT_PROCESSING_LEVEL_NO_LIMIT},
+ * which lists the processing level of all records in the source file,
+ * and counts the number in each one by way of the normal word-count
+ * routines.
+ * This works because the SQL is projecting only the processing level.
+ *
+ * The result becomes something like (with tabs between fields):
+ * <pre>
+ * L1GT   370231
+ * L1T    689526
+ * </pre>
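+ *
+ * For reference: the select parameters travel as ordinary job
+ * configuration keys. Each inputMust() call stores its option under
+ * the mandatory input-file prefix defined in
+ * {@link org.apache.hadoop.mapreduce.MRJobConfig}, so (as a sketch)
+ * the SQL statement reaches the record readers as a key of the form
+ * <pre>
+ * jobConf.set("mapreduce.job.input.file.must.fs.s3a.select.sql", query);
+ * </pre>
+ * from where openFile() picks it up as a must() option.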
+ */ +public class ITestS3SelectMRJob extends AbstractS3SelectTest { + + private final Configuration conf = new YarnConfiguration(); + + private S3AFileSystem fs; + + private MiniYARNCluster yarnCluster; + + private Path rootPath; + + @Override + public void setup() throws Exception { + super.setup(); + fs = S3ATestUtils.createTestFileSystem(conf); + rootPath = path("ITestS3SelectMRJob"); + Path workingDir = path("working"); + fs.setWorkingDirectory(workingDir); + fs.mkdirs(new Path(rootPath, "input/")); + + yarnCluster = new MiniYARNCluster("ITestS3SelectMRJob", // testName + 1, // number of node managers + 1, // number of local dirs per node manager + 1); // number of log dirs per node manager + yarnCluster.init(conf); + yarnCluster.start(); + } + + @Override + public void teardown() throws Exception { + if (yarnCluster != null) { + yarnCluster.stop(); + } + super.teardown(); + } + + @Test + public void testLandsatSelect() throws Exception { + final Path input = getLandsatGZ(); + final Path output = path("testLandsatSelect") + .makeQualified(fs.getUri(), fs.getWorkingDirectory()); + + final Job job = Job.getInstance(conf, "process level count"); + job.setJarByClass(WordCount.class); + job.setMapperClass(WordCount.TokenizerMapper.class); + job.setCombinerClass(WordCount.IntSumReducer.class); + job.setReducerClass(WordCount.IntSumReducer.class); + job.setOutputKeyClass(Text.class); + job.setOutputValueClass(IntWritable.class); + FileInputFormat.addInputPath(job, input); + FileOutputFormat.setOutputPath(job, output); + + // the job will use the staging committer + final JobConf jobConf = (JobConf) job.getConfiguration(); + jobConf.set(FS_S3A_COMMITTER_NAME, StagingCommitter.NAME); + jobConf.setBoolean(FS_S3A_COMMITTER_STAGING_UNIQUE_FILENAMES, + false); + + final String query + = ITestS3SelectLandsat.SELECT_PROCESSING_LEVEL_NO_LIMIT; + inputMust(jobConf, SELECT_SQL, + query); + inputMust(jobConf, SELECT_INPUT_COMPRESSION, COMPRESSION_OPT_GZIP); + + // input settings + inputMust(jobConf, SELECT_INPUT_FORMAT, SELECT_FORMAT_CSV); + inputMust(jobConf, CSV_INPUT_HEADER, CSV_HEADER_OPT_USE); + + // output + inputMust(jobConf, SELECT_OUTPUT_FORMAT, SELECT_FORMAT_CSV); + inputMust(jobConf, CSV_OUTPUT_QUOTE_FIELDS, + CSV_OUTPUT_QUOTE_FIELDS_AS_NEEEDED); + + // disable the gzip codec, so that the record readers do not + // get confused + enablePassthroughCodec(jobConf, ".gz"); + + try (DurationInfo ignored = new DurationInfo(LOG, "SQL " + query)) { + int exitCode = job.waitForCompletion(true) ? 0 : 1; + assertEquals("Returned error code.", 0, exitCode); + } + + // log the success info + Path successPath = new Path(output, "_SUCCESS"); + SuccessData success = SuccessData.load(fs, successPath); + LOG.info("Job _SUCCESS\n{}", success); + + // process the results by verifying each part file + LOG.info("Results for query \n{}", query); + final AtomicLong parts = new AtomicLong(0); + S3AUtils.applyLocatedFiles(fs.listFiles(output, false), + (status) -> { + Path path = status.getPath(); + // ignore _SUCCESS, any temp files in subdirectories... + if (path.getName().startsWith("part-")) { + parts.incrementAndGet(); + String result = readStringFromFile(path); + LOG.info("{}\n{}", path, result); + String[] lines = result.split("\n", -1); + int l = lines.length; + // add a bit of slack here in case some new processing + // option was added.
+ assertTrue("Wrong number of lines (" + l + ") in " + result, + l > 0 && l < 15); + } + }); + assertEquals("More part files created than expected", 1, parts.get()); + } + + /** + * Read a file; using Async IO for completeness and to see how + * well the async IO works in practice. + * Summary: checked exceptions cripple Async operations. + */ + private String readStringFromFile(Path path) throws IOException { + int bytesLen = (int)fs.getFileStatus(path).getLen(); + byte[] buffer = new byte[bytesLen]; + return FutureIOSupport.awaitFuture( + fs.openFile(path).build().thenApply(in -> { + try { + IOUtils.readFully(in, buffer, 0, bytesLen); + return new String(buffer); + } catch (IOException ex) { + throw new WrappedIOException(ex); + } + })); + } +} diff --git a/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/mapreduce/StreamInputFormat.java b/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/mapreduce/StreamInputFormat.java index a77c13762c..77f4e041d5 100644 --- a/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/mapreduce/StreamInputFormat.java +++ b/hadoop-tools/hadoop-streaming/src/main/java/org/apache/hadoop/streaming/mapreduce/StreamInputFormat.java @@ -24,8 +24,12 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.FutureDataInputStreamBuilder; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.impl.FutureIOSupport; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.InputSplit; +import org.apache.hadoop.mapreduce.MRJobConfig; import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.mapreduce.lib.input.FileSplit; @@ -58,8 +62,14 @@ public RecordReader createRecordReader(InputSplit genericSplit, context.progress(); // Open the file and seek to the start of the split - FileSystem fs = split.getPath().getFileSystem(conf); - FSDataInputStream in = fs.open(split.getPath()); + Path path = split.getPath(); + FileSystem fs = path.getFileSystem(conf); + // open the file + final FutureDataInputStreamBuilder builder = fs.openFile(path); + FutureIOSupport.propagateOptions(builder, conf, + MRJobConfig.INPUT_FILE_OPTION_PREFIX, + MRJobConfig.INPUT_FILE_MANDATORY_PREFIX); + FSDataInputStream in = FutureIOSupport.awaitFuture(builder.build()); // Factory dispatch based on available params.. Class readerClass;