diff --git a/hadoop-tools/hadoop-aws/pom.xml b/hadoop-tools/hadoop-aws/pom.xml
index 03a36655fa..10848811b6 100644
--- a/hadoop-tools/hadoop-aws/pom.xml
+++ b/hadoop-tools/hadoop-aws/pom.xml
@@ -184,7 +184,7 @@
                     <exclude>**/ITestS3AContractRootDir.java</exclude>
                     <exclude>**/ITestS3NContractRootDir.java</exclude>
                     <exclude>**/ITestS3AFileContextStatistics.java</exclude>
-                    <exclude>**/ITestS3AEncryptionSSE*.java</exclude>
+                    <exclude>**/ITestS3AEncryptionSSEC*.java</exclude>
                     <exclude>**/ITestS3AHuge*.java</exclude>
@@ -214,7 +214,7 @@
                     <include>**/ITestS3NContractRootDir.java</include>
                     <include>**/ITestS3AFileContextStatistics.java</include>
                     <include>**/ITestS3AHuge*.java</include>
-                    <include>**/ITestS3AEncryptionSSE*.java</include>
+                    <include>**/ITestS3AEncryptionSSEC*.java</include>
diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md
index 2cde730fbf..ceda79e687 100644
--- a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md
+++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md
@@ -1468,6 +1468,52 @@ basis.
 to set fadvise policies on input streams. Once implemented,
 this will become the supported mechanism used for configuring the input IO policy.
+
+### Encrypting objects with S3A
+
+Currently, S3A only supports S3's Server Side Encryption for at-rest data
+encryption. Users are *encouraged* to read the
+[AWS documentation](https://docs.aws.amazon.com/AmazonS3/latest/dev/serv-side-encryption.html)
+on S3 Server Side Encryption before using these options, as each behaves
+differently and the AWS documentation will be more up to date on their
+behavior. An encryption method configured in `core-site.xml` applies
+cluster-wide: any new file written will be encrypted with that configuration.
+Existing files, when read, are decrypted using the method they were written
+with (if possible) and are not re-encrypted with the new method. If multiple
+keys are mixed, it is also possible that a user will lack access to the key
+needed to decrypt an object. It is **NOT** advised to mix and match encryption
+types in a bucket; it is *strongly* recommended to use just one type and one
+key per bucket.
+
+SSE-S3 is where S3 manages the encryption keys for each object. For this mode,
+`fs.s3a.server-side-encryption-algorithm` is set to `AES256`.
+
+SSE-KMS is where the user specifies a Customer Master Key (CMK) that is used
+to encrypt the objects. The user may specify a specific CMK or leave
+`fs.s3a.server-side-encryption-key` empty to use the default auto-generated
+key in AWS IAM. Each CMK configured in AWS IAM is region-specific and cannot
+be used with an S3 bucket in a different region. Policies assigned to the CMK
+may also prohibit or restrict its use by particular users, causing S3A
+requests to fail.
+
+SSE-C is where the user supplies and manages an actual base64-encoded AES-256
+key.
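+
+As a sketch of how each mode is selected in `core-site.xml` (the `SSE-KMS`
+and `SSE-C` algorithm names, the ARN, and the key values below are
+illustrative assumptions, not prescriptions):
+
+```
+<!-- SSE-S3: S3 manages the keys; no key property is required. -->
+<property>
+  <name>fs.s3a.server-side-encryption-algorithm</name>
+  <value>AES256</value>
+</property>
+
+<!-- SSE-KMS: reference a CMK by a (hypothetical) ARN,
+     or leave the key unset to use the default key. -->
+<property>
+  <name>fs.s3a.server-side-encryption-algorithm</name>
+  <value>SSE-KMS</value>
+</property>
+<property>
+  <name>fs.s3a.server-side-encryption-key</name>
+  <value>arn:aws:kms:us-west-2:123456789012:key/11111111-2222-3333-4444-555555555555</value>
+</property>
+
+<!-- SSE-C: supply the base64-encoded AES-256 key itself;
+     this sample value is the key used by the S3A tests. -->
+<property>
+  <name>fs.s3a.server-side-encryption-algorithm</name>
+  <value>SSE-C</value>
+</property>
+<property>
+  <name>fs.s3a.server-side-encryption-key</name>
+  <value>kX7SdwVc/1VXJr76kfKnkQ3ONYhxianyL2+C3rPVT9s=</value>
+</property>
+```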
+
+#### SSE-C Warning
+
+It is strongly recommended to fully understand how SSE-C works in the S3
+environment before using this encryption type. Please refer to the Server Side
+Encryption documentation available from AWS. SSE-C is only recommended for
+advanced users with advanced encryption use cases. Failure to properly manage
+encryption keys can cause data loss. Currently, the AWS S3 API (and thus S3A)
+only supports one encryption key at a time and cannot decrypt objects written
+under a previous key when moving them to a new destination. It is **NOT**
+advised to use multiple encryption keys in a bucket; it is recommended to use
+one key per bucket and to not change that key. This is because when a request
+is made to S3, the actual encryption key must be provided to decrypt the
+object and access its metadata. Since only one encryption key can be provided
+at a time, S3A will not be able to pass the correct encryption key to decrypt
+the data. Please see the troubleshooting section for more information.
+
 
 ## Troubleshooting S3A
 
 Common problems working with S3A are
 
@@ -1931,6 +1977,41 @@ if it is required that the data is persisted durably after every
 `flush()/hflush()` call. This includes resilient logging, HBase-style
 journalling and the like. The standard strategy here is to save to HDFS and
 then copy to S3.
+
+### S3 Server Side Encryption
+
+#### Using SSE-KMS
+
+When performing file operations, the user may run into an issue where the KMS
+key ARN is invalid.
+```
+com.amazonaws.services.s3.model.AmazonS3Exception:
+Invalid arn (Service: Amazon S3; Status Code: 400; Error Code: KMS.NotFoundException; Request ID: 708284CF60EE233F),
+S3 Extended Request ID: iHUUtXUSiNz4kv3Bdk/hf9F+wjPt8GIVvBHx/HEfCBYkn7W6zmpvbA3XT7Y5nTzcZtfuhcqDunw=:
+Invalid arn (Service: Amazon S3; Status Code: 400; Error Code: KMS.NotFoundException; Request ID: 708284CF60EE233F)
+```
+
+This means that either the KMS key ID was entered incorrectly, or the KMS key
+is in a different region than the S3 bucket being used.
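+
+A quick check (a sketch; the ARN below is an illustrative placeholder) is
+that the region embedded in the configured key ARN matches the region of
+the bucket:
+
+```
+<property>
+  <name>fs.s3a.server-side-encryption-key</name>
+  <!-- "us-west-2" here must match the region of the S3 bucket. -->
+  <value>arn:aws:kms:us-west-2:123456789012:key/11111111-2222-3333-4444-555555555555</value>
+</property>
+```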
+
+#### Using SSE-C
+
+When performing file operations, the user may run into an unexpected 400/403
+error such as
+```
+org.apache.hadoop.fs.s3a.AWSS3IOException: getFileStatus on fork-4/: com.amazonaws.services.s3.model.AmazonS3Exception:
+Bad Request (Service: Amazon S3; Status Code: 400;
+Error Code: 400 Bad Request; Request ID: 42F9A1987CB49A99),
+S3 Extended Request ID: jU2kcwaXnWj5APB14Cgb1IKkc449gu2+dhIsW/+7x9J4D+VUkKvu78mBo03oh9jnOT2eoTLdECU=:
+Bad Request (Service: Amazon S3; Status Code: 400; Error Code: 400 Bad Request; Request ID: 42F9A1987CB49A99)
+```
+
+This can happen when the correct SSE-C encryption key is not supplied. Such
+cases include the following:
+1. An object is encrypted using SSE-C on S3 and the wrong encryption type is
+used, no encryption is specified, or the SSE-C key specified is incorrect.
+2. A directory is encrypted with SSE-C keyA and the user tries to move a
+file into that structure while configured with SSE-C keyB.
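+
+In both cases, the remedy is to configure every client with the one SSE-C key
+the objects were actually written with. A minimal sketch, reusing the sample
+values from the encryption section above:
+
+```
+<property>
+  <name>fs.s3a.server-side-encryption-algorithm</name>
+  <value>SSE-C</value>
+</property>
+<property>
+  <name>fs.s3a.server-side-encryption-key</name>
+  <!-- Must be the exact base64 AES-256 key the objects were encrypted with. -->
+  <value>kX7SdwVc/1VXJr76kfKnkQ3ONYhxianyL2+C3rPVT9s=</value>
+</property>
+```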
+
 
 ### Other issues
 
 *Performance slow*
 
diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/AbstractTestS3AEncryption.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/AbstractTestS3AEncryption.java
index 515094295b..108f102c67 100644
--- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/AbstractTestS3AEncryption.java
+++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/AbstractTestS3AEncryption.java
@@ -85,6 +85,10 @@ protected String createFilename(int len) {
     return String.format("%s-%04x", methodName.getMethodName(), len);
   }
 
+  protected String createFilename(String name) {
+    return String.format("%s-%s", methodName.getMethodName(), name);
+  }
+
   /**
    * Assert that at path references an encrypted blob.
    * @param path path
diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AEncryptionSSEC.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AEncryptionSSEC.java
index a8cf70b931..91be8b9501 100644
--- a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AEncryptionSSEC.java
+++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/ITestS3AEncryptionSSEC.java
@@ -23,6 +23,8 @@ import static org.apache.hadoop.fs.s3a.S3ATestUtils.skipIfEncryptionTestsDisabled;
 import static org.apache.hadoop.test.LambdaTestUtils.intercept;
 
+import java.io.IOException;
+
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
@@ -51,36 +53,318 @@ protected Configuration createConfiguration() {
    * This will create and write to a file using encryption key A, then attempt
    * to read from it again with encryption key B. This will not work as it
    * cannot decrypt the file.
+   *
+   * This is expected AWS S3 SSE-C behavior.
+   *
    * @throws Exception
    */
   @Test
   public void testCreateFileAndReadWithDifferentEncryptionKey() throws
-      Exception {
+      Exception {
+    assumeEnabled();
+    skipIfEncryptionTestsDisabled(getConfiguration());
+
     final Path[] path = new Path[1];
     intercept(java.nio.file.AccessDeniedException.class,
         "Service: Amazon S3; Status Code: 403;", () -> {
 
           int len = 2048;
-          skipIfEncryptionTestsDisabled(getConfiguration());
           describe("Create an encrypted file of size " + len);
           String src = createFilename(len);
           path[0] = writeThenReadFile(src, len);
 
-          Configuration conf = this.createConfiguration();
-          conf.set(Constants.SERVER_SIDE_ENCRYPTION_KEY,
-              "kX7SdwVc/1VXJr76kfKnkQ3ONYhxianyL2+C3rPVT9s=");
-
-          S3AContract contract = (S3AContract) createContract(conf);
-          contract.init();
-          //skip tests if they aren't enabled
-          assumeEnabled();
           //extract the test FS
-          FileSystem fileSystem = contract.getTestFileSystem();
+          FileSystem fileSystem = createNewFileSystemWithSSECKey(
+              "kX7SdwVc/1VXJr76kfKnkQ3ONYhxianyL2+C3rPVT9s=");
           byte[] data = dataset(len, 'a', 'z');
           ContractTestUtils.verifyFileContents(fileSystem, path[0], data);
           throw new Exception("Fail");
         });
-    rm(getFileSystem(), path[0], false, false);
+  }
+
+  /**
+   * While each object has its own key and should be distinct, this verifies
+   * that Hadoop treats object keys as a filesystem path. So if a top-level
+   * dir is encrypted with keyA, a sublevel dir cannot be accessed with a
+   * different keyB.
+   *
+   * This is expected AWS S3 SSE-C behavior.
+   *
+   * @throws Exception
+   */
+  @Test
+  public void testCreateSubdirWithDifferentKey() throws Exception {
+    assumeEnabled();
+    skipIfEncryptionTestsDisabled(getConfiguration());
+
+    final Path[] path = new Path[1];
+    intercept(java.nio.file.AccessDeniedException.class,
+        "Service: Amazon S3; Status Code: 403;", () -> {
+
+          path[0] = S3ATestUtils.createTestPath(
+              new Path(createFilename("dir/"))
+          );
+          Path nestedDirectory = S3ATestUtils.createTestPath(
+              new Path(createFilename("dir/nestedDir/"))
+          );
+          FileSystem fsKeyB = createNewFileSystemWithSSECKey(
+              "G61nz31Q7+zpjJWbakxfTOZW4VS0UmQWAq2YXhcTXoo=");
+          getFileSystem().mkdirs(path[0]);
+          fsKeyB.mkdirs(nestedDirectory);
+
+          throw new Exception("Exception should be thrown.");
+        });
+    rm(getFileSystem(), path[0], true, false);
+  }
+
+  /**
+   * Ensures a file can't be created with keyA and then renamed with a
+   * different key.
+   *
+   * This is expected AWS S3 SSE-C behavior.
+   *
+   * @throws Exception
+   */
+  @Test
+  public void testCreateFileThenMoveWithDifferentSSECKey() throws Exception {
+    assumeEnabled();
+    skipIfEncryptionTestsDisabled(getConfiguration());
+
+    final Path[] path = new Path[1];
+    intercept(java.nio.file.AccessDeniedException.class,
+        "Service: Amazon S3; Status Code: 403;", () -> {
+
+          int len = 2048;
+          String src = createFilename(len);
+          path[0] = writeThenReadFile(src, len);
+
+          FileSystem fsKeyB = createNewFileSystemWithSSECKey(
+              "NTx0dUPrxoo9+LbNiT/gqf3z9jILqL6ilismFmJO50U=");
+          fsKeyB.rename(path[0], new Path(createFilename("different-path.txt")));
+
+          throw new Exception("Exception should be thrown.");
+        });
+  }
+
+  /**
+   * General test to make sure rename works with SSE-C when the same key is
+   * used throughout, unlike with multiple keys.
+   *
+   * @throws Exception
+   */
+  @Test
+  public void testRenameFile() throws Exception {
+    assumeEnabled();
+    skipIfEncryptionTestsDisabled(getConfiguration());
+
+    String src = createFilename("original-path.txt");
+    Path path = writeThenReadFile(src, 2048);
+    Path newPath = path(createFilename("different-path.txt"));
+    getFileSystem().rename(path, newPath);
+    byte[] data = dataset(2048, 'a', 'z');
+    ContractTestUtils.verifyFileContents(getFileSystem(), newPath, data);
+  }
+
+  /**
+   * It is possible to list the contents of nested directories up to, but not
+   * including, the innermost directory, even without the correct key. This
+   * is due to how S3A mimics directories and how prefixes work in S3.
+   * @throws Exception
+   */
+  @Test
+  public void testListEncryptedDir() throws Exception {
+    assumeEnabled();
+    skipIfEncryptionTestsDisabled(getConfiguration());
+
+    Path nestedDirectory = S3ATestUtils.createTestPath(
+        path(createFilename("/a/b/c/"))
+    );
+    assertTrue(getFileSystem().mkdirs(nestedDirectory));
+
+    FileSystem fsKeyB = createNewFileSystemWithSSECKey(
+        "msdo3VvvZznp66Gth58a91Hxe/UpExMkwU9BHkIjfW8=");
+
+    fsKeyB.listFiles(S3ATestUtils.createTestPath(
+        path(createFilename("/a/"))
+    ), true);
+    fsKeyB.listFiles(S3ATestUtils.createTestPath(
+        path(createFilename("/a/b/"))
+    ), true);
+
+    //Until this point, no exception is thrown about access
+    intercept(java.nio.file.AccessDeniedException.class,
+        "Service: Amazon S3; Status Code: 403;", () -> {
+          fsKeyB.listFiles(S3ATestUtils.createTestPath(
+              path(createFilename("/a/b/c/"))
+          ), false);
+          throw new Exception("Exception should be thrown.");
+        });
+
+    Configuration conf = this.createConfiguration();
+    conf.unset(Constants.SERVER_SIDE_ENCRYPTION_ALGORITHM);
+    conf.unset(Constants.SERVER_SIDE_ENCRYPTION_KEY);
+
+    S3AContract contract = (S3AContract) createContract(conf);
+    contract.init();
+    FileSystem unencryptedFileSystem = contract.getTestFileSystem();
+
+    //unencrypted can access until the final directory
+    unencryptedFileSystem.listFiles(S3ATestUtils.createTestPath(
+        path(createFilename("/a/"))
+    ), true);
+    unencryptedFileSystem.listFiles(S3ATestUtils.createTestPath(
+        path(createFilename("/a/b/"))
+    ), true);
+    intercept(org.apache.hadoop.fs.s3a.AWSS3IOException.class,
+        "Bad Request (Service: Amazon S3; Status Code: 400; Error"
+            + " Code: 400 Bad Request;", () -> {
+
+          unencryptedFileSystem.listFiles(S3ATestUtils.createTestPath(
+              path(createFilename("/a/b/c/"))
+          ), false);
+          throw new Exception("Exception should be thrown.");
+        });
+    rm(getFileSystem(), path(createFilename("/")), true, false);
+  }
+
+  /**
+   * Much like the above list encrypted directory test, you cannot get the
+   * metadata of an object without the correct encryption key.
+   * @throws Exception
+   */
+  @Test
+  public void testListStatusEncryptedDir() throws Exception {
+    assumeEnabled();
+    skipIfEncryptionTestsDisabled(getConfiguration());
+
+    Path nestedDirectory = S3ATestUtils.createTestPath(
+        path(createFilename("/a/b/c/"))
+    );
+    assertTrue(getFileSystem().mkdirs(nestedDirectory));
+
+    FileSystem fsKeyB = createNewFileSystemWithSSECKey(
+        "msdo3VvvZznp66Gth58a91Hxe/UpExMkwU9BHkIjfW8=");
+
+    fsKeyB.listStatus(S3ATestUtils.createTestPath(
+        path(createFilename("/a/"))));
+    fsKeyB.listStatus(S3ATestUtils.createTestPath(
+        path(createFilename("/a/b/"))));
+
+    //Until this point, no exception is thrown about access
+    intercept(java.nio.file.AccessDeniedException.class,
+        "Service: Amazon S3; Status Code: 403;", () -> {
+          fsKeyB.listStatus(S3ATestUtils.createTestPath(
+              path(createFilename("/a/b/c/"))));
+
+          throw new Exception("Exception should be thrown.");
+        });
+
+    //Now try it with an unencrypted filesystem.
+    Configuration conf = this.createConfiguration();
+    conf.unset(Constants.SERVER_SIDE_ENCRYPTION_ALGORITHM);
+    conf.unset(Constants.SERVER_SIDE_ENCRYPTION_KEY);
+
+    S3AContract contract = (S3AContract) createContract(conf);
+    contract.init();
+    FileSystem unencryptedFileSystem = contract.getTestFileSystem();
+
+    //unencrypted can access until the final directory
+    unencryptedFileSystem.listStatus(S3ATestUtils.createTestPath(
+        path(createFilename("/a/"))));
+    unencryptedFileSystem.listStatus(S3ATestUtils.createTestPath(
+        path(createFilename("/a/b/"))));
+
+    intercept(org.apache.hadoop.fs.s3a.AWSS3IOException.class,
+        "Bad Request (Service: Amazon S3; Status Code: 400; Error Code: 400"
+            + " Bad Request;", () -> {
+
+          unencryptedFileSystem.listStatus(S3ATestUtils.createTestPath(
+              path(createFilename("/a/b/c/"))));
+          throw new Exception("Exception should be thrown.");
+        });
+    rm(getFileSystem(), path(createFilename("/")), true, false);
+  }
+
+  /**
+   * Much like trying to access an encrypted directory, an encrypted file
+   * cannot have its metadata read, since both are technically objects.
+   * @throws Exception
+   */
+  @Test
+  public void testListStatusEncryptedFile() throws Exception {
+    assumeEnabled();
+    skipIfEncryptionTestsDisabled(getConfiguration());
+
+    Path nestedDirectory = S3ATestUtils.createTestPath(
+        path(createFilename("/a/b/c/"))
+    );
+    assertTrue(getFileSystem().mkdirs(nestedDirectory));
+
+    String src = createFilename("/a/b/c/fileToStat.txt");
+    Path fileToStat = writeThenReadFile(src, 2048);
+
+    FileSystem fsKeyB = createNewFileSystemWithSSECKey(
+        "msdo3VvvZznp66Gth58a91Hxe/UpExMkwU9BHkIjfW8=");
+
+    //Until this point, no exception is thrown about access
+    intercept(java.nio.file.AccessDeniedException.class,
+        "Service: Amazon S3; Status Code: 403;", () -> {
+          fsKeyB.listStatus(S3ATestUtils.createTestPath(fileToStat));
+
+          throw new Exception("Exception should be thrown.");
+        });
+    rm(getFileSystem(), path(createFilename("/")), true, false);
+  }
+
+  /**
+   * It is possible to delete directories, and the hierarchy above them,
+   * without the proper encryption key; deleting an individual file with the
+   * wrong key, however, is rejected.
+   *
+   * @throws Exception
+   */
+  @Test
+  public void testDeleteEncryptedObjectWithDifferentKey() throws Exception {
+    assumeEnabled();
+    skipIfEncryptionTestsDisabled(getConfiguration());
+
+    Path nestedDirectory = S3ATestUtils.createTestPath(
+        path(createFilename("/a/b/c/"))
+    );
+    assertTrue(getFileSystem().mkdirs(nestedDirectory));
+    String src = createFilename("/a/b/c/filetobedeleted.txt");
+    Path fileToDelete = writeThenReadFile(src, 2048);
+
+    FileSystem fsKeyB = createNewFileSystemWithSSECKey(
+        "msdo3VvvZznp66Gth58a91Hxe/UpExMkwU9BHkIjfW8=");
+    intercept(java.nio.file.AccessDeniedException.class,
+        "Forbidden (Service: Amazon S3; Status Code: 403; Error Code: "
+            + "403 Forbidden", () -> {
+
+          fsKeyB.delete(fileToDelete, false);
+          throw new Exception("Exception should be thrown.");
+        });
+
+    //This is possible
+    fsKeyB.delete(S3ATestUtils.createTestPath(
+        path(createFilename("/a/b/c/"))), true);
+    fsKeyB.delete(S3ATestUtils.createTestPath(
+        path(createFilename("/a/b/"))), true);
+    fsKeyB.delete(S3ATestUtils.createTestPath(
+        path(createFilename("/a/"))), true);
+  }
+
+  /**
+   * Create a new filesystem client configured with the given SSE-C key.
+   * @param sseCKey base64-encoded AES-256 key to use for SSE-C
+   * @return a new filesystem using the key
+   * @throws IOException on any failure to create the filesystem
+   */
+  private FileSystem createNewFileSystemWithSSECKey(String sseCKey) throws
+      IOException {
+    Configuration conf = this.createConfiguration();
+    conf.set(Constants.SERVER_SIDE_ENCRYPTION_KEY, sseCKey);
+
+    S3AContract contract = (S3AContract) createContract(conf);
+    contract.init();
+    FileSystem fileSystem = contract.getTestFileSystem();
+    return fileSystem;
   }
 
   @Override