From be13e94843bd822f62d4616aee311decc906927d Mon Sep 17 00:00:00 2001 From: DieterDP <90392398+DieterDP-ng@users.noreply.github.com> Date: Fri, 2 Feb 2024 12:49:31 +0100 Subject: [PATCH] HADOOP-18987. Various fixes to FileSystem API docs (#6292) Contributed by Dieter De Paepe --- .../src/site/markdown/filesystem/abortable.md | 3 +- .../site/markdown/filesystem/filesystem.md | 97 ++++++++++--------- .../markdown/filesystem/fsdatainputstream.md | 8 +- .../src/site/markdown/filesystem/model.md | 28 +++--- .../src/site/markdown/filesystem/notation.md | 8 +- 5 files changed, 75 insertions(+), 69 deletions(-) diff --git a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/abortable.md b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/abortable.md index 7e6ea01a8f..dc7677bd9a 100644 --- a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/abortable.md +++ b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/abortable.md @@ -88,14 +88,13 @@ for example. output streams returned by the S3A FileSystem. The stream MUST implement `Abortable` and `StreamCapabilities`. ```python - if unsupported: +if unsupported: throw UnsupportedException if not isOpen(stream): no-op StreamCapabilities.hasCapability("fs.capability.outputstream.abortable") == True - ``` diff --git a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/filesystem.md b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/filesystem.md index 5fba8a2515..518026876b 100644 --- a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/filesystem.md +++ b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/filesystem.md @@ -64,13 +64,13 @@ a protected directory, result in such an exception being raised. ### `boolean isDirectory(Path p)` - def isDirectory(FS, p)= p in directories(FS) + def isDir(FS, p) = p in directories(FS) ### `boolean isFile(Path p)` - def isFile(FS, p) = p in files(FS) + def isFile(FS, p) = p in filenames(FS) ### `FileStatus getFileStatus(Path p)` @@ -250,7 +250,7 @@ process. changes are made to the filesystem, the result of `listStatus(parent(P))` SHOULD include the value of `getFileStatus(P)`. -* After an entry at path `P` is created, and before any other +* After an entry at path `P` is deleted, and before any other changes are made to the filesystem, the result of `listStatus(parent(P))` SHOULD NOT include the value of `getFileStatus(P)`. @@ -305,7 +305,7 @@ that they must all be listed, and, at the time of listing, exist. All paths must exist. There is no requirement for uniqueness. forall p in paths : - exists(fs, p) else raise FileNotFoundException + exists(FS, p) else raise FileNotFoundException #### Postconditions @@ -381,7 +381,7 @@ being completely performed. Path `path` must exist: - exists(FS, path) : raise FileNotFoundException + if not exists(FS, path) : raise FileNotFoundException #### Postconditions @@ -432,7 +432,7 @@ of data which must be collected in a single RPC call. #### Preconditions - exists(FS, path) else raise FileNotFoundException + if not exists(FS, path) : raise FileNotFoundException ### Postconditions @@ -463,7 +463,7 @@ and 1 for file count. #### Preconditions - exists(FS, path) else raise FileNotFoundException + if not exists(FS, path) : raise FileNotFoundException #### Postconditions @@ -567,7 +567,7 @@ when writing objects to a path in the filesystem. 
#### Postconditions - result = integer >= 0 + result = integer >= 0 The outcome of this operation is usually identical to `getDefaultBlockSize()`, with no checks for the existence of the given path. @@ -591,12 +591,12 @@ on the filesystem. #### Preconditions - if not exists(FS, p) : raise FileNotFoundException + if not exists(FS, p) : raise FileNotFoundException #### Postconditions - if len(FS, P) > 0: getFileStatus(P).getBlockSize() > 0 + if len(FS, P) > 0 : getFileStatus(P).getBlockSize() > 0 result == getFileStatus(P).getBlockSize() 1. The outcome of this operation MUST be identical to the value of @@ -654,12 +654,12 @@ No ancestor may be a file forall d = ancestors(FS, p) : if exists(FS, d) and not isDir(FS, d) : - raise [ParentNotDirectoryException, FileAlreadyExistsException, IOException] + raise {ParentNotDirectoryException, FileAlreadyExistsException, IOException} #### Postconditions - FS' where FS'.Directories' = FS.Directories + [p] + ancestors(FS, p) + FS' where FS'.Directories = FS.Directories + [p] + ancestors(FS, p) result = True @@ -688,7 +688,7 @@ The return value is always true—even if a new directory is not created The file must not exist for a no-overwrite create: - if not overwrite and isFile(FS, p) : raise FileAlreadyExistsException + if not overwrite and isFile(FS, p) : raise FileAlreadyExistsException Writing to or overwriting a directory must fail. @@ -698,7 +698,7 @@ No ancestor may be a file forall d = ancestors(FS, p) : if exists(FS, d) and not isDir(FS, d) : - raise [ParentNotDirectoryException, FileAlreadyExistsException, IOException] + raise {ParentNotDirectoryException, FileAlreadyExistsException, IOException} FileSystems may reject the request for other reasons, such as the FS being read-only (HDFS), @@ -712,8 +712,8 @@ For instance, HDFS may raise an `InvalidPathException`. #### Postconditions FS' where : - FS'.Files'[p] == [] - ancestors(p) is-subset-of FS'.Directories' + FS'.Files[p] == [] + ancestors(p) subset-of FS'.Directories result = FSDataOutputStream @@ -734,7 +734,7 @@ The behavior of the returned stream is covered in [Output](outputstream.html). clients creating files with `overwrite==true` to fail if the file is created by another client between the two tests. -* The S3A and potentially other Object Stores connectors not currently change the `FS` state +* The S3A and potentially other Object Stores connectors currently don't change the `FS` state until the output stream `close()` operation is completed. This is a significant difference between the behavior of object stores and that of filesystems, as it allows >1 client to create a file with `overwrite=false`, @@ -762,15 +762,15 @@ The behavior of the returned stream is covered in [Output](outputstream.html). #### Implementation Notes `createFile(p)` returns a `FSDataOutputStreamBuilder` only and does not make -change on filesystem immediately. When `build()` is invoked on the `FSDataOutputStreamBuilder`, +changes on the filesystem immediately. When `build()` is invoked on the `FSDataOutputStreamBuilder`, the builder parameters are verified and [`create(Path p)`](#FileSystem.create) is invoked on the underlying filesystem. `build()` has the same preconditions and postconditions as [`create(Path p)`](#FileSystem.create). * Similar to [`create(Path p)`](#FileSystem.create), files are overwritten -by default, unless specify `builder.overwrite(false)`. +by default, unless specified by `builder.overwrite(false)`. 
* Unlike [`create(Path p)`](#FileSystem.create), missing parent directories are -not created by default, unless specify `builder.recursive()`. +not created by default, unless specified by `builder.recursive()`. ### `FSDataOutputStream append(Path p, int bufferSize, Progressable progress)` @@ -780,14 +780,14 @@ Implementations without a compliant call SHOULD throw `UnsupportedOperationExcep if not exists(FS, p) : raise FileNotFoundException - if not isFile(FS, p) : raise [FileAlreadyExistsException, FileNotFoundException, IOException] + if not isFile(FS, p) : raise {FileAlreadyExistsException, FileNotFoundException, IOException} #### Postconditions FS' = FS result = FSDataOutputStream -Return: `FSDataOutputStream`, which can update the entry `FS.Files[p]` +Return: `FSDataOutputStream`, which can update the entry `FS'.Files[p]` by appending data to the existing list. The behavior of the returned stream is covered in [Output](outputstream.html). @@ -813,7 +813,7 @@ Implementations without a compliant call SHOULD throw `UnsupportedOperationExcep #### Preconditions - if not isFile(FS, p)) : raise [FileNotFoundException, IOException] + if not isFile(FS, p)) : raise {FileNotFoundException, IOException} This is a critical precondition. Implementations of some FileSystems (e.g. Object stores) could shortcut one round trip by postponing their HTTP GET @@ -842,7 +842,7 @@ The result MUST be the same for local and remote callers of the operation. symbolic links 1. HDFS throws `IOException("Cannot open filename " + src)` if the path -exists in the metadata, but no copies of any its blocks can be located; +exists in the metadata, but no copies of its blocks can be located; -`FileNotFoundException` would seem more accurate and useful. ### `FSDataInputStreamBuilder openFile(Path path)` @@ -861,7 +861,7 @@ Implementations without a compliant call MUST throw `UnsupportedOperationExcepti let stat = getFileStatus(Path p) let FS' where: - (FS.Directories', FS.Files', FS.Symlinks') + (FS'.Directories, FS.Files', FS'.Symlinks) p' in paths(FS') where: exists(FS, stat.path) implies exists(FS', p') @@ -931,16 +931,16 @@ metadata in the `PathHandle` to detect references from other namespaces. ### `FSDataInputStream open(PathHandle handle, int bufferSize)` -Implementaions without a compliant call MUST throw `UnsupportedOperationException` +Implementations without a compliant call MUST throw `UnsupportedOperationException` #### Preconditions let fd = getPathHandle(FileStatus stat) if stat.isdir : raise IOException let FS' where: - (FS.Directories', FS.Files', FS.Symlinks') - p' in FS.Files' where: - FS.Files'[p'] = fd + (FS'.Directories, FS.Files', FS'.Symlinks) + p' in FS'.Files where: + FS'.Files[p'] = fd if not exists(FS', p') : raise InvalidPathHandleException The implementation MUST resolve the referent of the `PathHandle` following @@ -951,7 +951,7 @@ encoded in the `PathHandle`. #### Postconditions - result = FSDataInputStream(0, FS.Files'[p']) + result = FSDataInputStream(0, FS'.Files[p']) The stream returned is subject to the constraints of a stream returned by `open(Path)`. 
Constraints checked on open MAY hold to hold for the stream, but @@ -1006,7 +1006,7 @@ A directory with children and `recursive == False` cannot be deleted If the file does not exist the filesystem state does not change - if not exists(FS, p): + if not exists(FS, p) : FS' = FS result = False @@ -1089,7 +1089,7 @@ Some of the object store based filesystem implementations always return false when deleting the root, leaving the state of the store unchanged. if isRoot(p) : - FS ' = FS + FS' = FS result = False This is irrespective of the recursive flag status or the state of the directory. @@ -1152,7 +1152,7 @@ has been calculated. Source `src` must exist: - exists(FS, src) else raise FileNotFoundException + if not exists(FS, src) : raise FileNotFoundException `dest` cannot be a descendant of `src`: @@ -1162,7 +1162,7 @@ This implicitly covers the special case of `isRoot(FS, src)`. `dest` must be root, or have a parent that exists: - isRoot(FS, dest) or exists(FS, parent(dest)) else raise IOException + if not (isRoot(FS, dest) or exists(FS, parent(dest))) : raise IOException The parent path of a destination must not be a file: @@ -1240,7 +1240,8 @@ There is no consistent behavior here. The outcome is no change to FileSystem state, with a return value of false. - FS' = FS; result = False + FS' = FS + result = False *Local Filesystem* @@ -1319,28 +1320,31 @@ Implementations without a compliant call SHOULD throw `UnsupportedOperationExcep All sources MUST be in the same directory: - for s in sources: if parent(S) != parent(p) raise IllegalArgumentException + for s in sources: + if parent(s) != parent(p) : raise IllegalArgumentException All block sizes must match that of the target: - for s in sources: getBlockSize(FS, S) == getBlockSize(FS, p) + for s in sources: + getBlockSize(FS, s) == getBlockSize(FS, p) No duplicate paths: - not (exists p1, p2 in (sources + [p]) where p1 == p2) + let input = sources + [p] + not (exists i, j: i != j and input[i] == input[j]) HDFS: All source files except the final one MUST be a complete block: for s in (sources[0:length(sources)-1] + [p]): - (length(FS, s) mod getBlockSize(FS, p)) == 0 + (length(FS, s) mod getBlockSize(FS, p)) == 0 #### Postconditions FS' where: - (data(FS', T) = data(FS, T) + data(FS, sources[0]) + ... + data(FS, srcs[length(srcs)-1])) - and for s in srcs: not exists(FS', S) + (data(FS', p) = data(FS, p) + data(FS, sources[0]) + ... + data(FS, sources[length(sources)-1])) + for s in sources: not exists(FS', s) HDFS's restrictions may be an implementation detail of how it implements @@ -1360,7 +1364,7 @@ Implementations without a compliant call SHOULD throw `UnsupportedOperationExcep if not exists(FS, p) : raise FileNotFoundException - if isDir(FS, p) : raise [FileNotFoundException, IOException] + if isDir(FS, p) : raise {FileNotFoundException, IOException} if newLength < 0 || newLength > len(FS.Files[p]) : raise HadoopIllegalArgumentException @@ -1369,8 +1373,7 @@ Truncate cannot be performed on a file, which is open for writing or appending. #### Postconditions - FS' where: - len(FS.Files[p]) = newLength + len(FS'.Files[p]) = newLength Return: `true`, if truncation is finished and the file can be immediately opened for appending, or `false` otherwise. 
@@ -1399,7 +1402,7 @@ Source and destination must be different if src = dest : raise FileExistsException ``` -Destination and source must not be descendants one another +Destination and source must not be descendants of one another ```python if isDescendant(src, dest) or isDescendant(dest, src) : raise IOException ``` @@ -1429,7 +1432,7 @@ Given a base path on the source `base` and a child path `child` where `base` is ```python def final_name(base, child, dest): - is base = child: + if base == child: return dest else: return dest + childElements(base, child) @@ -1557,7 +1560,7 @@ while (iterator.hasNext()) { As raising exceptions is an expensive operation in JVMs, the `while(hasNext())` loop option is more efficient. (see also [Concurrency and the Remote Iterator](#RemoteIteratorConcurrency) -for a dicussion on this topic). +for a discussion on this topic). Implementors of the interface MUST support both forms of iterations; authors of tests SHOULD verify that both iteration mechanisms work. diff --git a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/fsdatainputstream.md b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/fsdatainputstream.md index f64a2bd03b..3820d0b8af 100644 --- a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/fsdatainputstream.md +++ b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/fsdatainputstream.md @@ -55,7 +55,7 @@ with access functions: file as returned by `FileSystem.getFileStatus(Path p)` forall p in dom(FS.Files[p]) : - len(data(FSDIS)) == FS.getFileStatus(p).length + len(data(FSDIS)) == FS.getFileStatus(p).length ### `Closeable.close()` @@ -259,8 +259,8 @@ Examples: `RawLocalFileSystem` , `HttpFSFileSystem` If the operation is supported and there is a new location for the data: - FSDIS' = (pos, data', true) - result = True + FSDIS' = (pos, data', true) + result = True The new data is the original data (or an updated version of it, as covered in the Consistency section below), but the block containing the data at `offset` @@ -268,7 +268,7 @@ is sourced from a different replica. If there is no other copy, `FSDIS` is not updated; the response indicates this: - result = False + result = False Outside of test methods, the primary use of this method is in the {{FSInputChecker}} class, which can react to a checksum error in a read by attempting to source diff --git a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/model.md b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/model.md index e121c92dee..7507d7a103 100644 --- a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/model.md +++ b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/model.md @@ -108,21 +108,21 @@ such as `rename`. 
## Defining the Filesystem -A filesystem `FS` contains a set of directories, a dictionary of paths and a dictionary of symbolic links +A filesystem `FS` contains directories (a set of paths), files (a mapping of a path to a list of bytes) and symlinks (a set of paths mapping to paths) - (Directories:Set[Path], Files:[Path:List[byte]], Symlinks:Set[Path]) + (Directories:Set[Path], Files:Map[Path:List[byte]], Symlinks:Map[Path:Path]) Accessor functions return the specific element of a filesystem - def FS.Directories = FS.Directories + def directories(FS) = FS.Directories def files(FS) = FS.Files - def symlinks(FS) = FS.Symlinks + def symlinks(FS) = keys(FS.Symlinks) def filenames(FS) = keys(FS.Files) The entire set of a paths finite subset of all possible Paths, and functions to resolve a path to data, a directory predicate or a symbolic link: - def paths(FS) = FS.Directories + filenames(FS) + FS.Symlinks) + def paths(FS) = FS.Directories + filenames(FS) + symlinks(FS) A path is deemed to exist if it is in this aggregate set: @@ -169,10 +169,10 @@ in a set, hence no children with duplicate names. A path *D* is a descendant of a path *P* if it is the direct child of the path *P* or an ancestor is a direct child of path *P*: - def isDescendant(P, D) = parent(D) == P where isDescendant(P, parent(D)) + def isDescendant(P, D) = parent(D) == P or isDescendant(P, parent(D)) The descendants of a directory P are all paths in the filesystem whose -path begins with the path P -that is their parent is P or an ancestor is P +path begins with the path P, i.e. their parent is P or an ancestor is P def descendants(FS, D) = {p for p in paths(FS) where isDescendant(D, p)} @@ -181,7 +181,7 @@ path begins with the path P -that is their parent is P or an ancestor is P A path MAY refer to a file that has data in the filesystem; its path is a key in the data dictionary - def isFile(FS, p) = p in FS.Files + def isFile(FS, p) = p in keys(FS.Files) ### Symbolic references @@ -193,6 +193,10 @@ A path MAY refer to a symbolic link: ### File Length +Files store data: + + def data(FS, p) = files(FS)[p] + The length of a path p in a filesystem FS is the length of the data stored, or 0 if it is a directory: def length(FS, p) = if isFile(p) : return length(data(FS, p)) else return 0 @@ -215,9 +219,9 @@ This may differ from the local user account name. A path cannot refer to more than one of a file, a directory or a symbolic link - FS.Directories ^ keys(data(FS)) == {} - FS.Directories ^ symlinks(FS) == {} - keys(data(FS))(FS) ^ symlinks(FS) == {} + directories(FS) ^ filenames(FS) == {} + directories(FS) ^ symlinks(FS) == {} + filenames(FS) ^ symlinks(FS) == {} This implies that only files may have data. @@ -248,7 +252,7 @@ For all files in an encrypted zone, the data is encrypted, but the encryption type and specification are not defined. forall f in files(FS) where inEncyptionZone(FS, f): - isEncrypted(data(f)) + isEncrypted(data(FS, f)) ## Notes diff --git a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/notation.md b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/notation.md index 472bb5dd7d..e82e17a993 100644 --- a/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/notation.md +++ b/hadoop-common-project/hadoop-common/src/site/markdown/filesystem/notation.md @@ -80,15 +80,15 @@ are used as the basis for this syntax as it is both plain ASCII and well-known. ##### Lists -* A list *L* is an ordered sequence of elements `[e1, e2, ... 
en]` +* A list *L* is an ordered sequence of elements `[e1, e2, ... e(n)]` * The size of a list `len(L)` is the number of elements in a list. * Items can be addressed by a 0-based index `e1 == L[0]` -* Python slicing operators can address subsets of a list `L[0:3] == [e1,e2]`, `L[:-1] == en` +* Python slicing operators can address subsets of a list `L[0:3] == [e1,e2,e3]`, `L[:-1] == [e1, ... e(n-1)]` * Lists can be concatenated `L' = L + [ e3 ]` * Lists can have entries removed `L' = L - [ e2, e1 ]`. This is different from Python's `del` operation, which operates on the list in place. * The membership predicate `in` returns true iff an element is a member of a List: `e2 in L` -* List comprehensions can create new lists: `L' = [ x for x in l where x < 5]` +* List comprehensions can create new lists: `L' = [ x for x in L where x < 5]` * for a list `L`, `len(L)` returns the number of elements. @@ -130,7 +130,7 @@ Strings are lists of characters represented in double quotes. e.g. `"abc"` All system state declarations are immutable. -The suffix "'" (single quote) is used as the convention to indicate the state of the system after an operation: +The suffix "'" (single quote) is used as the convention to indicate the state of the system after a mutating operation: L' = L + ['d','e']
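
Illustrative usage (outside the patch itself): a minimal Java sketch of the `FSDataOutputStreamBuilder` semantics that the `filesystem.md` changes above describe — creation is deferred until `build()`, files are overwritten by default unless `overwrite(false)` is set, and missing parent directories are only created when `recursive()` is requested. This is a sketch under stated assumptions, not part of the specification; the configuration, path and data below are hypothetical.

```java
// Minimal sketch (assumes a Hadoop 3.x client on the classpath) of the
// createFile() builder behaviour described in filesystem.md.
// The path and written data are hypothetical examples.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class CreateFileBuilderSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);          // default filesystem from the configuration
    Path p = new Path("/tmp/builder-example.txt"); // hypothetical path

    // createFile(p) only returns a builder; no change is made to the
    // filesystem until build() is invoked, at which point the create()
    // preconditions and postconditions apply.
    try (FSDataOutputStream out = fs.createFile(p)
        .overwrite(false)   // fail if the file already exists
        .recursive()        // create missing parent directories
        .build()) {
      out.writeUTF("hello");
    }

    // Read the data back through the classic open() call.
    try (FSDataInputStream in = fs.open(p)) {
      System.out.println(in.readUTF());
    }
  }
}
```

The builder form also lets individual stores accept filesystem-specific options (for example via `opt()`/`must()` in current Hadoop releases), which the plain `create()` overloads cannot express.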