From c3006be516ce7d4f970e24e7407b401318ceec3c Mon Sep 17 00:00:00 2001 From: Gautham B A Date: Tue, 4 Jan 2022 22:29:54 +0530 Subject: [PATCH] HDFS-16407. Make hdfs_du tool cross platform (#3848) --- .../libhdfspp/tests/tools/CMakeLists.txt | 3 + .../libhdfspp/tests/tools/hdfs-du-mock.cc | 67 ++++++ .../libhdfspp/tests/tools/hdfs-du-mock.h | 68 ++++++ .../libhdfspp/tests/tools/hdfs-tool-tests.cc | 19 ++ .../libhdfspp/tests/tools/hdfs-tool-tests.h | 25 +++ .../native/libhdfspp/tools/CMakeLists.txt | 3 +- .../libhdfspp/tools/hdfs-du/CMakeLists.txt | 27 +++ .../native/libhdfspp/tools/hdfs-du/hdfs-du.cc | 205 ++++++++++++++++++ .../native/libhdfspp/tools/hdfs-du/hdfs-du.h | 93 ++++++++ .../native/libhdfspp/tools/hdfs-du/main.cc | 52 +++++ .../main/native/libhdfspp/tools/hdfs_du.cc | 176 --------------- .../internal/get-content-summary-state.h | 68 ++++++ 12 files changed, 628 insertions(+), 178 deletions(-) create mode 100644 hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/tools/hdfs-du-mock.cc create mode 100644 hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/tools/hdfs-du-mock.h create mode 100644 hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tools/hdfs-du/CMakeLists.txt create mode 100644 hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tools/hdfs-du/hdfs-du.cc create mode 100644 hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tools/hdfs-du/hdfs-du.h create mode 100644 hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tools/hdfs-du/main.cc delete mode 100644 hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tools/hdfs_du.cc create mode 100644 hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tools/internal/get-content-summary-state.h diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/tools/CMakeLists.txt 
b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/tools/CMakeLists.txt index 210243d61b..7cd92e3492 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/tools/CMakeLists.txt +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/tools/CMakeLists.txt @@ -29,11 +29,13 @@ add_executable(hdfs_tool_tests hdfs-tool-test-fixtures.cc hdfs-tool-tests.cc hdfs-df-mock.cc + hdfs-du-mock.cc main.cc) target_include_directories(hdfs_tool_tests PRIVATE ../tools ../../tools ../../tools/hdfs-df + ../../tools/hdfs-du ../../tools/hdfs-allow-snapshot ../../tools/hdfs-disallow-snapshot ../../tools/hdfs-delete-snapshot @@ -46,6 +48,7 @@ target_include_directories(hdfs_tool_tests PRIVATE target_link_libraries(hdfs_tool_tests PRIVATE gmock_main hdfs_df_lib + hdfs_du_lib hdfs_allowSnapshot_lib hdfs_disallowSnapshot_lib hdfs_deleteSnapshot_lib diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/tools/hdfs-du-mock.cc b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/tools/hdfs-du-mock.cc new file mode 100644 index 0000000000..e0c2ebf745 --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/tools/hdfs-du-mock.cc @@ -0,0 +1,67 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include + +#include "hdfs-du-mock.h" +#include "hdfs-tool-tests.h" + +namespace hdfs::tools::test { +DuMock::~DuMock() = default; + +void DuMock::SetExpectations(std::function()> test_case, + const std::vector &args) const { + // Get the pointer to the function that defines the test case + const auto test_case_func = test_case.target (*)()>(); + ASSERT_NE(test_case_func, nullptr); + + // Set the expected method calls and their corresponding arguments for each + // test case + if (*test_case_func == &CallHelp) { + EXPECT_CALL(*this, HandleHelp()).Times(1).WillOnce(testing::Return(true)); + return; + } + + if (*test_case_func == &PassAPath) { + const auto arg1 = args[0]; + EXPECT_CALL(*this, HandlePath(arg1, false)) + .Times(1) + .WillOnce(testing::Return(true)); + } + + if (*test_case_func == &PassRecursivePath) { + const auto arg1 = args[0]; + const auto arg2 = args[1]; + ASSERT_EQ(arg1, "-R"); + EXPECT_CALL(*this, HandlePath(arg2, true)) + .Times(1) + .WillOnce(testing::Return(true)); + } + + if (*test_case_func == &PassRecursive) { + const auto arg1 = args[0]; + ASSERT_EQ(arg1, "-R"); + } +} +} // namespace hdfs::tools::test diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/tools/hdfs-du-mock.h b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/tools/hdfs-du-mock.h new file mode 100644 index 0000000000..de5caeb76b --- /dev/null +++ 
b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/tools/hdfs-du-mock.h @@ -0,0 +1,68 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBHDFSPP_TOOLS_HDFS_DU_MOCK +#define LIBHDFSPP_TOOLS_HDFS_DU_MOCK + +#include +#include +#include +#include + +#include + +#include "hdfs-du.h" + +namespace hdfs::tools::test { +/** + * {@class DuMock} is an {@class Du} whereby it mocks the + * HandleHelp and HandlePath methods for testing their functionality. + */ +class DuMock : public hdfs::tools::Du { +public: + /** + * {@inheritdoc} + */ + DuMock(const int argc, char **argv) : Du(argc, argv) {} + + // Abiding to the Rule of 5 + DuMock(const DuMock &) = delete; + DuMock(DuMock &&) = delete; + DuMock &operator=(const DuMock &) = delete; + DuMock &operator=(DuMock &&) = delete; + ~DuMock() override; + + /** + * Defines the methods and the corresponding arguments that are expected + * to be called on this instance of {@link HdfsTool} for the given test case. 
+ * + * @param test_case An {@link std::function} object that points to the + * function defining the test case + * @param args The arguments that are passed to this test case + */ + void SetExpectations(std::function()> test_case, + const std::vector &args = {}) const; + + MOCK_METHOD(bool, HandleHelp, (), (const, override)); + + MOCK_METHOD(bool, HandlePath, (const std::string &, const bool), + (const, override)); +}; +} // namespace hdfs::tools::test + +#endif diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/tools/hdfs-tool-tests.cc b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/tools/hdfs-tool-tests.cc index 1a879aa2fe..cc3257fb5c 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/tools/hdfs-tool-tests.cc +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/tools/hdfs-tool-tests.cc @@ -28,6 +28,7 @@ #include "hdfs-delete-snapshot-mock.h" #include "hdfs-df-mock.h" #include "hdfs-disallow-snapshot-mock.h" +#include "hdfs-du-mock.h" #include "hdfs-rename-snapshot-mock.h" #include "hdfs-tool-test-fixtures.h" #include "hdfs-tool-tests.h" @@ -67,6 +68,12 @@ INSTANTIATE_TEST_SUITE_P(HdfsDf, HdfsToolBasicTest, testing::Values(PassAPath, CallHelp)); +INSTANTIATE_TEST_SUITE_P( + HdfsDu, HdfsToolBasicTest, + testing::Values(PassAPath, + CallHelp, + PassRecursivePath)); + INSTANTIATE_TEST_SUITE_P( HdfsDeleteSnapshot, HdfsToolBasicTest, testing::Values(CallHelp, @@ -114,6 +121,14 @@ INSTANTIATE_TEST_SUITE_P( HdfsDf, HdfsToolNegativeTestThrows, testing::Values(Pass2Paths)); +INSTANTIATE_TEST_SUITE_P( + HdfsDu, HdfsToolNegativeTestThrows, + testing::Values(Pass2Paths, + Pass3Paths, + PassNOptAndAPath, + PassOwnerAndAPath, + PassPermissionsAndAPath)); + INSTANTIATE_TEST_SUITE_P( HdfsCat, HdfsToolNegativeTestThrows, testing::Values(Pass2Paths)); @@ -122,6 +137,10 @@ INSTANTIATE_TEST_SUITE_P( HdfsDeleteSnapshot, 
HdfsToolNegativeTestNoThrow, testing::Values(PassAPath)); +INSTANTIATE_TEST_SUITE_P( + HdfsDu, HdfsToolNegativeTestNoThrow, + testing::Values(PassRecursive)); + INSTANTIATE_TEST_SUITE_P( HdfsChown, HdfsToolNegativeTestNoThrow, testing::Values(PassAPath)); diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/tools/hdfs-tool-tests.h b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/tools/hdfs-tool-tests.h index 9d012e3d37..a34435cd63 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/tools/hdfs-tool-tests.h +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tests/tools/hdfs-tool-tests.h @@ -44,6 +44,31 @@ template std::unique_ptr PassAPath() { return hdfs_tool; } +template std::unique_ptr PassRecursive() { + constexpr auto argc = 2; + static std::string exe("hdfs_tool_name"); + static std::string arg1("-R"); + + static char *argv[] = {exe.data(), arg1.data()}; + + auto hdfs_tool = std::make_unique(argc, argv); + hdfs_tool->SetExpectations(PassRecursive, {arg1}); + return hdfs_tool; +} + +template std::unique_ptr PassRecursivePath() { + constexpr auto argc = 3; + static std::string exe("hdfs_tool_name"); + static std::string arg1("-R"); + static std::string arg2("a/b/c"); + + static char *argv[] = {exe.data(), arg1.data(), arg2.data()}; + + auto hdfs_tool = std::make_unique(argc, argv); + hdfs_tool->SetExpectations(PassRecursivePath, {arg1, arg2}); + return hdfs_tool; +} + template std::unique_ptr CallHelp() { constexpr auto argc = 2; static std::string exe("hdfs_tool_name"); diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tools/CMakeLists.txt b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tools/CMakeLists.txt index e93322e09a..8e83febddb 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tools/CMakeLists.txt +++ 
b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tools/CMakeLists.txt @@ -61,8 +61,7 @@ target_link_libraries(hdfs_count tools_common hdfspp_static) add_subdirectory(hdfs-df) -add_executable(hdfs_du hdfs_du.cc) -target_link_libraries(hdfs_du tools_common hdfspp_static) +add_subdirectory(hdfs-du) add_executable(hdfs_get hdfs_get.cc) target_link_libraries(hdfs_get tools_common hdfspp_static) diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tools/hdfs-du/CMakeLists.txt b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tools/hdfs-du/CMakeLists.txt new file mode 100644 index 0000000000..7164136fa1 --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tools/hdfs-du/CMakeLists.txt @@ -0,0 +1,27 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +add_library(hdfs_du_lib STATIC $ hdfs-du.cc) +target_include_directories(hdfs_du_lib PRIVATE ../../tools ${Boost_INCLUDE_DIRS}) +target_link_libraries(hdfs_du_lib PRIVATE Boost::boost Boost::program_options tools_common hdfspp_static) + +add_executable(hdfs_du main.cc) +target_include_directories(hdfs_du PRIVATE ../../tools) +target_link_libraries(hdfs_du PRIVATE hdfs_du_lib) + +install(TARGETS hdfs_du RUNTIME DESTINATION bin) diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tools/hdfs-du/hdfs-du.cc b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tools/hdfs-du/hdfs-du.cc new file mode 100644 index 0000000000..5b5cab67d0 --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tools/hdfs-du/hdfs-du.cc @@ -0,0 +1,205 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include "hdfs-du.h" +#include "internal/get-content-summary-state.h" +#include "tools_common.h" + +namespace hdfs::tools { +Du::Du(const int argc, char **argv) : HdfsTool(argc, argv) {} + +bool Du::Initialize() { + auto add_options = opt_desc_.add_options(); + add_options("help,h", + "Displays sizes of files and directories contained in the given " + "PATH or the length of a file in case PATH is just a file"); + add_options("recursive,R", "Operate on files and directories recursively"); + add_options("path", po::value(), + "The path indicating the filesystem that needs to be du-ed"); + + // We allow only one positional argument to be passed to this tool. An + // exception is thrown if multiple arguments are passed. + pos_opt_desc_.add("path", 1); + + po::store(po::command_line_parser(argc_, argv_) + .options(opt_desc_) + .positional(pos_opt_desc_) + .run(), + opt_val_); + po::notify(opt_val_); + return true; +} + +std::string Du::GetDescription() const { + std::stringstream desc; + desc << "Usage: hdfs_du [OPTION] PATH" << std::endl + << std::endl + << "Displays sizes of files and directories contained in the given PATH" + << std::endl + << "or the length of a file in case PATH is just a file" << std::endl + << std::endl + << " -R operate on files and directories recursively" + << std::endl + << " -h display this help and exit" << std::endl + << std::endl + << "Examples:" << std::endl + << "hdfs_du hdfs://localhost.localdomain:8020/dir/file" << std::endl + << "hdfs_du -R /dir1/dir2" << std::endl; + return desc.str(); +} + +bool Du::Do() { + if (!Initialize()) { + std::cerr << "Unable to initialize HDFS du tool" << std::endl; + return false; + } + + if (!ValidateConstraints()) { + std::cout << GetDescription(); + return false; + } + + if (opt_val_.count("help") > 0) { + return HandleHelp(); + } + + if (opt_val_.count("path") > 0) { + const auto path = opt_val_["path"].as(); + const auto recursive 
= opt_val_.count("recursive") > 0; + return HandlePath(path, recursive); + } + + return false; +} + +bool Du::HandleHelp() const { + std::cout << GetDescription(); + return true; +} + +/** + * Connects to the file system that path refers to, lists path (recursively + * when requested) and prints one du line per listed entry. Blocks until the + * listing and all the asynchronous GetContentSummary calls have completed. + * + * @return false if the connection, the listing or any summary request failed. + */ +bool Du::HandlePath(const std::string &path, const bool recursive) const { + // Building a URI object from the given path. + auto uri = hdfs::parse_path_or_exit(path); + + const auto fs = hdfs::doConnect(uri, true); + if (!fs) { + std::cerr << "Could not connect to the file system." << std::endl; + return false; + } + + /* + * Wrap async FileSystem::GetContentSummary with promise to make it a blocking + * call. + */ + const auto promise = std::make_shared<std::promise<hdfs::Status>>(); + std::future<hdfs::Status> future(promise->get_future()); + auto handler = [promise](const hdfs::Status &s) { promise->set_value(s); }; + + /* + * Allocating shared state, which includes: handler to be called, request + * counter, and a boolean to keep track if find is done. + */ + const auto state = + std::make_shared<GetContentSummaryState>(handler, 0, false); + + /* + * Keep requesting more from Find until we process the entire listing. Call + * handler when Find is done and request counter is 0. Find guarantees that + * handler_find is only called once at a time, but the GetContentSummary + * handlers it launches may run concurrently with it, so handler_find still + * has to hold state->lock whenever it touches the shared state. + */ + auto handler_find = [fs, state](const hdfs::Status &status_find, + const std::vector<hdfs::StatInfo> &stat_infos, + const bool has_more_results) -> bool { + /* + * For each result returned by Find we call async GetContentSummary with the + * handler below. GetContentSummary DOES NOT guarantee that the handler will + * only be called once at a time, so we DO need locking in + * handler_get_content_summary. + */ + auto handler_get_content_summary = + [state](const hdfs::Status &status_get_summary, + const hdfs::ContentSummary &si) { + std::lock_guard<std::mutex> guard(state->lock); + std::cout << si.str_du() << std::endl;
+ if (!status_get_summary.ok() && state->status.ok()) { + // We make sure we set state->status only on the first error. + state->status = status_get_summary; + } + // Decrement the counter once since we are done with this async call. + state->request_counter--; + if (state->request_counter == 0 && state->find_is_done) { + state->handler(state->status); // exit + } + }; + + /* + * Account for the whole batch under the lock before launching it, because + * handlers of earlier requests may be updating the counter and the status + * at this very moment. The lock is released before GetContentSummary is + * called, in case a handler gets invoked inline and needs the lock itself. + */ + bool launch_requests = false; + { + std::lock_guard<std::mutex> guard(state->lock); + launch_requests = !stat_infos.empty() && state->status.ok(); + if (launch_requests) { + state->request_counter += stat_infos.size(); + } + } + + if (launch_requests) { + for (hdfs::StatInfo const &s : stat_infos) { + /* + * Launch an asynchronous call to GetContentSummary for every returned + * result. + */ + fs->GetContentSummary(s.full_path, handler_get_content_summary); + } + } + + /* + * Lock this section because handler_get_content_summary might be accessing + * the same shared variables simultaneously. + */ + std::lock_guard<std::mutex> guard(state->lock); + if (!status_find.ok() && state->status.ok()) { + // We make sure we set state->status only on the first error. + state->status = status_find; + } + + if (!has_more_results) { + state->find_is_done = true; + if (state->request_counter == 0) { + state->handler(state->status); // exit + } + return false; + } + return true; + }; + + // Asynchronous call to GetListing (one level) or Find (recursive). + if (!recursive) { + fs->GetListing(uri.get_path(), handler_find); + } else { + fs->Find(uri.get_path(), "*", hdfs::FileSystem::GetDefaultFindMaxDepth(), + handler_find); + } + + // Block until promise is set.
+ const auto status = future.get(); + if (!status.ok()) { + std::cerr << "Error: " << status.ToString() << std::endl; + return false; + } + return true; +} +} // namespace hdfs::tools diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tools/hdfs-du/hdfs-du.h b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tools/hdfs-du/hdfs-du.h new file mode 100644 index 0000000000..494270868e --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tools/hdfs-du/hdfs-du.h @@ -0,0 +1,93 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBHDFSPP_TOOLS_HDFS_DU +#define LIBHDFSPP_TOOLS_HDFS_DU + +#include + +#include + +#include "hdfs-tool.h" + +namespace hdfs::tools { +/** + * {@class Du} is an {@class HdfsTool} that displays the size of the directories + * and files.
+ */ +class Du : public HdfsTool { +public: + /** + * {@inheritdoc} + */ + Du(int argc, char **argv); + + // Abiding to the Rule of 5 + Du(const Du &) = default; + Du(Du &&) = default; + Du &operator=(const Du &) = delete; + Du &operator=(Du &&) = delete; + ~Du() override = default; + + /** + * {@inheritdoc} + */ + [[nodiscard]] std::string GetDescription() const override; + + /** + * {@inheritdoc} + */ + [[nodiscard]] bool Do() override; + +protected: + /** + * {@inheritdoc} + */ + [[nodiscard]] bool Initialize() override; + + /** + * {@inheritdoc} + */ + [[nodiscard]] bool ValidateConstraints() const override { return argc_ > 1; } + + /** + * {@inheritdoc} + */ + [[nodiscard]] bool HandleHelp() const override; + + /** + * Handle the path argument that's passed to this tool. + * + * @param path The path to the directory for which we need du info. + * @param recursive A boolean indicating whether du needs to be + * performed recursively for the given path. + * + * @return A boolean indicating the result of this operation. + */ + [[nodiscard]] virtual bool HandlePath(const std::string &path, + bool recursive) const; + +private: + /** + * A boost data-structure containing the description of positional arguments + * passed to the command-line. + */ + po::positional_options_description pos_opt_desc_; +}; +} // namespace hdfs::tools +#endif diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tools/hdfs-du/main.cc b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tools/hdfs-du/main.cc new file mode 100644 index 0000000000..0d8738fe3d --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tools/hdfs-du/main.cc @@ -0,0 +1,52 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include + +#include "hdfs-du.h" + +/** + * Entry point of the hdfs_du tool. Registers the protobuf clean-up to run at + * process exit, runs {@link hdfs::tools::Du} with the command-line arguments + * and exits with EXIT_FAILURE if the tool fails or throws. + */ +int main(int argc, char *argv[]) { + const auto result = std::atexit([]() -> void { + // Clean up static data on exit and prevent valgrind memory leaks + google::protobuf::ShutdownProtobufLibrary(); + }); + if (result != 0) { + std::cerr + << "Error: Unable to schedule clean-up tasks for HDFS du tool, exiting" + << std::endl; + std::exit(EXIT_FAILURE); + } + + hdfs::tools::Du du(argc, argv); + auto success = false; + + try { + success = du.Do(); + } catch (const std::exception &e) { + // An exception leaves success as false, so it is reported as a failure. + std::cerr << "Error: " << e.what() << std::endl; + } + + if (!success) { + std::exit(EXIT_FAILURE); + } + return 0; +} diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tools/hdfs_du.cc b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tools/hdfs_du.cc deleted file mode 100644 index f6b6e73f09..0000000000 --- a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tools/hdfs_du.cc +++ /dev/null @@ -1,176 +0,0 @@ -/* - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership.
The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. -*/ - -#include -#include -#include -#include "tools_common.h" - -void usage(){ - std::cout << "Usage: hdfs_du [OPTION] PATH" - << std::endl - << std::endl << "Displays sizes of files and directories contained in the given PATH" - << std::endl << "or the length of a file in case PATH is just a file" - << std::endl - << std::endl << " -R operate on files and directories recursively" - << std::endl << " -h display this help and exit" - << std::endl - << std::endl << "Examples:" - << std::endl << "hdfs_du hdfs://localhost.localdomain:8020/dir/file" - << std::endl << "hdfs_du -R /dir1/dir2" - << std::endl; -} - -struct GetContentSummaryState { - const std::function handler; - //The request counter is incremented once every time GetContentSummary async call is made - uint64_t request_counter; - //This boolean will be set when find returns the last result - bool find_is_done; - //Final status to be returned - hdfs::Status status; - //Shared variables will need protection with a lock - std::mutex lock; - GetContentSummaryState(const std::function & handler_, - uint64_t request_counter_, bool find_is_done_) - : handler(handler_), - request_counter(request_counter_), - find_is_done(find_is_done_), - status(), - lock() { - } -}; - -int main(int argc, char *argv[]) { - //We should have at least 2 arguments - if (argc < 2) { - usage(); - exit(EXIT_FAILURE); - } - - bool recursive = false; - int input; - - //Using GetOpt 
to read in the values - opterr = 0; - while ((input = getopt(argc, argv, "Rh")) != -1) { - switch (input) - { - case 'R': - recursive = true; - break; - case 'h': - usage(); - exit(EXIT_SUCCESS); - case '?': - if (isprint(optopt)) - std::cerr << "Unknown option `-" << (char) optopt << "'." << std::endl; - else - std::cerr << "Unknown option character `" << (char) optopt << "'." << std::endl; - usage(); - exit(EXIT_FAILURE); - default: - exit(EXIT_FAILURE); - } - } - std::string uri_path = argv[optind]; - - //Building a URI object from the given uri_path - hdfs::URI uri = hdfs::parse_path_or_exit(uri_path); - - std::shared_ptr fs = hdfs::doConnect(uri, true); - if (!fs) { - std::cerr << "Could not connect the file system. " << std::endl; - exit(EXIT_FAILURE); - } - - /* wrap async FileSystem::GetContentSummary with promise to make it a blocking call */ - std::shared_ptr> promise = std::make_shared>(); - std::future future(promise->get_future()); - auto handler = [promise](const hdfs::Status &s) { - promise->set_value(s); - }; - - //Allocating shared state, which includes: - //handler to be called, request counter, and a boolean to keep track if find is done - std::shared_ptr state = std::make_shared(handler, 0, false); - - // Keep requesting more from Find until we process the entire listing. Call handler when Find is done and reques counter is 0. - // Find guarantees that the handler will only be called once at a time so we do not need locking in handlerFind. - auto handlerFind = [fs, state](const hdfs::Status &status_find, const std::vector & stat_infos, bool has_more_results) -> bool { - - //For each result returned by Find we call async GetContentSummary with the handler below. - //GetContentSummary DOES NOT guarantee that the handler will only be called once at a time, so we DO need locking in handlerGetContentSummary. 
- auto handlerGetContentSummary = [state](const hdfs::Status &status_get_summary, const hdfs::ContentSummary &si) { - std::lock_guard guard(state->lock); - std::cout << si.str_du() << std::endl; - //Decrement the counter once since we are done with this async call - if (!status_get_summary.ok() && state->status.ok()){ - //We make sure we set state->status only on the first error. - state->status = status_get_summary; - } - state->request_counter--; - if(state->request_counter == 0 && state->find_is_done){ - state->handler(state->status); //exit - } - }; - if(!stat_infos.empty() && state->status.ok()) { - for (hdfs::StatInfo const& s : stat_infos) { - //Launch an asynchronous call to GetContentSummary for every returned result - state->request_counter++; - fs->GetContentSummary(s.full_path, handlerGetContentSummary); - } - } - - //Lock this section because handlerGetContentSummary might be accessing the same - //shared variables simultaneously - std::lock_guard guard(state->lock); - if (!status_find.ok() && state->status.ok()){ - //We make sure we set state->status only on the first error. 
- state->status = status_find; - } - if(!has_more_results){ - state->find_is_done = true; - if(state->request_counter == 0){ - state->handler(state->status); //exit - } - return false; - } - return true; - }; - - if(!recursive){ - //Asynchronous call to Find - fs->GetListing(uri.get_path(), handlerFind); - } else { - //Asynchronous call to Find - fs->Find(uri.get_path(), "*", hdfs::FileSystem::GetDefaultFindMaxDepth(), handlerFind); - } - - /* block until promise is set */ - hdfs::Status status = future.get(); - if (!status.ok()) { - std::cerr << "Error: " << status.ToString() << std::endl; - exit(EXIT_FAILURE); - } - - // Clean up static data and prevent valgrind memory leaks - google::protobuf::ShutdownProtobufLibrary(); - return 0; -} diff --git a/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tools/internal/get-content-summary-state.h b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tools/internal/get-content-summary-state.h new file mode 100644 index 0000000000..f35b55b3cf --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfspp/tools/internal/get-content-summary-state.h @@ -0,0 +1,68 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBHDFSPP_TOOLS_HDFS_DU_GET_CONTENT_SUMMARY_STATE +#define LIBHDFSPP_TOOLS_HDFS_DU_GET_CONTENT_SUMMARY_STATE + +#include +#include +#include + +#include "hdfspp/hdfspp.h" + +namespace hdfs::tools { +/** + * The {@class GetContentSummaryState} is used to hold intermediate information + * during the execution of {@link hdfs::FileSystem#GetContentSummary}. + */ +struct GetContentSummaryState { + GetContentSummaryState(std::function handler, + const uint64_t request_counter, + const bool find_is_done) + : handler{std::move(handler)}, request_counter{request_counter}, + find_is_done{find_is_done} {} + + /** + * The handler that is used to update the status asynchronously. + */ + const std::function handler; + + /** + * The request counter is incremented once every time GetContentSummary async + * call is made. + */ + uint64_t request_counter; + + /** + * This boolean will be set when find returns the last result. + */ + bool find_is_done; + + /** + * Final status to be returned. + */ + hdfs::Status status; + + /** + * Shared variables will need protection with a lock. + */ + std::mutex lock; +}; +} // namespace hdfs::tools + +#endif