diff --git a/hadoop-mapreduce-project/CHANGES.txt b/hadoop-mapreduce-project/CHANGES.txt index c98ea508b0..cb28fce57d 100644 --- a/hadoop-mapreduce-project/CHANGES.txt +++ b/hadoop-mapreduce-project/CHANGES.txt @@ -600,6 +600,8 @@ Release 2.8.0 - UNRELEASED MAPREDUCE-6302. Preempt reducers after a configurable timeout irrespective of headroom. (kasha) + MAPREDUCE-6495. Docs for archive-logs tool (rkanter) + Release 2.7.2 - UNRELEASED INCOMPATIBLE CHANGES diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/site/markdown/MapredCommands.md b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/site/markdown/MapredCommands.md index fd695b2c62..7d6f5ffefc 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/site/markdown/MapredCommands.md +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/site/markdown/MapredCommands.md @@ -18,6 +18,7 @@ MapReduce Commands Guide * [Overview](#Overview) * [User Commands](#User_Commands) * [archive](#archive) + * [archive-logs](#archive-logs) * [classpath](#classpath) * [distcp](#distcp) * [job](#job) @@ -53,6 +54,12 @@ Commands useful for users of a hadoop cluster. Creates a hadoop archive. More information can be found at [Hadoop Archives Guide](../../hadoop-archives/HadoopArchives.html). +### `archive-logs` + +A tool to combine YARN aggregated logs into Hadoop archives to reduce the number +of files in HDFS. More information can be found at +[Hadoop Archive Logs Guide](../../hadoop-archive-logs/HadoopArchiveLogs.html). 
+ ### `classpath` Usage: `yarn classpath [--glob |--jar |-h |--help]` diff --git a/hadoop-project/src/site/site.xml b/hadoop-project/src/site/site.xml index 5c92d73a94..b7db5a3c71 100644 --- a/hadoop-project/src/site/site.xml +++ b/hadoop-project/src/site/site.xml @@ -156,6 +156,7 @@ + diff --git a/hadoop-tools/hadoop-archive-logs/src/main/java/org/apache/hadoop/tools/HadoopArchiveLogs.java b/hadoop-tools/hadoop-archive-logs/src/main/java/org/apache/hadoop/tools/HadoopArchiveLogs.java index b6335217e1..363e287178 100644 --- a/hadoop-tools/hadoop-archive-logs/src/main/java/org/apache/hadoop/tools/HadoopArchiveLogs.java +++ b/hadoop-tools/hadoop-archive-logs/src/main/java/org/apache/hadoop/tools/HadoopArchiveLogs.java @@ -221,7 +221,7 @@ private void handleOpts(String[] args) throws ParseException { CommandLine commandLine = parser.parse(opts, args); if (commandLine.hasOption(HELP_OPTION)) { HelpFormatter formatter = new HelpFormatter(); - formatter.printHelp("yarn archive-logs", opts); + formatter.printHelp("mapred archive-logs", opts); System.exit(0); } if (commandLine.hasOption(MAX_ELIGIBLE_APPS_OPTION)) { @@ -254,7 +254,7 @@ private void handleOpts(String[] args) throws ParseException { } } catch (ParseException pe) { HelpFormatter formatter = new HelpFormatter(); - formatter.printHelp("yarn archive-logs", opts); + formatter.printHelp("mapred archive-logs", opts); throw pe; } } diff --git a/hadoop-tools/hadoop-archive-logs/src/site/markdown/HadoopArchiveLogs.md b/hadoop-tools/hadoop-archive-logs/src/site/markdown/HadoopArchiveLogs.md new file mode 100644 index 0000000000..a54c9a944a --- /dev/null +++ b/hadoop-tools/hadoop-archive-logs/src/site/markdown/HadoopArchiveLogs.md @@ -0,0 +1,85 @@ + + +Hadoop Archive Logs Guide +========================= + + - [Overview](#Overview) + - [How to Archive Logs](#How_to_Archive_Logs) + +Overview +-------- + +For clusters with a lot of YARN aggregated logs, it can be helpful to combine +them into hadoop archives in order to 
reduce the number of small files, and +hence the stress on the NameNode. This tool provides an easy way to do this. +Aggregated logs in hadoop archives can still be read by the Job History Server +and by the `yarn logs` command. + +For more on hadoop archives, see +[Hadoop Archives Guide](../hadoop-archives/HadoopArchives.html). + +How to Archive Logs +------------------- + + usage: mapred archive-logs + -force Force recreating the working directory if + an existing one is found. This should + only be used if you know that another + instance is not currently running + -help Prints this message + -maxEligibleApps The maximum number of eligible apps to + process (default: -1 (all)) + -maxTotalLogsSize The maximum total logs size (in + megabytes) required to be eligible + (default: 1024) + -memory The amount of memory (in megabytes) for + each container (default: 1024) + -minNumberLogFiles The minimum number of log files required + to be eligible (default: 20) + -verbose Print more details. + +The tool only supports running one instance on a cluster at a time in order +to prevent conflicts. It does this by checking for the existence of a +directory named ``archive-logs-work`` under +``yarn.nodemanager.remote-app-log-dir`` in HDFS +(default: ``/tmp/logs/archive-logs-work``). If for some reason that +directory was not cleaned up properly, and the tool refuses to run, you can +force it with the ``-force`` option. + +The ``-help`` option prints out the usage information. + +The tool works by performing the following procedure: + + 1. Determine the list of eligible applications, based on the following + criteria: + - is not already archived + - its aggregation status has successfully completed + - has at least ``-minNumberLogFiles`` log files + - the sum of its log files size is less than ``-maxTotalLogsSize`` megabytes + 2. If there are more than ``-maxEligibleApps`` applications found, the + newest applications are dropped. They can be processed next time. + 3. 
A shell script is generated based on the eligible applications + 4. The Distributed Shell program is run with the aforementioned script. It + will run with ``-maxEligibleApps`` containers, one to process each + application, and with ``-memory`` megabytes of memory. Each container runs + the ``hadoop archives`` command for a single application and replaces + its aggregated log files with the resulting archive. + +The ``-verbose`` option makes the tool print more details about what it's +doing. + +The end result of running the tool is that the original aggregated log files for +a processed application will be replaced by a hadoop archive containing all of +those logs. diff --git a/hadoop-tools/hadoop-archive-logs/src/site/resources/css/site.css b/hadoop-tools/hadoop-archive-logs/src/site/resources/css/site.css new file mode 100644 index 0000000000..f830baafa8 --- /dev/null +++ b/hadoop-tools/hadoop-archive-logs/src/site/resources/css/site.css @@ -0,0 +1,30 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ +#banner { + height: 93px; + background: none; +} + +#bannerLeft img { + margin-left: 30px; + margin-top: 10px; +} + +#bannerRight img { + margin: 17px; +} +