diff --git a/hadoop-mapreduce-project/CHANGES.txt b/hadoop-mapreduce-project/CHANGES.txt
index b10d314a91..d81fc50246 100644
--- a/hadoop-mapreduce-project/CHANGES.txt
+++ b/hadoop-mapreduce-project/CHANGES.txt
@@ -214,6 +214,9 @@ Release 2.2.1 - UNRELEASED
 
   OPTIMIZATIONS
 
+    MAPREDUCE-4680. Job history cleaner should only check timestamps of files in
+    old enough directories (Robert Kanter via Sandy Ryza)
+
   BUG FIXES
 
     MAPREDUCE-5569. FloatSplitter is not generating correct splits (Nathan
diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-common/src/main/java/org/apache/hadoop/mapreduce/v2/jobhistory/JobHistoryUtils.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-common/src/main/java/org/apache/hadoop/mapreduce/v2/jobhistory/JobHistoryUtils.java
index 1ef213936b..d80fe40958 100644
--- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-common/src/main/java/org/apache/hadoop/mapreduce/v2/jobhistory/JobHistoryUtils.java
+++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-common/src/main/java/org/apache/hadoop/mapreduce/v2/jobhistory/JobHistoryUtils.java
@@ -21,6 +21,7 @@
 import java.io.File;
 import java.io.IOException;
 import java.util.Calendar;
+import java.util.ArrayList;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.concurrent.atomic.AtomicBoolean;
@@ -499,4 +500,72 @@ public static Path getPreviousJobHistoryPath(
     return fc.makeQualified(JobHistoryUtils.getStagingJobHistoryFile(
         histDirPath,jobId, (applicationAttemptId.getAttemptId() - 1)));
   }
+
+  /**
+   * Looks for the dirs to clean. The folder structure is YYYY/MM/DD/Serial so
+   * we can use that to more efficiently find the directories to clean by
+   * comparing the cutoff timestamp with the timestamp from the folder
+   * structure.
+   *
+   * @param fc done dir FileContext
+   * @param root folder for completed jobs
+   * @param cutoff The cutoff for the max history age
+   * @return The list of directories for cleaning
+   * @throws IOException
+   */
+  public static List<FileStatus> getHistoryDirsForCleaning(FileContext fc,
+      Path root, long cutoff) throws IOException {
+    List<FileStatus> fsList = new ArrayList<FileStatus>();
+    Calendar cCal = Calendar.getInstance();
+    cCal.setTimeInMillis(cutoff);
+    int cYear = cCal.get(Calendar.YEAR);
+    int cMonth = cCal.get(Calendar.MONTH) + 1;
+    int cDate = cCal.get(Calendar.DATE);
+
+    RemoteIterator<FileStatus> yearDirIt = fc.listStatus(root);
+    while (yearDirIt.hasNext()) {
+      FileStatus yearDir = yearDirIt.next();
+      try {
+        int year = Integer.parseInt(yearDir.getPath().getName());
+        if (year <= cYear) {
+          RemoteIterator<FileStatus> monthDirIt =
+              fc.listStatus(yearDir.getPath());
+          while (monthDirIt.hasNext()) {
+            FileStatus monthDir = monthDirIt.next();
+            try {
+              int month = Integer.parseInt(monthDir.getPath().getName());
+              // If we only checked the month here, then something like 07/2013
+              // would incorrectly not pass when the cutoff is 06/2014
+              if (year < cYear || month <= cMonth) {
+                RemoteIterator<FileStatus> dateDirIt =
+                    fc.listStatus(monthDir.getPath());
+                while (dateDirIt.hasNext()) {
+                  FileStatus dateDir = dateDirIt.next();
+                  try {
+                    int date = Integer.parseInt(dateDir.getPath().getName());
+                    // If we only checked the date here, then something like
+                    // 07/21/2013 would incorrectly not pass when the cutoff is
+                    // 08/20/2013 or 07/20/2014
+                    if (year < cYear || month < cMonth || date <= cDate) {
+                      fsList.addAll(remoteIterToList(
+                          fc.listStatus(dateDir.getPath())));
+                    }
+                  } catch (NumberFormatException nfe) {
+                    // the directory didn't fit the format we're looking for so
+                    // skip the dir
+                  }
+                }
+              }
+            } catch (NumberFormatException nfe) {
+              // the directory didn't fit the format we're looking for so skip
+              // the dir
+            }
+          }
+        }
+      } catch (NumberFormatException nfe) {
+        // the directory didn't fit the format we're looking for so skip the dir
+      }
+    }
+    return fsList;
+  }
 }
diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-common/src/test/java/org/apache/hadoop/mapreduce/v2/jobhistory/TestJobHistoryUtils.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-common/src/test/java/org/apache/hadoop/mapreduce/v2/jobhistory/TestJobHistoryUtils.java
new file mode 100644
index 0000000000..6878cce7d4
--- /dev/null
+++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-common/src/test/java/org/apache/hadoop/mapreduce/v2/jobhistory/TestJobHistoryUtils.java
@@ -0,0 +1,143 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapreduce.v2.jobhistory;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.Calendar;
+import java.util.Collections;
+import java.util.List;
+import org.apache.hadoop.fs.FileContext;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.permission.FsPermission;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class TestJobHistoryUtils {
+
+  final static String TEST_DIR = new File(System.getProperty("test.build.data"))
+      .getAbsolutePath();
+
+  @Test
+  @SuppressWarnings("unchecked")
+  public void testGetHistoryDirsForCleaning() throws IOException {
+    Path pRoot = new Path(TEST_DIR, "org.apache.hadoop.mapreduce.v2.jobhistory."
+        + "TestJobHistoryUtils.testGetHistoryDirsForCleaning");
+    FileContext fc = FileContext.getFileContext();
+    Calendar cCal = Calendar.getInstance();
+    int year = 2013;
+    int month = 7;
+    int day = 21;
+    cCal.set(year, month - 1, day, 1, 0);
+    long cutoff = cCal.getTimeInMillis();
+
+    clearDir(fc, pRoot);
+    Path pId00 = createPath(fc, pRoot, year, month, day, "000000");
+    Path pId01 = createPath(fc, pRoot, year, month, day+1, "000001");
+    Path pId02 = createPath(fc, pRoot, year, month, day-1, "000002");
+    Path pId03 = createPath(fc, pRoot, year, month+1, day, "000003");
+    Path pId04 = createPath(fc, pRoot, year, month+1, day+1, "000004");
+    Path pId05 = createPath(fc, pRoot, year, month+1, day-1, "000005");
+    Path pId06 = createPath(fc, pRoot, year, month-1, day, "000006");
+    Path pId07 = createPath(fc, pRoot, year, month-1, day+1, "000007");
+    Path pId08 = createPath(fc, pRoot, year, month-1, day-1, "000008");
+    Path pId09 = createPath(fc, pRoot, year+1, month, day, "000009");
+    Path pId10 = createPath(fc, pRoot, year+1, month, day+1, "000010");
+    Path pId11 = createPath(fc, pRoot, year+1, month, day-1, "000011");
+    Path pId12 = createPath(fc, pRoot, year+1, month+1, day, "000012");
+    Path pId13 = createPath(fc, pRoot, year+1, month+1, day+1, "000013");
+    Path pId14 = createPath(fc, pRoot, year+1, month+1, day-1, "000014");
+    Path pId15 = createPath(fc, pRoot, year+1, month-1, day, "000015");
+    Path pId16 = createPath(fc, pRoot, year+1, month-1, day+1, "000016");
+    Path pId17 = createPath(fc, pRoot, year+1, month-1, day-1, "000017");
+    Path pId18 = createPath(fc, pRoot, year-1, month, day, "000018");
+    Path pId19 = createPath(fc, pRoot, year-1, month, day+1, "000019");
+    Path pId20 = createPath(fc, pRoot, year-1, month, day-1, "000020");
+    Path pId21 = createPath(fc, pRoot, year-1, month+1, day, "000021");
+    Path pId22 = createPath(fc, pRoot, year-1, month+1, day+1, "000022");
+    Path pId23 = createPath(fc, pRoot, year-1, month+1, day-1, "000023");
+    Path pId24 = createPath(fc, pRoot, year-1, month-1, day, "000024");
+    Path pId25 = createPath(fc, pRoot, year-1, month-1, day+1, "000025");
+    Path pId26 = createPath(fc, pRoot, year-1, month-1, day-1, "000026");
+    // non-expected names should be ignored without problems
+    Path pId27 = createPath(fc, pRoot, "foo", "" + month, "" + day, "000027");
+    Path pId28 = createPath(fc, pRoot, "" + year, "foo", "" + day, "000028");
+    Path pId29 = createPath(fc, pRoot, "" + year, "" + month, "foo", "000029");
+
+    List<FileStatus> dirs = JobHistoryUtils
+        .getHistoryDirsForCleaning(fc, pRoot, cutoff);
+    Collections.sort(dirs);
+    Assert.assertEquals(14, dirs.size());
+    Assert.assertEquals(pId26.toUri().getPath(),
+        dirs.get(0).getPath().toUri().getPath());
+    Assert.assertEquals(pId24.toUri().getPath(),
+        dirs.get(1).getPath().toUri().getPath());
+    Assert.assertEquals(pId25.toUri().getPath(),
+        dirs.get(2).getPath().toUri().getPath());
+    Assert.assertEquals(pId20.toUri().getPath(),
+        dirs.get(3).getPath().toUri().getPath());
+    Assert.assertEquals(pId18.toUri().getPath(),
+        dirs.get(4).getPath().toUri().getPath());
+    Assert.assertEquals(pId19.toUri().getPath(),
+        dirs.get(5).getPath().toUri().getPath());
+    Assert.assertEquals(pId23.toUri().getPath(),
+        dirs.get(6).getPath().toUri().getPath());
+    Assert.assertEquals(pId21.toUri().getPath(),
+        dirs.get(7).getPath().toUri().getPath());
+    Assert.assertEquals(pId22.toUri().getPath(),
+        dirs.get(8).getPath().toUri().getPath());
+    Assert.assertEquals(pId08.toUri().getPath(),
+        dirs.get(9).getPath().toUri().getPath());
+    Assert.assertEquals(pId06.toUri().getPath(),
+        dirs.get(10).getPath().toUri().getPath());
+    Assert.assertEquals(pId07.toUri().getPath(),
+        dirs.get(11).getPath().toUri().getPath());
+    Assert.assertEquals(pId02.toUri().getPath(),
+        dirs.get(12).getPath().toUri().getPath());
+    Assert.assertEquals(pId00.toUri().getPath(),
+        dirs.get(13).getPath().toUri().getPath());
+  }
+
+  private void clearDir(FileContext fc, Path p) throws IOException {
+    try {
+      fc.delete(p, true);
+    } catch (FileNotFoundException e) {
+      // ignore
+    }
+    fc.mkdir(p, FsPermission.getDirDefault(), false);
+  }
+
+  private Path createPath(FileContext fc, Path root, int year, int month,
+      int day, String id) throws IOException {
+    Path path = new Path(root, year + Path.SEPARATOR + month + Path.SEPARATOR
+        + day + Path.SEPARATOR + id);
+    fc.mkdir(path, FsPermission.getDirDefault(), true);
+    return path;
+  }
+
+  private Path createPath(FileContext fc, Path root, String year, String month,
+      String day, String id) throws IOException {
+    Path path = new Path(root, year + Path.SEPARATOR + month + Path.SEPARATOR
+        + day + Path.SEPARATOR + id);
+    fc.mkdir(path, FsPermission.getDirDefault(), true);
+    return path;
+  }
+}
diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/main/java/org/apache/hadoop/mapreduce/v2/hs/HistoryFileManager.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/main/java/org/apache/hadoop/mapreduce/v2/hs/HistoryFileManager.java
index 2394880520..d086100240 100644
--- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/main/java/org/apache/hadoop/mapreduce/v2/hs/HistoryFileManager.java
+++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/main/java/org/apache/hadoop/mapreduce/v2/hs/HistoryFileManager.java
@@ -924,6 +924,11 @@ private void deleteJobFromDone(HistoryFileInfo fileInfo) throws IOException {
     fileInfo.delete();
   }
 
+  List<FileStatus> getHistoryDirsForCleaning(long cutoff) throws IOException {
+    return JobHistoryUtils.
+        getHistoryDirsForCleaning(doneDirFc, doneDirPrefixPath, cutoff);
+  }
+
   /**
    * Clean up older history files.
    *
@@ -932,12 +937,9 @@ private void deleteJobFromDone(HistoryFileInfo fileInfo) throws IOException {
    */
   @SuppressWarnings("unchecked")
   void clean() throws IOException {
-    // TODO this should be replaced by something that knows about the directory
-    // structure and will put less of a load on HDFS.
     long cutoff = System.currentTimeMillis() - maxHistoryAge;
     boolean halted = false;
-    // TODO Delete YYYY/MM/DD directories.
-    List<FileStatus> serialDirList = findTimestampedDirectories();
+    List<FileStatus> serialDirList = getHistoryDirsForCleaning(cutoff);
     // Sort in ascending order. Relies on YYYY/MM/DD/Serial
     Collections.sort(serialDirList);
     for (FileStatus serialDir : serialDirList) {
diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/test/java/org/apache/hadoop/mapreduce/v2/hs/TestJobHistory.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/test/java/org/apache/hadoop/mapreduce/v2/hs/TestJobHistory.java
index 2e6d4cebda..de0de7dfda 100644
--- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/test/java/org/apache/hadoop/mapreduce/v2/hs/TestJobHistory.java
+++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-hs/src/test/java/org/apache/hadoop/mapreduce/v2/hs/TestJobHistory.java
@@ -37,6 +37,7 @@
 import org.apache.hadoop.mapreduce.v2.jobhistory.JobHistoryUtils;
 import org.junit.After;
 import org.junit.Test;
+import org.mockito.Mockito;
 
 import static org.junit.Assert.assertEquals;
 import static org.mockito.Mockito.*;
@@ -175,7 +176,8 @@ public void testRefreshJobRetentionSettings() throws IOException,
     doReturn(list2).when(historyManager).scanDirectoryForHistoryFiles(
         eq(donePathToday), any(FileContext.class));
 
-    doReturn(fileStatusList).when(historyManager).findTimestampedDirectories();
+    doReturn(fileStatusList).when(historyManager)
+        .getHistoryDirsForCleaning(Mockito.anyLong());
     doReturn(true).when(historyManager).deleteDir(any(FileStatus.class));
 
     JobListCache jobListCache = mock(JobListCache.class);
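
Reviewer note: the heart of this patch is the three-level lexicographic comparison of a directory's (year, month, day) against the cutoff, which lets the cleaner skip whole YYYY/MM/DD subtrees instead of statting every serial directory. The standalone sketch below is not code from the patch; it restates the nested conditions of getHistoryDirsForCleaning as a single predicate (the isOldEnough helper is a hypothetical name, plain JDK only, no Hadoop types) and replays the boundary cases the in-code comments call out:

import java.util.Calendar;

public class CleaningCutoffSketch {

  // Equivalent to the nested year/month/date checks in the patch: a
  // YYYY/MM/DD directory may hold expired history iff its (year, month, day)
  // triple sorts lexicographically at or before the cutoff's triple.
  static boolean isOldEnough(int year, int month, int day, long cutoff) {
    Calendar c = Calendar.getInstance();
    c.setTimeInMillis(cutoff);
    int cYear = c.get(Calendar.YEAR);
    int cMonth = c.get(Calendar.MONTH) + 1; // Calendar months are 0-based
    int cDate = c.get(Calendar.DATE);
    return year < cYear
        || (year == cYear && month < cMonth)
        || (year == cYear && month == cMonth && day <= cDate);
  }

  public static void main(String[] args) {
    Calendar cutoff = Calendar.getInstance();
    // 07/2013 predates a 06/2014 cutoff even though 7 > 6; comparing months
    // in isolation would wrongly skip it (first comment in the patch).
    cutoff.set(2014, Calendar.JUNE, 20, 0, 0);
    System.out.println(isOldEnough(2013, 7, 21, cutoff.getTimeInMillis())); // true
    // 07/21/2013 predates an 08/20/2013 cutoff even though 21 > 20
    // (second comment in the patch).
    cutoff.set(2013, Calendar.AUGUST, 20, 0, 0);
    System.out.println(isOldEnough(2013, 7, 21, cutoff.getTimeInMillis())); // true
    // ...but 07/21/2013 is after a 06/20/2013 cutoff, so it must be kept.
    cutoff.set(2013, Calendar.JUNE, 20, 0, 0);
    System.out.println(isOldEnough(2013, 7, 21, cutoff.getTimeInMillis())); // false
  }
}

The same predicate explains the expected count in TestJobHistoryUtils: with a cutoff of 2013/7/21, exactly the 14 serial directories whose (year, month, day) sorts at or before that triple pass the filter, in the sorted order the assertions check.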