MAPREDUCE-5875. Make Counter limits consistent across JobClient, MRAppMaster, and YarnChild. (Gera Shegalov via kasha)

This commit is contained in:
Karthik Kambatla 2014-10-11 22:48:47 -07:00
parent ac64ff77cf
commit e8a31f2e1c
8 changed files with 127 additions and 22 deletions

View File

@ -420,6 +420,9 @@ Release 2.6.0 - UNRELEASED
MAPREDUCE-6123. TestCombineFileInputFormat incorrectly starts 2 MAPREDUCE-6123. TestCombineFileInputFormat incorrectly starts 2
MiniDFSCluster instances. (cnauroth) MiniDFSCluster instances. (cnauroth)
MAPREDUCE-5875. Make Counter limits consistent across JobClient,
MRAppMaster, and YarnChild. (Gera Shegalov via kasha)
Release 2.5.1 - 2014-09-05 Release 2.5.1 - 2014-09-05
INCOMPATIBLE CHANGES INCOMPATIBLE CHANGES

View File

@ -54,6 +54,7 @@
import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID; import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.TypeConverter; import org.apache.hadoop.mapreduce.TypeConverter;
import org.apache.hadoop.mapreduce.counters.Limits;
import org.apache.hadoop.mapreduce.jobhistory.AMStartedEvent; import org.apache.hadoop.mapreduce.jobhistory.AMStartedEvent;
import org.apache.hadoop.mapreduce.jobhistory.EventReader; import org.apache.hadoop.mapreduce.jobhistory.EventReader;
import org.apache.hadoop.mapreduce.jobhistory.EventType; import org.apache.hadoop.mapreduce.jobhistory.EventType;
@ -1107,6 +1108,8 @@ protected void serviceStart() throws Exception {
// finally set the job classloader // finally set the job classloader
MRApps.setClassLoader(jobClassLoader, getConfig()); MRApps.setClassLoader(jobClassLoader, getConfig());
// initialize the counter limits from the configuration
Limits.init(getConfig());
if (initFailed) { if (initFailed) {
JobEvent initFailedEvent = new JobEvent(job.getID(), JobEventType.JOB_INIT_FAILED); JobEvent initFailedEvent = new JobEvent(job.getID(), JobEventType.JOB_INIT_FAILED);

View File

@ -182,15 +182,15 @@ public FileSystem run() throws IOException, InterruptedException {
public Job getJob(JobID jobId) throws IOException, InterruptedException { public Job getJob(JobID jobId) throws IOException, InterruptedException {
JobStatus status = client.getJobStatus(jobId); JobStatus status = client.getJobStatus(jobId);
if (status != null) { if (status != null) {
JobConf conf; final JobConf conf = new JobConf();
final Path jobPath = new Path(client.getFilesystemName(),
status.getJobFile());
final FileSystem fs = FileSystem.get(jobPath.toUri(), getConf());
try { try {
conf = new JobConf(status.getJobFile()); conf.addResource(fs.open(jobPath), jobPath.toString());
} catch (RuntimeException ex) { } catch (FileNotFoundException fnf) {
// If job file doesn't exist it means we can't find the job if (LOG.isWarnEnabled()) {
if (ex.getCause() instanceof FileNotFoundException) { LOG.warn("Job conf missing on cluster", fnf);
return null;
} else {
throw ex;
} }
} }
return Job.getInstance(this, status, conf); return Job.getInstance(this, status, conf);

View File

@ -50,6 +50,7 @@
import static org.apache.hadoop.mapred.QueueManager.toFullPropertyName; import static org.apache.hadoop.mapred.QueueManager.toFullPropertyName;
import org.apache.hadoop.mapreduce.counters.Limits;
import org.apache.hadoop.mapreduce.filecache.ClientDistributedCacheManager; import org.apache.hadoop.mapreduce.filecache.ClientDistributedCacheManager;
import org.apache.hadoop.mapreduce.filecache.DistributedCache; import org.apache.hadoop.mapreduce.filecache.DistributedCache;
import org.apache.hadoop.mapreduce.protocol.ClientProtocol; import org.apache.hadoop.mapreduce.protocol.ClientProtocol;
@ -437,6 +438,7 @@ JobStatus submitJobInternal(Job job, Cluster cluster)
// Write job file to submit dir // Write job file to submit dir
writeConf(conf, submitJobFile); writeConf(conf, submitJobFile);
Limits.reset(conf);
// //
// Now, actually submit the job (using the submit name) // Now, actually submit the job (using the submit name)

View File

@ -123,4 +123,9 @@ public synchronized void checkGroups(int size) {
public synchronized LimitExceededException violation() { public synchronized LimitExceededException violation() {
return firstViolation; return firstViolation;
} }
/**
 * Forces the limits to be initialized again from the given configuration.
 *
 * Clears the {@code isInited} flag and then calls {@link #init(Configuration)}.
 * NOTE(review): presumably {@code init} does nothing once {@code isInited} is
 * set, which is why the flag is cleared first -- confirm against {@code init}.
 *
 * @param conf configuration handed to {@code init}
 */
public static synchronized void reset(Configuration conf) {
isInited = false;
init(conf);
}
} }

View File

@ -17,6 +17,7 @@
*/ */
package org.apache.hadoop.mapreduce.jobhistory; package org.apache.hadoop.mapreduce.jobhistory;
import java.io.FileNotFoundException;
import java.io.IOException; import java.io.IOException;
import java.text.DecimalFormat; import java.text.DecimalFormat;
import java.text.Format; import java.text.Format;
@ -29,6 +30,8 @@
import java.util.Set; import java.util.Set;
import java.util.TreeSet; import java.util.TreeSet;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
@ -41,6 +44,7 @@
import org.apache.hadoop.mapreduce.TaskAttemptID; import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.TaskID; import org.apache.hadoop.mapreduce.TaskID;
import org.apache.hadoop.mapreduce.TaskType; import org.apache.hadoop.mapreduce.TaskType;
import org.apache.hadoop.mapreduce.counters.Limits;
import org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser.JobInfo; import org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser.JobInfo;
import org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser.TaskInfo; import org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser.TaskInfo;
import org.apache.hadoop.mapreduce.util.HostUtil; import org.apache.hadoop.mapreduce.util.HostUtil;
@ -54,7 +58,8 @@
@InterfaceAudience.Private @InterfaceAudience.Private
@InterfaceStability.Unstable @InterfaceStability.Unstable
public class HistoryViewer { public class HistoryViewer {
private static SimpleDateFormat dateFormat = private static final Log LOG = LogFactory.getLog(HistoryViewer.class);
private static final SimpleDateFormat dateFormat =
new SimpleDateFormat("d-MMM-yyyy HH:mm:ss"); new SimpleDateFormat("d-MMM-yyyy HH:mm:ss");
private FileSystem fs; private FileSystem fs;
private JobInfo job; private JobInfo job;
@ -83,6 +88,17 @@ public HistoryViewer(String historyFile,
System.err.println("Ignore unrecognized file: " + jobFile.getName()); System.err.println("Ignore unrecognized file: " + jobFile.getName());
throw new IOException(errorMsg); throw new IOException(errorMsg);
} }
// Load the job's own configuration on top of the caller-supplied conf. It is
// looked up next to the history file, under the name
// <jobDetails[0]>_<jobDetails[1]>_<jobDetails[2]>_conf.xml.
final Path jobConfPath = new Path(jobFile.getParent(), jobDetails[0]
+ "_" + jobDetails[1] + "_" + jobDetails[2] + "_conf.xml");
final Configuration jobConf = new Configuration(conf);
try {
jobConf.addResource(fs.open(jobConfPath), jobConfPath.toString());
// Re-initialize the counter limits from the job's configuration rather than
// from the viewer's conf, so the settings loaded just above take effect.
Limits.reset(jobConf);
} catch (FileNotFoundException fnf) {
// Best effort: when the job conf is missing, the limits are left untouched.
if (LOG.isWarnEnabled()) {
LOG.warn("Missing job conf in history", fnf);
}
}
JobHistoryParser parser = new JobHistoryParser(fs, jobFile); JobHistoryParser parser = new JobHistoryParser(fs, jobFile);
job = parser.parse(); job = parser.parse();
jobId = job.getJobId().toString(); jobId = job.getJobId().toString();

View File

@ -18,6 +18,7 @@
package org.apache.hadoop.mapreduce.v2.hs; package org.apache.hadoop.mapreduce.v2.hs;
import java.io.FileNotFoundException;
import java.io.IOException; import java.io.IOException;
import java.net.UnknownHostException; import java.net.UnknownHostException;
import java.util.ArrayList; import java.util.ArrayList;
@ -34,6 +35,7 @@
import org.apache.commons.logging.Log; import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobACLsManager; import org.apache.hadoop.mapred.JobACLsManager;
import org.apache.hadoop.mapred.TaskCompletionEvent; import org.apache.hadoop.mapred.TaskCompletionEvent;
@ -41,6 +43,7 @@
import org.apache.hadoop.mapreduce.JobACL; import org.apache.hadoop.mapreduce.JobACL;
import org.apache.hadoop.mapreduce.TaskID; import org.apache.hadoop.mapreduce.TaskID;
import org.apache.hadoop.mapreduce.TypeConverter; import org.apache.hadoop.mapreduce.TypeConverter;
import org.apache.hadoop.mapreduce.counters.Limits;
import org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser; import org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser;
import org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser.JobInfo; import org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser.JobInfo;
import org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser.TaskInfo; import org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser.TaskInfo;
@ -331,9 +334,21 @@ protected synchronized void loadFullHistoryData(boolean loadTasks,
if (historyFileAbsolute != null) { if (historyFileAbsolute != null) {
JobHistoryParser parser = null; JobHistoryParser parser = null;
try { try {
final FileSystem fs = historyFileAbsolute.getFileSystem(conf);
parser = parser =
new JobHistoryParser(historyFileAbsolute.getFileSystem(conf), new JobHistoryParser(historyFileAbsolute.getFileSystem(conf),
historyFileAbsolute); historyFileAbsolute);
final Path jobConfPath = new Path(historyFileAbsolute.getParent(),
JobHistoryUtils.getIntermediateConfFileName(jobId));
final Configuration conf = new Configuration();
try {
conf.addResource(fs.open(jobConfPath), jobConfPath.toString());
Limits.reset(conf);
} catch (FileNotFoundException fnf) {
if (LOG.isWarnEnabled()) {
LOG.warn("Missing job conf in history", fnf);
}
}
this.jobInfo = parser.parse(); this.jobInfo = parser.parse();
} catch (IOException e) { } catch (IOException e) {
throw new YarnRuntimeException("Could not load history file " throw new YarnRuntimeException("Could not load history file "

View File

@ -53,10 +53,14 @@
import org.apache.hadoop.fs.permission.FsPermission; import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.hdfs.MiniDFSCluster; import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.io.IOUtils; import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobID;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.TaskLog; import org.apache.hadoop.mapred.TaskLog;
import org.apache.hadoop.mapreduce.Counters; import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Job;
@ -105,6 +109,7 @@ public class TestMRJobs {
EnumSet.of(RMAppState.FINISHED, RMAppState.FAILED, RMAppState.KILLED); EnumSet.of(RMAppState.FINISHED, RMAppState.FAILED, RMAppState.KILLED);
private static final int NUM_NODE_MGRS = 3; private static final int NUM_NODE_MGRS = 3;
private static final String TEST_IO_SORT_MB = "11"; private static final String TEST_IO_SORT_MB = "11";
private static final String TEST_GROUP_MAX = "200";
protected static MiniMRYarnCluster mrCluster; protected static MiniMRYarnCluster mrCluster;
protected static MiniDFSCluster dfsCluster; protected static MiniDFSCluster dfsCluster;
@ -213,31 +218,58 @@ public void testSleepJob() throws IOException, InterruptedException,
} }
@Test(timeout = 300000) @Test(timeout = 300000)
public void testJobClassloader() throws IOException, InterruptedException, public void testConfVerificationWithClassloader() throws Exception {
ClassNotFoundException { testConfVerification(true, false, false, false);
testJobClassloader(false);
} }
@Test(timeout = 300000) @Test(timeout = 300000)
public void testJobClassloaderWithCustomClasses() throws IOException, public void testConfVerificationWithClassloaderCustomClasses()
InterruptedException, ClassNotFoundException { throws Exception {
testJobClassloader(true); testConfVerification(true, true, false, false);
} }
private void testJobClassloader(boolean useCustomClasses) throws IOException, @Test(timeout = 300000)
InterruptedException, ClassNotFoundException { public void testConfVerificationWithOutClassloader() throws Exception {
LOG.info("\n\n\nStarting testJobClassloader()" testConfVerification(false, false, false, false);
+ " useCustomClasses=" + useCustomClasses); }
@Test(timeout = 300000)
public void testConfVerificationWithJobClient() throws Exception {
testConfVerification(false, false, true, false);
}
@Test(timeout = 300000)
public void testConfVerificationWithJobClientLocal() throws Exception {
testConfVerification(false, false, true, true);
}
private void testConfVerification(boolean useJobClassLoader,
boolean useCustomClasses, boolean useJobClientForMonitring,
boolean useLocal) throws Exception {
LOG.info("\n\n\nStarting testConfVerification()"
+ " jobClassloader=" + useJobClassLoader
+ " customClasses=" + useCustomClasses
+ " jobClient=" + useJobClientForMonitring
+ " localMode=" + useLocal);
if (!(new File(MiniMRYarnCluster.APPJAR)).exists()) { if (!(new File(MiniMRYarnCluster.APPJAR)).exists()) {
LOG.info("MRAppJar " + MiniMRYarnCluster.APPJAR LOG.info("MRAppJar " + MiniMRYarnCluster.APPJAR
+ " not found. Not running test."); + " not found. Not running test.");
return; return;
} }
final Configuration sleepConf = new Configuration(mrCluster.getConfig()); final Configuration clusterConfig;
if (useLocal) {
clusterConfig = new Configuration();
conf.set(MRConfig.FRAMEWORK_NAME, MRConfig.LOCAL_FRAMEWORK_NAME);
} else {
clusterConfig = mrCluster.getConfig();
}
final JobClient jc = new JobClient(clusterConfig);
final Configuration sleepConf = new Configuration(clusterConfig);
// set master address to local to test that local mode applied iff framework == local // set master address to local to test that local mode applied iff framework == local
sleepConf.set(MRConfig.MASTER_ADDRESS, "local"); sleepConf.set(MRConfig.MASTER_ADDRESS, "local");
sleepConf.setBoolean(MRJobConfig.MAPREDUCE_JOB_CLASSLOADER, true); sleepConf.setBoolean(MRJobConfig.MAPREDUCE_JOB_CLASSLOADER,
useJobClassLoader);
if (useCustomClasses) { if (useCustomClasses) {
// to test AM loading user classes such as output format class, we want // to test AM loading user classes such as output format class, we want
// to blacklist them from the system classes (they need to be prepended // to blacklist them from the system classes (they need to be prepended
@ -255,6 +287,7 @@ private void testJobClassloader(boolean useCustomClasses) throws IOException,
sleepConf.set(MRJobConfig.MAP_LOG_LEVEL, Level.ALL.toString()); sleepConf.set(MRJobConfig.MAP_LOG_LEVEL, Level.ALL.toString());
sleepConf.set(MRJobConfig.REDUCE_LOG_LEVEL, Level.ALL.toString()); sleepConf.set(MRJobConfig.REDUCE_LOG_LEVEL, Level.ALL.toString());
sleepConf.set(MRJobConfig.MAP_JAVA_OPTS, "-verbose:class"); sleepConf.set(MRJobConfig.MAP_JAVA_OPTS, "-verbose:class");
sleepConf.set(MRJobConfig.COUNTER_GROUPS_MAX_KEY, TEST_GROUP_MAX);
final SleepJob sleepJob = new SleepJob(); final SleepJob sleepJob = new SleepJob();
sleepJob.setConf(sleepConf); sleepJob.setConf(sleepConf);
final Job job = sleepJob.createJob(1, 1, 10, 1, 10, 1); final Job job = sleepJob.createJob(1, 1, 10, 1, 10, 1);
@ -272,7 +305,26 @@ private void testJobClassloader(boolean useCustomClasses) throws IOException,
jobConf.setBoolean(MRJobConfig.MAP_SPECULATIVE, true); jobConf.setBoolean(MRJobConfig.MAP_SPECULATIVE, true);
} }
job.submit(); job.submit();
boolean succeeded = job.waitForCompletion(true); final boolean succeeded;
if (useJobClientForMonitring && !useLocal) {
// We can't use getJobID in useLocal case because JobClient and Job
// point to different instances of LocalJobRunner
//
final JobID mapredJobID = JobID.downgrade(job.getJobID());
RunningJob runningJob = null;
do {
Thread.sleep(10);
runningJob = jc.getJob(mapredJobID);
} while (runningJob == null);
Assert.assertEquals("Unexpected RunningJob's "
+ MRJobConfig.COUNTER_GROUPS_MAX_KEY,
TEST_GROUP_MAX, runningJob.getConfiguration()
.get(MRJobConfig.COUNTER_GROUPS_MAX_KEY));
runningJob.waitForCompletion();
succeeded = runningJob.isSuccessful();
} else {
succeeded = job.waitForCompletion(true);
}
Assert.assertTrue("Job status: " + job.getStatus().getFailureInfo(), Assert.assertTrue("Job status: " + job.getStatus().getFailureInfo(),
succeeded); succeeded);
} }
@ -925,5 +977,14 @@ protected void setup(Context context)
+ ", actual: " + ioSortMb); + ", actual: " + ioSortMb);
} }
} }
/**
 * Delegates to the parent mapper, then increments the "testCounter" counter
 * in each of 100 distinct groups named "testCounterGroup-0" through
 * "testCounterGroup-99".
 *
 * NOTE(review): presumably this exists to exercise the counter-group limit
 * that the test raises via TEST_GROUP_MAX -- confirm against the test setup.
 */
@Override
public void map(IntWritable key, IntWritable value, Context context)
throws IOException, InterruptedException {
super.map(key, value, context);
final int groupCount = 100;
int groupIndex = 0;
while (groupIndex < groupCount) {
final String groupName = "testCounterGroup-" + groupIndex;
context.getCounter(groupName, "testCounter").increment(1);
groupIndex++;
}
}
} }
} }