MAPREDUCE-3481. [Gridmix] Improve Gridmix STRESS mode. (amarrk)

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1237543 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Amar Kamat 2012-01-30 06:20:53 +00:00
parent 98302971c2
commit 5652e71992
5 changed files with 193 additions and 70 deletions

View File

@ -12,6 +12,8 @@ Trunk (unreleased changes)
(Plamen Jeliazkov via shv) (Plamen Jeliazkov via shv)
IMPROVEMENTS IMPROVEMENTS
MAPREDUCE-3481. [Gridmix] Improve Gridmix STRESS mode. (amarrk)
MAPREDUCE-3597. [Rumen] Rumen should provide APIs to access all the MAPREDUCE-3597. [Rumen] Rumen should provide APIs to access all the
job-history related information. job-history related information.

View File

@ -101,13 +101,15 @@ public void addJobStats(Job job, JobStory jobdesc) {
} }
int maps = 0; int maps = 0;
int reds = 0;
if (jobdesc == null) { if (jobdesc == null) {
throw new IllegalArgumentException( throw new IllegalArgumentException(
" JobStory not available for job " + job.getJobName()); " JobStory not available for job " + job.getJobName());
} else { } else {
maps = jobdesc.getNumberMaps(); maps = jobdesc.getNumberMaps();
reds = jobdesc.getNumberReduces();
} }
JobStats stats = new JobStats(maps,job); JobStats stats = new JobStats(maps, reds, job);
jobMaps.put(seq,stats); jobMaps.put(seq,stats);
} }
@ -258,15 +260,20 @@ public void abort() {
*/ */
static class JobStats { static class JobStats {
private int noOfMaps; private int noOfMaps;
private int noOfReds;
private Job job; private Job job;
public JobStats(int noOfMaps,Job job){ public JobStats(int noOfMaps,int numOfReds, Job job){
this.job = job; this.job = job;
this.noOfMaps = noOfMaps; this.noOfMaps = noOfMaps;
this.noOfReds = numOfReds;
} }
public int getNoOfMaps() { public int getNoOfMaps() {
return noOfMaps; return noOfMaps;
} }
public int getNoOfReds() {
return noOfReds;
}
/** /**
* Returns the job , * Returns the job ,

View File

@ -31,13 +31,12 @@
import java.io.IOException; import java.io.IOException;
import java.util.concurrent.CountDownLatch; import java.util.concurrent.CountDownLatch;
import java.util.concurrent.locks.Condition; import java.util.concurrent.atomic.AtomicBoolean;
public class StressJobFactory extends JobFactory<Statistics.ClusterStats> { public class StressJobFactory extends JobFactory<Statistics.ClusterStats> {
public static final Log LOG = LogFactory.getLog(StressJobFactory.class); public static final Log LOG = LogFactory.getLog(StressJobFactory.class);
private final LoadStatus loadStatus = new LoadStatus(); private final LoadStatus loadStatus = new LoadStatus();
private final Condition condUnderloaded = this.lock.newCondition();
/** /**
* The minimum ratio between pending+running map tasks (aka. incomplete map * The minimum ratio between pending+running map tasks (aka. incomplete map
* tasks) and cluster map slot capacity for us to consider the cluster is * tasks) and cluster map slot capacity for us to consider the cluster is
@ -150,23 +149,32 @@ public void run() {
} }
LOG.info("START STRESS @ " + System.currentTimeMillis()); LOG.info("START STRESS @ " + System.currentTimeMillis());
while (!Thread.currentThread().isInterrupted()) { while (!Thread.currentThread().isInterrupted()) {
lock.lock();
try { try {
while (loadStatus.overloaded()) { while (loadStatus.overloaded()) {
//Wait while JT is overloaded. if (LOG.isDebugEnabled()) {
LOG.debug("Cluster overloaded in run! Sleeping...");
}
// sleep
try { try {
condUnderloaded.await(); Thread.sleep(1000);
} catch (InterruptedException ie) { } catch (InterruptedException ie) {
return; return;
} }
} }
while (!loadStatus.overloaded()) { while (!loadStatus.overloaded()) {
if (LOG.isDebugEnabled()) {
LOG.debug("Cluster underloaded in run! Stressing...");
}
try { try {
//TODO This in-line read can block submission for large jobs.
final JobStory job = getNextJobFiltered(); final JobStory job = getNextJobFiltered();
if (null == job) { if (null == job) {
return; return;
} }
if (LOG.isDebugEnabled()) {
LOG.debug("Job Selected: " + job.getJobID());
}
submitter.add( submitter.add(
jobCreator.createGridmixJob( jobCreator.createGridmixJob(
conf, 0L, job, scratch, conf, 0L, job, scratch,
@ -175,14 +183,20 @@ public void run() {
sequence.getAndIncrement())); sequence.getAndIncrement()));
// TODO: We need to take care of scenario when one map/reduce // TODO: We need to take care of scenario when one map/reduce
// takes more than 1 slot. // takes more than 1 slot.
loadStatus.mapSlotsBackfill -=
calcEffectiveIncompleteMapTasks( // Lock the loadjob as we are making updates
loadStatus.mapSlotCapacity, job.getNumberMaps(), 0.0f); int incompleteMapTasks = (int) calcEffectiveIncompleteMapTasks(
loadStatus.reduceSlotsBackfill -= loadStatus.getMapCapacity(),
calcEffectiveIncompleteReduceTasks( job.getNumberMaps(), 0.0f);
loadStatus.reduceSlotCapacity, job.getNumberReduces(), loadStatus.decrementMapLoad(incompleteMapTasks);
0.0f);
--loadStatus.numJobsBackfill; int incompleteReduceTasks =
(int) calcEffectiveIncompleteReduceTasks(
loadStatus.getReduceCapacity(),
job.getNumberReduces(), 0.0f);
loadStatus.decrementReduceLoad(incompleteReduceTasks);
loadStatus.decrementJobLoad(1);
} catch (IOException e) { } catch (IOException e) {
LOG.error("Error while submitting the job ", e); LOG.error("Error while submitting the job ", e);
error = e; error = e;
@ -191,7 +205,7 @@ public void run() {
} }
} finally { } finally {
lock.unlock(); // do nothing
} }
} }
} catch (InterruptedException e) { } catch (InterruptedException e) {
@ -210,19 +224,11 @@ public void run() {
*/ */
@Override @Override
public void update(Statistics.ClusterStats item) { public void update(Statistics.ClusterStats item) {
lock.lock(); ClusterStatus clusterMetrics = item.getStatus();
try { try {
ClusterStatus clusterMetrics = item.getStatus(); checkLoadAndGetSlotsToBackfill(item, clusterMetrics);
try { } catch (Exception e) {
checkLoadAndGetSlotsToBackfill(item,clusterMetrics); LOG.error("Couldn't get the new Status",e);
} catch (Exception e) {
LOG.error("Couldn't get the new Status",e);
}
if (!loadStatus.overloaded()) {
condUnderloaded.signalAll();
}
} finally {
lock.unlock();
} }
} }
@ -254,18 +260,25 @@ float calcEffectiveIncompleteReduceTasks(int reduceSlotCapacity,
*/ */
private void checkLoadAndGetSlotsToBackfill( private void checkLoadAndGetSlotsToBackfill(
ClusterStats stats, ClusterStatus clusterStatus) throws IOException, InterruptedException { ClusterStats stats, ClusterStatus clusterStatus) throws IOException, InterruptedException {
loadStatus.mapSlotCapacity = clusterStatus.getMaxMapTasks();
loadStatus.reduceSlotCapacity = clusterStatus.getMaxReduceTasks();
// update the max cluster capacity in case it has changed
int mapCapacity = clusterStatus.getMaxMapTasks();
loadStatus.updateMapCapacity(mapCapacity);
loadStatus.numJobsBackfill = int reduceCapacity = clusterStatus.getMaxReduceTasks();
(int) (maxJobTrackerRatio * clusterStatus.getTaskTrackers())
- stats.getNumRunningJob(); loadStatus.updateReduceCapacity(reduceCapacity);
if (loadStatus.numJobsBackfill <= 0) {
int numTrackers = clusterStatus.getTaskTrackers();
int jobLoad =
(int) (maxJobTrackerRatio * numTrackers) - stats.getNumRunningJob();
loadStatus.updateJobLoad(jobLoad);
if (loadStatus.getJobLoad() <= 0) {
if (LOG.isDebugEnabled()) { if (LOG.isDebugEnabled()) {
LOG.debug(System.currentTimeMillis() + " Overloaded is " LOG.debug(System.currentTimeMillis() + " [JobLoad] Overloaded is "
+ Boolean.TRUE.toString() + " NumJobsBackfill is " + Boolean.TRUE.toString() + " NumJobsBackfill is "
+ loadStatus.numJobsBackfill); + loadStatus.getJobLoad());
} }
return; // stop calculation because we know it is overloaded. return; // stop calculation because we know it is overloaded.
} }
@ -275,56 +288,84 @@ private void checkLoadAndGetSlotsToBackfill(
float mapProgress = job.getJob().mapProgress(); float mapProgress = job.getJob().mapProgress();
int noOfMaps = job.getNoOfMaps(); int noOfMaps = job.getNoOfMaps();
incompleteMapTasks += incompleteMapTasks +=
calcEffectiveIncompleteMapTasks( calcEffectiveIncompleteMapTasks(mapCapacity, noOfMaps, mapProgress);
clusterStatus.getMaxMapTasks(), noOfMaps, mapProgress);
} }
loadStatus.mapSlotsBackfill =
(int) ((overloadMapTaskMapSlotRatio * clusterStatus.getMaxMapTasks()) int mapSlotsBackFill =
- incompleteMapTasks); (int) ((overloadMapTaskMapSlotRatio * mapCapacity) - incompleteMapTasks);
if (loadStatus.mapSlotsBackfill <= 0) { loadStatus.updateMapLoad(mapSlotsBackFill);
if (loadStatus.getMapLoad() <= 0) {
if (LOG.isDebugEnabled()) { if (LOG.isDebugEnabled()) {
LOG.debug(System.currentTimeMillis() + " Overloaded is " LOG.debug(System.currentTimeMillis() + " [MAP-LOAD] Overloaded is "
+ Boolean.TRUE.toString() + " MapSlotsBackfill is " + Boolean.TRUE.toString() + " MapSlotsBackfill is "
+ loadStatus.mapSlotsBackfill); + loadStatus.getMapLoad());
} }
return; // stop calculation because we know it is overloaded. return; // stop calculation because we know it is overloaded.
} }
float incompleteReduceTasks = 0; // include pending & running reduce tasks. float incompleteReduceTasks = 0; // include pending & running reduce tasks.
for (JobStats job : ClusterStats.getRunningJobStats()) { for (JobStats job : ClusterStats.getRunningJobStats()) {
int noOfReduces = job.getJob().getNumReduceTasks(); // Cached the num-reds value in JobStats
int noOfReduces = job.getNoOfReds();
if (noOfReduces > 0) { if (noOfReduces > 0) {
float reduceProgress = job.getJob().reduceProgress(); float reduceProgress = job.getJob().reduceProgress();
incompleteReduceTasks += incompleteReduceTasks +=
calcEffectiveIncompleteReduceTasks( calcEffectiveIncompleteReduceTasks(reduceCapacity, noOfReduces,
clusterStatus.getMaxReduceTasks(), noOfReduces, reduceProgress); reduceProgress);
} }
} }
loadStatus.reduceSlotsBackfill =
(int) ((overloadReduceTaskReduceSlotRatio * clusterStatus.getMaxReduceTasks()) int reduceSlotsBackFill =
(int)((overloadReduceTaskReduceSlotRatio * reduceCapacity)
- incompleteReduceTasks); - incompleteReduceTasks);
if (loadStatus.reduceSlotsBackfill <= 0) { loadStatus.updateReduceLoad(reduceSlotsBackFill);
if (loadStatus.getReduceLoad() <= 0) {
if (LOG.isDebugEnabled()) { if (LOG.isDebugEnabled()) {
LOG.debug(System.currentTimeMillis() + " Overloaded is " LOG.debug(System.currentTimeMillis() + " [REDUCE-LOAD] Overloaded is "
+ Boolean.TRUE.toString() + " ReduceSlotsBackfill is " + Boolean.TRUE.toString() + " ReduceSlotsBackfill is "
+ loadStatus.reduceSlotsBackfill); + loadStatus.getReduceLoad());
} }
return; // stop calculation because we know it is overloaded. return; // stop calculation because we know it is overloaded.
} }
if (LOG.isDebugEnabled()) { if (LOG.isDebugEnabled()) {
LOG.debug(System.currentTimeMillis() + " Overloaded is " LOG.debug(System.currentTimeMillis() + " [OVERALL] Overloaded is "
+ Boolean.FALSE.toString() + "Current load Status is " + Boolean.FALSE.toString() + "Current load Status is "
+ loadStatus); + loadStatus);
} }
} }
static class LoadStatus { static class LoadStatus {
int mapSlotsBackfill; /**
int mapSlotCapacity; * Additional number of map slots that can be requested before
int reduceSlotsBackfill; * declaring (by Gridmix STRESS mode) the cluster as overloaded.
int reduceSlotCapacity; */
int numJobsBackfill; private volatile int mapSlotsBackfill;
/**
* Determines the total map slot capacity of the cluster.
*/
private volatile int mapSlotCapacity;
/**
* Additional number of reduce slots that can be requested before
* declaring (by Gridmix STRESS mode) the cluster as overloaded.
*/
private volatile int reduceSlotsBackfill;
/**
* Determines the total reduce slot capacity of the cluster.
*/
private volatile int reduceSlotCapacity;
/**
* Determines the max count of running jobs in the cluster.
*/
private volatile int numJobsBackfill;
// set the default to true
private AtomicBoolean overloaded = new AtomicBoolean(true);
/** /**
* Construct the LoadStatus in an unknown state - assuming the cluster is * Construct the LoadStatus in an unknown state - assuming the cluster is
@ -339,12 +380,76 @@ static class LoadStatus {
reduceSlotCapacity = -1; reduceSlotCapacity = -1;
} }
public boolean overloaded() { public synchronized int getMapLoad() {
return (mapSlotsBackfill <= 0) || (reduceSlotsBackfill <= 0) return mapSlotsBackfill;
|| (numJobsBackfill <= 0);
} }
public String toString() { public synchronized int getMapCapacity() {
return mapSlotCapacity;
}
public synchronized int getReduceLoad() {
return reduceSlotsBackfill;
}
public synchronized int getReduceCapacity() {
return reduceSlotCapacity;
}
public synchronized int getJobLoad() {
return numJobsBackfill;
}
public synchronized void decrementMapLoad(int mapSlotsConsumed) {
this.mapSlotsBackfill -= mapSlotsConsumed;
updateOverloadStatus();
}
public synchronized void decrementReduceLoad(int reduceSlotsConsumed) {
this.reduceSlotsBackfill -= reduceSlotsConsumed;
updateOverloadStatus();
}
public synchronized void decrementJobLoad(int numJobsConsumed) {
this.numJobsBackfill -= numJobsConsumed;
updateOverloadStatus();
}
public synchronized void updateMapCapacity(int mapSlotsCapacity) {
this.mapSlotCapacity = mapSlotsCapacity;
updateOverloadStatus();
}
public synchronized void updateReduceCapacity(int reduceSlotsCapacity) {
this.reduceSlotCapacity = reduceSlotsCapacity;
updateOverloadStatus();
}
public synchronized void updateMapLoad(int mapSlotsBackfill) {
this.mapSlotsBackfill = mapSlotsBackfill;
updateOverloadStatus();
}
public synchronized void updateReduceLoad(int reduceSlotsBackfill) {
this.reduceSlotsBackfill = reduceSlotsBackfill;
updateOverloadStatus();
}
public synchronized void updateJobLoad(int numJobsBackfill) {
this.numJobsBackfill = numJobsBackfill;
updateOverloadStatus();
}
private synchronized void updateOverloadStatus() {
overloaded.set((mapSlotsBackfill <= 0) || (reduceSlotsBackfill <= 0)
|| (numJobsBackfill <= 0));
}
public synchronized boolean overloaded() {
return overloaded.get();
}
public synchronized String toString() {
// TODO Use StringBuilder instead // TODO Use StringBuilder instead
return " Overloaded = " + overloaded() return " Overloaded = " + overloaded()
+ ", MapSlotBackfill = " + mapSlotsBackfill + ", MapSlotBackfill = " + mapSlotsBackfill

View File

@ -101,10 +101,17 @@ public TestMonitor(int expected, Statistics stats) {
retiredJobs = new LinkedBlockingQueue<Job>(); retiredJobs = new LinkedBlockingQueue<Job>();
} }
public void verify(ArrayList<JobStory> submitted) throws Exception { public void verify(ArrayList<JobStory> submitted, Configuration clientConf)
throws Exception {
final ArrayList<Job> succeeded = new ArrayList<Job>(); final ArrayList<Job> succeeded = new ArrayList<Job>();
assertEquals("Bad job count", expected, retiredJobs.drainTo(succeeded)); assertEquals("Bad job count", expected, retiredJobs.drainTo(succeeded));
final HashMap<String,JobStory> sub = new HashMap<String,JobStory>(); final HashMap<String,JobStory> sub = new HashMap<String,JobStory>();
// define the input and output path for the run
final Path in = new Path("foo").makeQualified(GridmixTestUtils.dfs);
final Path out =
new Path(in, clientConf.get(Gridmix.GRIDMIX_OUT_DIR, "gridmix"));
for (JobStory spec : submitted) { for (JobStory spec : submitted) {
sub.put(spec.getJobID().toString(), spec); sub.put(spec.getJobID().toString(), spec);
} }
@ -115,8 +122,7 @@ public void verify(ArrayList<JobStory> submitted) throws Exception {
Configuration conf = job.getConfiguration(); Configuration conf = job.getConfiguration();
if (GenerateData.JOB_NAME.equals(jobName)) { if (GenerateData.JOB_NAME.equals(jobName)) {
verifyQueue(conf, jobName); verifyQueue(conf, jobName);
final Path in = new Path("foo").makeQualified(GridmixTestUtils.dfs);
final Path out = new Path("/gridmix").makeQualified(GridmixTestUtils.dfs);
final ContentSummary generated = GridmixTestUtils.dfs.getContentSummary(in); final ContentSummary generated = GridmixTestUtils.dfs.getContentSummary(in);
assertTrue("Mismatched data gen", // +/- 100k for logs assertTrue("Mismatched data gen", // +/- 100k for logs
(GENDATA << 20) < generated.getLength() + GENSLOP || (GENDATA << 20) < generated.getLength() + GENSLOP ||
@ -164,7 +170,7 @@ public void verify(ArrayList<JobStory> submitted) throws Exception {
final FileStatus stat = final FileStatus stat =
GridmixTestUtils.dfs.getFileStatus( GridmixTestUtils.dfs.getFileStatus(
new Path(GridmixTestUtils.DEST, "" + Integer.valueOf(jobSeqNum))); new Path(out, "" + Integer.valueOf(jobSeqNum)));
assertEquals("Wrong owner for " + jobName, spec.getUser(), assertEquals("Wrong owner for " + jobName, spec.getUser(),
stat.getOwner()); stat.getOwner());
@ -337,8 +343,9 @@ static class DebugGridmix extends Gridmix {
private JobFactory factory; private JobFactory factory;
private TestMonitor monitor; private TestMonitor monitor;
public void checkMonitor() throws Exception { public void checkMonitor(Configuration conf) throws Exception {
monitor.verify(((DebugJobFactory.Debuggable)factory).getSubmitted()); monitor.verify(((DebugJobFactory.Debuggable)factory).getSubmitted(),
conf);
} }
@Override @Override
@ -534,9 +541,11 @@ private void doSubmission(boolean useDefaultQueue,
GridmixTestUtils.dfs.setPermission(root, new FsPermission((short)0777)); GridmixTestUtils.dfs.setPermission(root, new FsPermission((short)0777));
int res = ToolRunner.run(conf, client, argv); int res = ToolRunner.run(conf, client, argv);
assertEquals("Client exited with nonzero status", 0, res); assertEquals("Client exited with nonzero status", 0, res);
client.checkMonitor(); client.checkMonitor(conf);
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.printStackTrace();
// fail the test if there is an exception
throw new RuntimeException(e);
} finally { } finally {
in.getFileSystem(conf).delete(in, true); in.getFileSystem(conf).delete(in, true);
out.getFileSystem(conf).delete(out, true); out.getFileSystem(conf).delete(out, true);

View File

@ -338,7 +338,7 @@ public boolean isSuccessful() throws IOException, InterruptedException {
return isSuccessful; return isSuccessful;
}; };
}; };
return new JobStats(numMaps, fakeJob); return new JobStats(numMaps, numReds, fakeJob);
} }
/** /**