From 4efdf3a979c361348612f817a3253be6d0de58f7 Mon Sep 17 00:00:00 2001 From: Xuan Date: Tue, 26 Jan 2016 18:17:12 -0800 Subject: [PATCH] YARN-4612. Fix rumen and scheduler load simulator handle killed tasks properly. Contributed by Ming Ma. --- .../apache/hadoop/tools/rumen/JobBuilder.java | 11 +- .../src/main/data/2jobs2min-rumen-jh.json | 606 ++++++++++++++++++ .../org/apache/hadoop/yarn/sls/SLSRunner.java | 6 + .../hadoop/yarn/sls/utils/SLSUtils.java | 6 + hadoop-yarn-project/CHANGES.txt | 3 + 5 files changed, 628 insertions(+), 4 deletions(-) diff --git a/hadoop-tools/hadoop-rumen/src/main/java/org/apache/hadoop/tools/rumen/JobBuilder.java b/hadoop-tools/hadoop-rumen/src/main/java/org/apache/hadoop/tools/rumen/JobBuilder.java index c5ae2fc36d..890f388d4a 100644 --- a/hadoop-tools/hadoop-rumen/src/main/java/org/apache/hadoop/tools/rumen/JobBuilder.java +++ b/hadoop-tools/hadoop-rumen/src/main/java/org/apache/hadoop/tools/rumen/JobBuilder.java @@ -473,9 +473,12 @@ private void processTaskFailedEvent(TaskFailedEvent event) { task.setTaskStatus(getPre21Value(event.getTaskStatus())); TaskFailed t = (TaskFailed)(event.getDatum()); task.putDiagnosticInfo(t.error.toString()); - task.putFailedDueToAttemptId(t.failedDueToAttempt.toString()); + // killed task wouldn't have failed attempt. + if (t.getFailedDueToAttempt() != null) { + task.putFailedDueToAttemptId(t.getFailedDueToAttempt().toString()); + } org.apache.hadoop.mapreduce.jobhistory.JhCounters counters = - ((TaskFailed) event.getDatum()).counters; + ((TaskFailed) event.getDatum()).getCounters(); task.incorporateCounters( counters == null ? EMPTY_COUNTERS : counters); } @@ -500,7 +503,7 @@ private void processTaskAttemptUnsuccessfulCompletionEvent( attempt.setFinishTime(event.getFinishTime()); org.apache.hadoop.mapreduce.jobhistory.JhCounters counters = - ((TaskAttemptUnsuccessfulCompletion) event.getDatum()).counters; + ((TaskAttemptUnsuccessfulCompletion) event.getDatum()).getCounters(); attempt.incorporateCounters( counters == null ? EMPTY_COUNTERS : counters); attempt.arraySetClockSplits(event.getClockSplits()); @@ -509,7 +512,7 @@ private void processTaskAttemptUnsuccessfulCompletionEvent( attempt.arraySetPhysMemKbytes(event.getPhysMemKbytes()); TaskAttemptUnsuccessfulCompletion t = (TaskAttemptUnsuccessfulCompletion) (event.getDatum()); - attempt.putDiagnosticInfo(t.error.toString()); + attempt.putDiagnosticInfo(t.getError().toString()); } private void processTaskAttemptStartedEvent(TaskAttemptStartedEvent event) { diff --git a/hadoop-tools/hadoop-sls/src/main/data/2jobs2min-rumen-jh.json b/hadoop-tools/hadoop-sls/src/main/data/2jobs2min-rumen-jh.json index 83629ed68e..9d90deb3c7 100644 --- a/hadoop-tools/hadoop-sls/src/main/data/2jobs2min-rumen-jh.json +++ b/hadoop-tools/hadoop-sls/src/main/data/2jobs2min-rumen-jh.json @@ -10208,4 +10208,610 @@ "clusterReduceMB" : -1, "jobMapMB" : 200, "jobReduceMB" : 200 +} { +"priority" : "NORMAL", +"jobID" : "job_1369942127770_1207", +"user" : "jenkins", +"jobName" : "TeraGen", +"submitTime" : 1371223054499, +"finishTime" : 1371223153874, +"queue" : "sls_queue_1", +"mapTasks" : [ { +"startTime" : 1371223059053, +"taskID" : "task_1369942127770_1207_m_000000", +"taskType" : "MAP", +"finishTime" : 1371223078206, +"attempts" : [ ], +"preferredLocations" : [ ], +"taskStatus" : "KILLED", +"inputBytes" : -1, +"inputRecords" : -1, +"outputBytes" : -1, +"outputRecords" : -1 +} ], +"reduceTasks" : [ ], +"launchTime" : 1371223058937, +"totalMaps" : 1, +"totalReduces" : 0, +"otherTasks" : [ ], +"jobProperties" : { +"mapreduce.job.ubertask.enable" : "false", +"yarn.resourcemanager.max-completed-applications" : "10000", +"yarn.resourcemanager.delayed.delegation-token.removal-interval-ms" : "30000", +"mapreduce.client.submit.file.replication" : "2", +"yarn.nodemanager.container-manager.thread-count" : "20", +"mapred.queue.default.acl-administer-jobs" : "*", +"dfs.image.transfer.bandwidthPerSec" : "0", +"mapreduce.tasktracker.healthchecker.interval" : "60000", +"mapreduce.jobtracker.staging.root.dir" : "/user", +"yarn.resourcemanager.recovery.enabled" : "false", +"yarn.resourcemanager.am.max-retries" : "1", +"dfs.block.access.token.lifetime" : "600", +"fs.AbstractFileSystem.file.impl" : "org.apache.hadoop.fs.local.LocalFs", +"mapreduce.client.completion.pollinterval" : "5000", +"mapreduce.job.ubertask.maxreduces" : "1", +"mapreduce.reduce.shuffle.memory.limit.percent" : "0.25", +"dfs.domain.socket.path" : "/var/run/hdfs-sockets/dn", +"hadoop.ssl.keystores.factory.class" : "org.apache.hadoop.security.ssl.FileBasedKeyStoresFactory", +"hadoop.http.authentication.kerberos.keytab" : "${user.home}/hadoop.keytab", +"yarn.nodemanager.keytab" : "/etc/krb5.keytab", +"io.seqfile.sorter.recordlimit" : "1000000", +"s3.blocksize" : "67108864", +"mapreduce.task.io.sort.factor" : "10", +"yarn.nodemanager.disk-health-checker.interval-ms" : "120000", +"mapreduce.job.working.dir" : "hdfs://a2115.smile.com:8020/user/jenkins", +"yarn.admin.acl" : "*", +"mapreduce.job.speculative.speculativecap" : "0.1", +"dfs.namenode.num.checkpoints.retained" : "2", +"dfs.namenode.delegation.token.renew-interval" : "86400000", +"yarn.nodemanager.resource.memory-mb" : "8192", +"io.map.index.interval" : "128", +"s3.client-write-packet-size" : "65536", +"mapreduce.task.files.preserve.failedtasks" : "false", +"dfs.namenode.http-address" : "a2115.smile.com:20101", +"ha.zookeeper.session-timeout.ms" : "5000", +"hadoop.hdfs.configuration.version" : "1", +"s3.replication" : "3", +"dfs.datanode.balance.bandwidthPerSec" : "1048576", +"mapreduce.reduce.shuffle.connect.timeout" : "180000", +"hadoop.ssl.enabled" : "false", +"dfs.journalnode.rpc-address" : "0.0.0.0:8485", +"yarn.nodemanager.aux-services" : "mapreduce.shuffle", +"mapreduce.job.counters.max" : "120", +"dfs.datanode.readahead.bytes" : "4193404", +"ipc.client.connect.max.retries.on.timeouts" : "45", +"mapreduce.job.complete.cancel.delegation.tokens" : "true", +"dfs.client.failover.max.attempts" : "15", +"dfs.namenode.checkpoint.dir" : "file://${hadoop.tmp.dir}/dfs/namesecondary", +"dfs.namenode.replication.work.multiplier.per.iteration" : "2", +"fs.trash.interval" : "1", +"yarn.resourcemanager.admin.address" : "a2115.smile.com:8033", +"ha.health-monitor.check-interval.ms" : "1000", +"mapreduce.job.outputformat.class" : "org.apache.hadoop.examples.terasort.TeraOutputFormat", +"hadoop.jetty.logs.serve.aliases" : "true", +"hadoop.http.authentication.kerberos.principal" : "HTTP/_HOST@LOCALHOST", +"mapreduce.job.reduce.shuffle.consumer.plugin.class" : "org.apache.hadoop.mapreduce.task.reduce.Shuffle", +"s3native.blocksize" : "67108864", +"dfs.namenode.edits.dir" : "${dfs.namenode.name.dir}", +"ha.health-monitor.sleep-after-disconnect.ms" : "1000", +"dfs.encrypt.data.transfer" : "false", +"dfs.datanode.http.address" : "0.0.0.0:50075", +"mapreduce.terasort.num-rows" : "400000000", +"mapreduce.job.map.class" : "org.apache.hadoop.examples.terasort.TeraGen$SortGenMapper", +"mapreduce.jobtracker.jobhistory.task.numberprogresssplits" : "12", +"dfs.namenode.write.stale.datanode.ratio" : "0.5f", +"dfs.client.use.datanode.hostname" : "false", +"yarn.acl.enable" : "true", +"hadoop.security.instrumentation.requires.admin" : "false", +"yarn.nodemanager.localizer.fetch.thread-count" : "4", +"hadoop.security.authorization" : "false", +"user.name" : "jenkins", +"dfs.namenode.fs-limits.min-block-size" : "1048576", +"dfs.client.failover.connection.retries.on.timeouts" : "0", +"hadoop.security.group.mapping.ldap.search.filter.group" : "(objectClass=group)", +"mapreduce.output.fileoutputformat.compress.codec" : "org.apache.hadoop.io.compress.DefaultCodec", +"dfs.namenode.safemode.extension" : "30000", +"mapreduce.shuffle.port" : "8080", +"mapreduce.reduce.log.level" : "INFO", +"yarn.log-aggregation-enable" : "false", +"dfs.datanode.sync.behind.writes" : "false", +"mapreduce.jobtracker.instrumentation" : "org.apache.hadoop.mapred.JobTrackerMetricsInst", +"dfs.https.server.keystore.resource" : "ssl-server.xml", +"hadoop.security.group.mapping.ldap.search.attr.group.name" : "cn", +"dfs.namenode.replication.min" : "1", +"mapreduce.map.java.opts" : " -Xmx825955249", +"yarn.scheduler.fair.allocation.file" : "/etc/yarn/fair-scheduler.xml", +"s3native.bytes-per-checksum" : "512", +"mapreduce.tasktracker.tasks.sleeptimebeforesigkill" : "5000", +"tfile.fs.output.buffer.size" : "262144", +"yarn.nodemanager.local-dirs" : "${hadoop.tmp.dir}/nm-local-dir", +"mapreduce.jobtracker.persist.jobstatus.active" : "false", +"fs.AbstractFileSystem.hdfs.impl" : "org.apache.hadoop.fs.Hdfs", +"mapreduce.job.map.output.collector.class" : "org.apache.hadoop.mapred.MapTask$MapOutputBuffer", +"mapreduce.tasktracker.local.dir.minspacestart" : "0", +"dfs.namenode.safemode.min.datanodes" : "0", +"hadoop.security.uid.cache.secs" : "14400", +"dfs.client.https.need-auth" : "false", +"dfs.client.write.exclude.nodes.cache.expiry.interval.millis" : "600000", +"dfs.client.https.keystore.resource" : "ssl-client.xml", +"dfs.namenode.max.objects" : "0", +"hadoop.ssl.client.conf" : "ssl-client.xml", +"dfs.namenode.safemode.threshold-pct" : "0.999f", +"mapreduce.tasktracker.local.dir.minspacekill" : "0", +"mapreduce.jobtracker.retiredjobs.cache.size" : "1000", +"dfs.blocksize" : "134217728", +"yarn.resourcemanager.scheduler.class" : "org.apache.hadoop.yarn.server.resourcemanager.scheduler.fifo.FifoScheduler", +"mapreduce.job.reduce.slowstart.completedmaps" : "0.8", +"mapreduce.job.end-notification.retry.attempts" : "5", +"mapreduce.job.inputformat.class" : "org.apache.hadoop.examples.terasort.TeraGen$RangeInputFormat", +"mapreduce.map.memory.mb" : "1024", +"mapreduce.job.user.name" : "jenkins", +"mapreduce.tasktracker.outofband.heartbeat" : "false", +"io.native.lib.available" : "true", +"mapreduce.jobtracker.persist.jobstatus.hours" : "0", +"dfs.client-write-packet-size" : "65536", +"mapreduce.client.progressmonitor.pollinterval" : "1000", +"dfs.namenode.name.dir" : "file://${hadoop.tmp.dir}/dfs/name", +"dfs.ha.log-roll.period" : "120", +"mapreduce.reduce.input.buffer.percent" : "0.0", +"mapreduce.map.output.compress.codec" : "org.apache.hadoop.io.compress.SnappyCodec", +"dfs.client.failover.sleep.base.millis" : "500", +"dfs.datanode.directoryscan.threads" : "1", +"mapreduce.jobtracker.address" : "neededForHive:999999", +"mapreduce.cluster.local.dir" : "${hadoop.tmp.dir}/mapred/local", +"yarn.scheduler.fair.user-as-default-queue" : "true", +"mapreduce.job.application.attempt.id" : "1", +"dfs.permissions.enabled" : "true", +"mapreduce.tasktracker.taskcontroller" : "org.apache.hadoop.mapred.DefaultTaskController", +"yarn.scheduler.fair.preemption" : "true", +"mapreduce.reduce.shuffle.parallelcopies" : "5", +"dfs.support.append" : "true", +"yarn.nodemanager.env-whitelist" : "JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,YARN_HOME", +"mapreduce.jobtracker.heartbeats.in.second" : "100", +"mapreduce.job.maxtaskfailures.per.tracker" : "3", +"ipc.client.connection.maxidletime" : "10000", +"mapreduce.shuffle.ssl.enabled" : "false", +"dfs.namenode.invalidate.work.pct.per.iteration" : "0.32f", +"dfs.blockreport.intervalMsec" : "21600000", +"fs.s3.sleepTimeSeconds" : "10", +"dfs.namenode.replication.considerLoad" : "true", +"dfs.client.block.write.retries" : "3", +"hadoop.ssl.server.conf" : "ssl-server.xml", +"dfs.namenode.name.dir.restore" : "false", +"rpc.engine.org.apache.hadoop.mapreduce.v2.api.MRClientProtocolPB" : "org.apache.hadoop.ipc.ProtobufRpcEngine", +"dfs.datanode.hdfs-blocks-metadata.enabled" : "true", +"ha.zookeeper.parent-znode" : "/hadoop-ha", +"io.seqfile.lazydecompress" : "true", +"mapreduce.reduce.merge.inmem.threshold" : "1000", +"mapreduce.input.fileinputformat.split.minsize" : "0", +"dfs.replication" : "3", +"ipc.client.tcpnodelay" : "false", +"dfs.namenode.accesstime.precision" : "3600000", +"s3.stream-buffer-size" : "4096", +"mapreduce.jobtracker.tasktracker.maxblacklists" : "4", +"dfs.client.read.shortcircuit.skip.checksum" : "false", +"mapreduce.job.jvm.numtasks" : "1", +"mapreduce.task.io.sort.mb" : "100", +"io.file.buffer.size" : "65536", +"dfs.namenode.audit.loggers" : "default", +"dfs.namenode.checkpoint.txns" : "1000000", +"yarn.nodemanager.admin-env" : "MALLOC_ARENA_MAX=$MALLOC_ARENA_MAX", +"mapreduce.job.jar" : "/user/jenkins/.staging/job_1369942127770_1207/job.jar", +"mapreduce.job.split.metainfo.maxsize" : "10000000", +"kfs.replication" : "3", +"rpc.engine.org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolPB" : "org.apache.hadoop.ipc.ProtobufRpcEngine", +"yarn.app.mapreduce.am.scheduler.heartbeat.interval-ms" : "1000", +"mapreduce.reduce.maxattempts" : "4", +"kfs.stream-buffer-size" : "4096", +"dfs.ha.tail-edits.period" : "60", +"hadoop.security.authentication" : "simple", +"fs.s3.buffer.dir" : "${hadoop.tmp.dir}/s3", +"rpc.engine.org.apache.hadoop.yarn.api.AMRMProtocolPB" : "org.apache.hadoop.ipc.ProtobufRpcEngine", +"mapreduce.jobtracker.taskscheduler" : "org.apache.hadoop.mapred.JobQueueTaskScheduler", +"yarn.app.mapreduce.am.job.task.listener.thread-count" : "30", +"dfs.namenode.avoid.read.stale.datanode" : "false", +"mapreduce.job.reduces" : "0", +"mapreduce.map.sort.spill.percent" : "0.8", +"dfs.client.file-block-storage-locations.timeout" : "60", +"dfs.datanode.drop.cache.behind.writes" : "false", +"mapreduce.job.end-notification.retry.interval" : "1", +"mapreduce.job.maps" : "96", +"mapreduce.job.speculative.slownodethreshold" : "1.0", +"tfile.fs.input.buffer.size" : "262144", +"mapreduce.map.speculative" : "false", +"dfs.block.access.token.enable" : "false", +"dfs.journalnode.http-address" : "0.0.0.0:8480", +"mapreduce.job.acl-view-job" : " ", +"mapreduce.reduce.shuffle.retry-delay.max.ms" : "60000", +"mapreduce.job.end-notification.max.retry.interval" : "5", +"ftp.blocksize" : "67108864", +"mapreduce.tasktracker.http.threads" : "80", +"mapreduce.reduce.java.opts" : " -Xmx825955249", +"dfs.datanode.data.dir" : "file://${hadoop.tmp.dir}/dfs/data", +"ha.failover-controller.cli-check.rpc-timeout.ms" : "20000", +"dfs.namenode.max.extra.edits.segments.retained" : "10000", +"dfs.https.port" : "20102", +"dfs.namenode.replication.interval" : "3", +"mapreduce.task.skip.start.attempts" : "2", +"dfs.namenode.https-address" : "a2115.smile.com:20102", +"mapreduce.jobtracker.persist.jobstatus.dir" : "/jobtracker/jobsInfo", +"ipc.client.kill.max" : "10", +"dfs.ha.automatic-failover.enabled" : "false", +"mapreduce.jobhistory.keytab" : "/etc/security/keytab/jhs.service.keytab", +"dfs.image.transfer.timeout" : "600000", +"dfs.client.failover.sleep.max.millis" : "15000", +"mapreduce.job.end-notification.max.attempts" : "5", +"mapreduce.task.tmp.dir" : "./tmp", +"dfs.default.chunk.view.size" : "32768", +"kfs.bytes-per-checksum" : "512", +"mapreduce.reduce.memory.mb" : "1024", +"hadoop.http.filter.initializers" : "org.apache.hadoop.yarn.server.webproxy.amfilter.AmFilterInitializer", +"dfs.datanode.failed.volumes.tolerated" : "0", +"hadoop.http.authentication.type" : "simple", +"dfs.datanode.data.dir.perm" : "700", +"yarn.resourcemanager.client.thread-count" : "50", +"ipc.server.listen.queue.size" : "128", +"mapreduce.reduce.skip.maxgroups" : "0", +"file.stream-buffer-size" : "4096", +"dfs.namenode.fs-limits.max-directory-items" : "0", +"io.mapfile.bloom.size" : "1048576", +"yarn.nodemanager.container-executor.class" : "org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor", +"mapreduce.map.maxattempts" : "4", +"mapreduce.jobtracker.jobhistory.block.size" : "3145728", +"yarn.log-aggregation.retain-seconds" : "-1", +"yarn.app.mapreduce.am.job.committer.cancel-timeout" : "60000", +"ftp.replication" : "3", +"mapreduce.jobtracker.http.address" : "0.0.0.0:50030", +"yarn.nodemanager.health-checker.script.timeout-ms" : "1200000", +"mapreduce.jobhistory.address" : "a2115.smile.com:10020", +"mapreduce.jobtracker.taskcache.levels" : "2", +"dfs.datanode.dns.nameserver" : "default", +"mapreduce.application.classpath" : "$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/*,$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/lib/*", +"yarn.nodemanager.log.retain-seconds" : "10800", +"mapred.child.java.opts" : "-Xmx200m", +"dfs.replication.max" : "512", +"map.sort.class" : "org.apache.hadoop.util.QuickSort", +"dfs.stream-buffer-size" : "4096", +"dfs.namenode.backup.address" : "0.0.0.0:50100", +"hadoop.util.hash.type" : "murmur", +"dfs.block.access.key.update.interval" : "600", +"dfs.datanode.dns.interface" : "default", +"dfs.datanode.use.datanode.hostname" : "false", +"mapreduce.job.output.key.class" : "org.apache.hadoop.io.Text", +"dfs.client.read.shortcircuit" : "false", +"dfs.namenode.backup.http-address" : "0.0.0.0:50105", +"yarn.nodemanager.container-monitor.interval-ms" : "3000", +"yarn.nodemanager.disk-health-checker.min-healthy-disks" : "0.25", +"kfs.client-write-packet-size" : "65536", +"ha.zookeeper.acl" : "world:anyone:rwcda", +"yarn.nodemanager.sleep-delay-before-sigkill.ms" : "250", +"mapreduce.job.dir" : "/user/jenkins/.staging/job_1369942127770_1207", +"io.map.index.skip" : "0", +"net.topology.node.switch.mapping.impl" : "org.apache.hadoop.net.ScriptBasedMapping", +"fs.s3.maxRetries" : "4", +"ha.failover-controller.new-active.rpc-timeout.ms" : "60000", +"s3native.client-write-packet-size" : "65536", +"yarn.resourcemanager.amliveliness-monitor.interval-ms" : "1000", +"hadoop.http.staticuser.user" : "dr.who", +"mapreduce.reduce.speculative" : "false", +"mapreduce.client.output.filter" : "FAILED", +"mapreduce.ifile.readahead.bytes" : "4194304", +"mapreduce.tasktracker.report.address" : "127.0.0.1:0", +"mapreduce.task.userlog.limit.kb" : "0", +"mapreduce.tasktracker.map.tasks.maximum" : "2", +"hadoop.http.authentication.simple.anonymous.allowed" : "true", +"hadoop.fuse.timer.period" : "5", +"dfs.namenode.num.extra.edits.retained" : "1000000", +"hadoop.rpc.socket.factory.class.default" : "org.apache.hadoop.net.StandardSocketFactory", +"mapreduce.job.submithostname" : "a2115.smile.com", +"dfs.namenode.handler.count" : "10", +"fs.automatic.close" : "false", +"mapreduce.job.submithostaddress" : "10.20.206.115", +"mapreduce.tasktracker.healthchecker.script.timeout" : "600000", +"dfs.datanode.directoryscan.interval" : "21600", +"yarn.resourcemanager.address" : "a2115.smile.com:8032", +"yarn.nodemanager.health-checker.interval-ms" : "600000", +"dfs.client.file-block-storage-locations.num-threads" : "10", +"yarn.resourcemanager.container-tokens.master-key-rolling-interval-secs" : "86400", +"mapreduce.reduce.markreset.buffer.percent" : "0.0", +"hadoop.security.group.mapping.ldap.directory.search.timeout" : "10000", +"mapreduce.map.log.level" : "INFO", +"dfs.bytes-per-checksum" : "512", +"yarn.nodemanager.localizer.address" : "0.0.0.0:8040", +"dfs.namenode.checkpoint.max-retries" : "3", +"ha.health-monitor.rpc-timeout.ms" : "45000", +"yarn.resourcemanager.keytab" : "/etc/krb5.keytab", +"ftp.stream-buffer-size" : "4096", +"dfs.namenode.avoid.write.stale.datanode" : "false", +"hadoop.security.group.mapping.ldap.search.attr.member" : "member", +"mapreduce.output.fileoutputformat.outputdir" : "hdfs://a2115.smile.com:8020/user/jenkins/tera-gen-1", +"dfs.blockreport.initialDelay" : "0", +"yarn.nm.liveness-monitor.expiry-interval-ms" : "600000", +"hadoop.http.authentication.token.validity" : "36000", +"dfs.namenode.delegation.token.max-lifetime" : "604800000", +"mapreduce.job.hdfs-servers" : "${fs.defaultFS}", +"s3native.replication" : "3", +"yarn.nodemanager.localizer.client.thread-count" : "5", +"dfs.heartbeat.interval" : "3", +"rpc.engine.org.apache.hadoop.ipc.ProtocolMetaInfoPB" : "org.apache.hadoop.ipc.ProtobufRpcEngine", +"dfs.ha.fencing.ssh.connect-timeout" : "30000", +"yarn.resourcemanager.container.liveness-monitor.interval-ms" : "600000", +"yarn.am.liveness-monitor.expiry-interval-ms" : "600000", +"mapreduce.task.profile" : "false", +"mapreduce.tasktracker.http.address" : "0.0.0.0:50060", +"mapreduce.tasktracker.instrumentation" : "org.apache.hadoop.mapred.TaskTrackerMetricsInst", +"mapreduce.jobhistory.webapp.address" : "a2115.smile.com:19888", +"ha.failover-controller.graceful-fence.rpc-timeout.ms" : "5000", +"yarn.ipc.rpc.class" : "org.apache.hadoop.yarn.ipc.HadoopYarnProtoRPC", +"mapreduce.job.name" : "TeraGen", +"kfs.blocksize" : "67108864", +"yarn.resourcemanager.am-rm-tokens.master-key-rolling-interval-secs" : "86400", +"mapreduce.job.ubertask.maxmaps" : "9", +"yarn.scheduler.maximum-allocation-mb" : "8192", +"yarn.nodemanager.heartbeat.interval-ms" : "1000", +"mapreduce.job.userlog.retain.hours" : "24", +"dfs.namenode.secondary.http-address" : "0.0.0.0:50090", +"mapreduce.task.timeout" : "600000", +"mapreduce.framework.name" : "yarn", +"ipc.client.idlethreshold" : "4000", +"ftp.bytes-per-checksum" : "512", +"ipc.server.tcpnodelay" : "false", +"dfs.namenode.stale.datanode.interval" : "30000", +"s3.bytes-per-checksum" : "512", +"mapreduce.job.speculative.slowtaskthreshold" : "1.0", +"yarn.nodemanager.localizer.cache.target-size-mb" : "10240", +"yarn.nodemanager.remote-app-log-dir" : "/tmp/logs", +"fs.s3.block.size" : "67108864", +"mapreduce.job.queuename" : "sls_queue_1", +"dfs.client.failover.connection.retries" : "0", +"hadoop.rpc.protection" : "authentication", +"yarn.scheduler.minimum-allocation-mb" : "1024", +"yarn.app.mapreduce.client-am.ipc.max-retries" : "1", +"hadoop.security.auth_to_local" : "DEFAULT", +"dfs.secondary.namenode.kerberos.internal.spnego.principal" : "${dfs.web.authentication.kerberos.principal}", +"ftp.client-write-packet-size" : "65536", +"fs.defaultFS" : "hdfs://a2115.smile.com:8020", +"yarn.nodemanager.address" : "0.0.0.0:0", +"yarn.scheduler.fair.assignmultiple" : "true", +"yarn.resourcemanager.scheduler.client.thread-count" : "50", +"mapreduce.task.merge.progress.records" : "10000", +"file.client-write-packet-size" : "65536", +"yarn.nodemanager.delete.thread-count" : "4", +"yarn.resourcemanager.scheduler.address" : "a2115.smile.com:8030", +"fs.trash.checkpoint.interval" : "0", +"hadoop.http.authentication.signature.secret.file" : "${user.home}/hadoop-http-auth-signature-secret", +"s3native.stream-buffer-size" : "4096", +"mapreduce.reduce.shuffle.read.timeout" : "180000", +"mapreduce.admin.user.env" : "LD_LIBRARY_PATH=$HADOOP_COMMON_HOME/lib/native", +"yarn.app.mapreduce.am.command-opts" : " -Xmx1238932873", +"dfs.namenode.checkpoint.edits.dir" : "${dfs.namenode.checkpoint.dir}", +"fs.permissions.umask-mode" : "022", +"dfs.client.domain.socket.data.traffic" : "false", +"hadoop.common.configuration.version" : "0.23.0", +"mapreduce.tasktracker.dns.interface" : "default", +"mapreduce.output.fileoutputformat.compress.type" : "BLOCK", +"mapreduce.ifile.readahead" : "true", +"hadoop.security.group.mapping.ldap.ssl" : "false", +"io.serializations" : "org.apache.hadoop.io.serializer.WritableSerialization,org.apache.hadoop.io.serializer.avro.AvroSpecificSerialization,org.apache.hadoop.io.serializer.avro.AvroReflectSerialization", +"yarn.nodemanager.aux-services.mapreduce.shuffle.class" : "org.apache.hadoop.mapred.ShuffleHandler", +"fs.df.interval" : "60000", +"mapreduce.reduce.shuffle.input.buffer.percent" : "0.70", +"io.seqfile.compress.blocksize" : "1000000", +"hadoop.security.groups.cache.secs" : "300", +"ipc.client.connect.max.retries" : "10", +"dfs.namenode.delegation.key.update-interval" : "86400000", +"yarn.nodemanager.process-kill-wait.ms" : "2000", +"yarn.application.classpath" : "$HADOOP_CONF_DIR,$HADOOP_COMMON_HOME/*,$HADOOP_COMMON_HOME/lib/*,$HADOOP_HDFS_HOME/*,$HADOOP_HDFS_HOME/lib/*,$HADOOP_MAPRED_HOME/*,$HADOOP_MAPRED_HOME/lib/*,$YARN_HOME/*,$YARN_HOME/lib/*", +"yarn.app.mapreduce.client.max-retries" : "3", +"dfs.datanode.available-space-volume-choosing-policy.balanced-space-preference-fraction" : "0.75f", +"yarn.nodemanager.log-aggregation.compression-type" : "none", +"hadoop.security.group.mapping.ldap.search.filter.user" : "(&(objectClass=user)(sAMAccountName={0}))", +"yarn.nodemanager.localizer.cache.cleanup.interval-ms" : "600000", +"dfs.image.compress" : "false", +"mapred.mapper.new-api" : "true", +"yarn.nodemanager.log-dirs" : "${yarn.log.dir}/userlogs", +"dfs.namenode.kerberos.internal.spnego.principal" : "${dfs.web.authentication.kerberos.principal}", +"fs.s3n.block.size" : "67108864", +"fs.ftp.host" : "0.0.0.0", +"hadoop.security.group.mapping" : "org.apache.hadoop.security.JniBasedUnixGroupsMappingWithFallback", +"dfs.datanode.address" : "0.0.0.0:50010", +"mapreduce.map.skip.maxrecords" : "0", +"dfs.datanode.https.address" : "0.0.0.0:50475", +"file.replication" : "1", +"yarn.resourcemanager.resource-tracker.address" : "a2115.smile.com:8031", +"dfs.datanode.drop.cache.behind.reads" : "false", +"hadoop.fuse.connection.timeout" : "300", +"hadoop.work.around.non.threadsafe.getpwuid" : "false", +"mapreduce.jobtracker.restart.recover" : "false", +"hadoop.tmp.dir" : "/tmp/hadoop-${user.name}", +"mapreduce.output.fileoutputformat.compress" : "false", +"mapreduce.tasktracker.indexcache.mb" : "10", +"mapreduce.client.genericoptionsparser.used" : "true", +"dfs.client.block.write.replace-datanode-on-failure.policy" : "DEFAULT", +"mapreduce.job.committer.setup.cleanup.needed" : "true", +"hadoop.kerberos.kinit.command" : "kinit", +"dfs.datanode.du.reserved" : "0", +"dfs.namenode.fs-limits.max-blocks-per-file" : "1048576", +"file.bytes-per-checksum" : "512", +"mapreduce.task.profile.reduces" : "0-2", +"mapreduce.jobtracker.handler.count" : "10", +"dfs.client.block.write.replace-datanode-on-failure.enable" : "true", +"mapreduce.job.output.value.class" : "org.apache.hadoop.io.Text", +"yarn.dispatcher.exit-on-error" : "true", +"net.topology.script.number.args" : "100", +"mapreduce.task.profile.maps" : "0-2", +"dfs.namenode.decommission.interval" : "30", +"dfs.image.compression.codec" : "org.apache.hadoop.io.compress.DefaultCodec", +"yarn.resourcemanager.webapp.address" : "a2115.smile.com:8088", +"mapreduce.jobtracker.system.dir" : "${hadoop.tmp.dir}/mapred/system", +"hadoop.ssl.hostname.verifier" : "DEFAULT", +"yarn.nodemanager.vmem-pmem-ratio" : "2.1", +"dfs.namenode.support.allow.format" : "true", +"mapreduce.jobhistory.principal" : "jhs/_HOST@REALM.TLD", +"io.mapfile.bloom.error.rate" : "0.005", +"mapreduce.shuffle.ssl.file.buffer.size" : "65536", +"dfs.permissions.superusergroup" : "supergroup", +"dfs.datanode.available-space-volume-choosing-policy.balanced-space-threshold" : "10737418240", +"mapreduce.jobtracker.expire.trackers.interval" : "600000", +"mapreduce.cluster.acls.enabled" : "false", +"yarn.nodemanager.remote-app-log-dir-suffix" : "logs", +"ha.failover-controller.graceful-fence.connection.retries" : "1", +"ha.health-monitor.connect-retry-interval.ms" : "1000", +"mapreduce.reduce.shuffle.merge.percent" : "0.66", +"yarn.app.mapreduce.am.resource.mb" : "1536", +"io.seqfile.local.dir" : "${hadoop.tmp.dir}/io/local", +"dfs.namenode.checkpoint.check.period" : "60", +"yarn.resourcemanager.nm.liveness-monitor.interval-ms" : "1000", +"mapreduce.jobtracker.maxtasks.perjob" : "-1", +"mapreduce.jobtracker.jobhistory.lru.cache.size" : "5", +"file.blocksize" : "67108864", +"tfile.io.chunk.size" : "1048576", +"mapreduce.job.acl-modify-job" : " ", +"yarn.nodemanager.webapp.address" : "0.0.0.0:8042", +"mapreduce.tasktracker.reduce.tasks.maximum" : "2", +"io.skip.checksum.errors" : "false", +"mapreduce.cluster.temp.dir" : "${hadoop.tmp.dir}/mapred/temp", +"yarn.app.mapreduce.am.staging-dir" : "/user", +"dfs.namenode.edits.journal-plugin.qjournal" : "org.apache.hadoop.hdfs.qjournal.client.QuorumJournalManager", +"dfs.datanode.handler.count" : "10", +"fs.ftp.host.port" : "21", +"dfs.namenode.decommission.nodes.per.interval" : "5", +"yarn.resourcemanager.admin.client.thread-count" : "1", +"dfs.namenode.fs-limits.max-component-length" : "0", +"dfs.namenode.checkpoint.period" : "3600", +"fs.AbstractFileSystem.viewfs.impl" : "org.apache.hadoop.fs.viewfs.ViewFs", +"yarn.resourcemanager.resource-tracker.client.thread-count" : "50", +"mapreduce.tasktracker.dns.nameserver" : "default", +"mapreduce.map.output.compress" : "true", +"dfs.datanode.ipc.address" : "0.0.0.0:50020", +"hadoop.ssl.require.client.cert" : "false", +"yarn.nodemanager.delete.debug-delay-sec" : "0", +"dfs.datanode.max.transfer.threads" : "4096" +}, +"computonsPerMapInputByte" : -1, +"computonsPerMapOutputByte" : -1, +"computonsPerReduceInputByte" : -1, +"computonsPerReduceOutputByte" : -1, +"heapMegabytes" : 200, +"outcome" : "SUCCESS", +"jobtype" : "JAVA", +"directDependantJobs" : [ ], +"successfulMapAttemptCDFs" : [ { +"maximum" : 9223372036854775807, +"minimum" : -9223372036854775808, +"rankings" : [ ], +"numberValues" : 0 +}, { +"maximum" : 9223372036854775807, +"minimum" : -9223372036854775808, +"rankings" : [ ], +"numberValues" : 0 +}, { +"maximum" : 9223372036854775807, +"minimum" : -9223372036854775808, +"rankings" : [ ], +"numberValues" : 0 +}, { +"maximum" : 47021, +"minimum" : 11143, +"rankings" : [ { +"datum" : 13354, +"relativeRanking" : 0.05 +}, { +"datum" : 14101, +"relativeRanking" : 0.1 +}, { +"datum" : 15609, +"relativeRanking" : 0.15 +}, { +"datum" : 15919, +"relativeRanking" : 0.2 +}, { +"datum" : 17003, +"relativeRanking" : 0.25 +}, { +"datum" : 17109, +"relativeRanking" : 0.3 +}, { +"datum" : 18342, +"relativeRanking" : 0.35 +}, { +"datum" : 18870, +"relativeRanking" : 0.4 +}, { +"datum" : 19127, +"relativeRanking" : 0.45 +}, { +"datum" : 19221, +"relativeRanking" : 0.5 +}, { +"datum" : 19481, +"relativeRanking" : 0.55 +}, { +"datum" : 19896, +"relativeRanking" : 0.6 +}, { +"datum" : 20585, +"relativeRanking" : 0.65 +}, { +"datum" : 20784, +"relativeRanking" : 0.7 +}, { +"datum" : 21452, +"relativeRanking" : 0.75 +}, { +"datum" : 21853, +"relativeRanking" : 0.8 +}, { +"datum" : 22436, +"relativeRanking" : 0.85 +}, { +"datum" : 32646, +"relativeRanking" : 0.9 +}, { +"datum" : 41553, +"relativeRanking" : 0.95 +} ], +"numberValues" : 96 +} ], +"failedMapAttemptCDFs" : [ { +"maximum" : 9223372036854775807, +"minimum" : -9223372036854775808, +"rankings" : [ ], +"numberValues" : 0 +}, { +"maximum" : 9223372036854775807, +"minimum" : -9223372036854775808, +"rankings" : [ ], +"numberValues" : 0 +}, { +"maximum" : 9223372036854775807, +"minimum" : -9223372036854775808, +"rankings" : [ ], +"numberValues" : 0 +}, { +"maximum" : 9223372036854775807, +"minimum" : -9223372036854775808, +"rankings" : [ ], +"numberValues" : 0 +} ], +"successfulReduceAttemptCDF" : { +"maximum" : 9223372036854775807, +"minimum" : -9223372036854775808, +"rankings" : [ ], +"numberValues" : 0 +}, +"failedReduceAttemptCDF" : { +"maximum" : 9223372036854775807, +"minimum" : -9223372036854775808, +"rankings" : [ ], +"numberValues" : 0 +}, +"mapperTriesToSucceed" : [ 1.0 ], +"failedMapperFraction" : 0.0, +"relativeTime" : 0, +"clusterMapMB" : -1, +"clusterReduceMB" : -1, +"jobMapMB" : 200, +"jobReduceMB" : 200 } diff --git a/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/SLSRunner.java b/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/SLSRunner.java index b36edc9412..c79233585c 100644 --- a/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/SLSRunner.java +++ b/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/SLSRunner.java @@ -389,6 +389,9 @@ private void startAMFromRumenTraces(Resource containerResource, new ArrayList(); // map tasks for(LoggedTask mapTask : job.getMapTasks()) { + if (mapTask.getAttempts().size() == 0) { + continue; + } LoggedTaskAttempt taskAttempt = mapTask.getAttempts() .get(mapTask.getAttempts().size() - 1); String hostname = taskAttempt.getHostName().getValue(); @@ -400,6 +403,9 @@ private void startAMFromRumenTraces(Resource containerResource, // reduce tasks for(LoggedTask reduceTask : job.getReduceTasks()) { + if (reduceTask.getAttempts().size() == 0) { + continue; + } LoggedTaskAttempt taskAttempt = reduceTask.getAttempts() .get(reduceTask.getAttempts().size() - 1); String hostname = taskAttempt.getHostName().getValue(); diff --git a/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/utils/SLSUtils.java b/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/utils/SLSUtils.java index d950aeeb92..f1b4f07802 100644 --- a/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/utils/SLSUtils.java +++ b/hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/utils/SLSUtils.java @@ -72,11 +72,17 @@ public static Set parseNodesFromRumenTrace(String jobTrace) while ((job = reader.getNext()) != null) { for(LoggedTask mapTask : job.getMapTasks()) { // select the last attempt + if (mapTask.getAttempts().size() == 0) { + continue; + } LoggedTaskAttempt taskAttempt = mapTask.getAttempts() .get(mapTask.getAttempts().size() - 1); nodeSet.add(taskAttempt.getHostName().getValue()); } for(LoggedTask reduceTask : job.getReduceTasks()) { + if (reduceTask.getAttempts().size() == 0) { + continue; + } LoggedTaskAttempt taskAttempt = reduceTask.getAttempts() .get(reduceTask.getAttempts().size() - 1); nodeSet.add(taskAttempt.getHostName().getValue()); diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index c2f16d524d..435eb68b02 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -159,6 +159,9 @@ Release 2.9.0 - UNRELEASED YARN-4613. Fix test failure in TestClientRMService#testGetClusterNodes. (Takashi Ohnishi via rohithsharmaks) + YARN-4612. Fix rumen and scheduler load simulator handle killed tasks properly. + (Ming Ma via xgong) + Release 2.8.0 - UNRELEASED INCOMPATIBLE CHANGES