YARN-10739. GenericEventHandler.printEventQueueDetails causes RM recovery to take too much time. Contributed by Qi Zhu.

This commit is contained in:
Peter Bacsko 2021-04-27 14:01:53 +02:00
parent f54e7646cf
commit a967ab06f2
2 changed files with 17 additions and 3 deletions

View File

@ -25,7 +25,11 @@
import java.util.Map; import java.util.Map;
import java.util.concurrent.BlockingQueue; import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import org.apache.hadoop.thirdparty.com.google.common.util.concurrent.ThreadFactoryBuilder;
import org.apache.hadoop.yarn.metrics.EventTypeMetrics; import org.apache.hadoop.yarn.metrics.EventTypeMetrics;
import org.apache.hadoop.yarn.util.Clock; import org.apache.hadoop.yarn.util.Clock;
import org.apache.hadoop.yarn.util.MonotonicClock; import org.apache.hadoop.yarn.util.MonotonicClock;
@ -93,6 +97,8 @@ public class AsyncDispatcher extends AbstractService implements Dispatcher {
private Clock clock = new MonotonicClock(); private Clock clock = new MonotonicClock();
private ThreadPoolExecutor printEventDetailsExecutor;
/** /**
* The thread name for dispatcher. * The thread name for dispatcher.
*/ */
@ -179,6 +185,15 @@ protected void serviceInit(Configuration conf) throws Exception{
YARN_DISPATCHER_PRINT_EVENTS_INFO_THRESHOLD, YARN_DISPATCHER_PRINT_EVENTS_INFO_THRESHOLD,
YarnConfiguration. YarnConfiguration.
DEFAULT_YARN_DISPATCHER_PRINT_EVENTS_INFO_THRESHOLD); DEFAULT_YARN_DISPATCHER_PRINT_EVENTS_INFO_THRESHOLD);
ThreadFactory threadFactory = new ThreadFactoryBuilder()
.setNameFormat("PrintEventDetailsThread #%d")
.build();
// Thread pool that prints event queue details asynchronously,
// so that handle() does not block the RM while they are logged.
printEventDetailsExecutor = new ThreadPoolExecutor(
1, 5, 10, TimeUnit.SECONDS,
new LinkedBlockingQueue<>(), threadFactory);
} }
@Override @Override
@ -222,6 +237,7 @@ protected void serviceStop() throws Exception {
LOG.warn("Interrupted Exception while stopping", ie); LOG.warn("Interrupted Exception while stopping", ie);
} }
} }
printEventDetailsExecutor.shutdownNow();
// stop all the components // stop all the components
super.serviceStop(); super.serviceStop();
@ -319,7 +335,7 @@ public void handle(Event event) {
if (qSize != 0 && qSize % detailsInterval == 0 if (qSize != 0 && qSize % detailsInterval == 0
&& lastEventDetailsQueueSizeLogged != qSize) { && lastEventDetailsQueueSizeLogged != qSize) {
lastEventDetailsQueueSizeLogged = qSize; lastEventDetailsQueueSizeLogged = qSize;
printEventQueueDetails(); printEventDetailsExecutor.submit(this::printEventQueueDetails);
printTrigger = true; printTrigger = true;
} }
int remCapacity = eventQueue.remainingCapacity(); int remCapacity = eventQueue.remainingCapacity();

View File

@ -187,8 +187,6 @@ public void testPrintDispatcherEventDetails() throws Exception {
when(event.getType()).thenReturn(TestEnum.TestEventType); when(event.getType()).thenReturn(TestEnum.TestEventType);
dispatcher.getEventHandler().handle(event); dispatcher.getEventHandler().handle(event);
} }
verify(log, atLeastOnce()).info("Event type: TestEventType, " +
"Event record counter: 5000");
Thread.sleep(2000); Thread.sleep(2000);
//Make sure more than one event to take //Make sure more than one event to take
verify(log, atLeastOnce()). verify(log, atLeastOnce()).