From 108da85320d65e37fe835de65866b818e5420587 Mon Sep 17 00:00:00 2001
From: Eric Yang
Date: Tue, 12 Jun 2018 20:40:32 -0400
Subject: [PATCH] HADOOP-15527. Improve delay check for stopping processes. Contributed by Vinod Kumar Vavilapalli

---
 .../src/main/bin/hadoop-functions.sh | 34 ++++++++++++++++++-
 .../src/test/scripts/hadoop_stop_daemon.bats | 24 ++++++++++++-
 2 files changed, 56 insertions(+), 2 deletions(-)

diff --git a/hadoop-common-project/hadoop-common/src/main/bin/hadoop-functions.sh b/hadoop-common-project/hadoop-common/src/main/bin/hadoop-functions.sh
index bee1430523..cbedd97218 100755
--- a/hadoop-common-project/hadoop-common/src/main/bin/hadoop-functions.sh
+++ b/hadoop-common-project/hadoop-common/src/main/bin/hadoop-functions.sh
@@ -2040,6 +2040,35 @@ function hadoop_start_secure_daemon_wrapper
   return 0
 }
 
+## @description Wait till process dies or till timeout
+## @audience private
+## @stability evolving
+## @param pid
+## @param timeout
+function wait_process_to_die_or_timeout
+{
+  local pid=$1
+  local timeout=$2
+
+  # Normalize timeout
+  # Round up or down
+  timeout=$(printf "%.0f\n" "${timeout}")
+  if [[ ${timeout} -lt 1 ]]; then
+    # minimum 1 second
+    timeout=1
+  fi
+
+  # Wait to see if it's still alive
+  for (( i=0; i < "${timeout}"; i++ ))
+  do
+    if kill -0 "${pid}" > /dev/null 2>&1; then
+      sleep 1
+    else
+      break
+    fi
+  done
+}
+
 ## @description Stop the non-privileged `command` daemon with that
 ## @description that is running at `pidfile`.
 ## @audience public
@@ -2060,11 +2089,14 @@ function hadoop_stop_daemon
     pid=$(cat "$pidfile")
 
     kill "${pid}" >/dev/null 2>&1
-    sleep "${HADOOP_STOP_TIMEOUT}"
+
+    wait_process_to_die_or_timeout "${pid}" "${HADOOP_STOP_TIMEOUT}"
+
     if kill -0 "${pid}" > /dev/null 2>&1; then
       hadoop_error "WARNING: ${cmd} did not stop gracefully after ${HADOOP_STOP_TIMEOUT} seconds: Trying to kill with kill -9"
       kill -9 "${pid}" >/dev/null 2>&1
     fi
+    wait_process_to_die_or_timeout "${pid}" "${HADOOP_STOP_TIMEOUT}"
     if ps -p "${pid}" > /dev/null 2>&1; then
       hadoop_error "ERROR: Unable to kill ${pid}"
     else
diff --git a/hadoop-common-project/hadoop-common/src/test/scripts/hadoop_stop_daemon.bats b/hadoop-common-project/hadoop-common/src/test/scripts/hadoop_stop_daemon.bats
index 023d01c02c..148380706d 100644
--- a/hadoop-common-project/hadoop-common/src/test/scripts/hadoop_stop_daemon.bats
+++ b/hadoop-common-project/hadoop-common/src/test/scripts/hadoop_stop_daemon.bats
@@ -15,7 +15,7 @@
 
 load hadoop-functions_test_helper
 
-@test "hadoop_stop_daemon" {
+@test "hadoop_stop_daemon_changing_pid" {
   old_pid=12345
   new_pid=54321
   HADOOP_STOP_TIMEOUT=3
@@ -29,3 +29,25 @@ load hadoop-functions_test_helper
   [ -f pidfile ]
   [ "$(cat pidfile)" = "${new_pid}" ]
 }
+
+@test "hadoop_stop_daemon_force_kill" {
+
+  HADOOP_STOP_TIMEOUT=4
+
+  # Run the following in a sub-shell so that its termination doesn't affect the test
+  (sh ${TESTBINDIR}/process_with_sigterm_trap.sh ${TMP}/pidfile &)
+
+  # Wait for the process to go into tight loop
+  sleep 1
+
+  [ -f ${TMP}/pidfile ]
+  pid=$(cat "${TMP}/pidfile")
+
+  run hadoop_stop_daemon my_command ${TMP}/pidfile 2>&1
+
+  # The process should no longer be alive
+  ! kill -0 ${pid} > /dev/null 2>&1
+
+  # The PID file should be gone
+  [ ! -f ${TMP}/pidfile ]
+}