From d1f25e5abe03ecb16731e3c3fa16a102c0a80128 Mon Sep 17 00:00:00 2001 From: Qixiang Cheng Date: Mon, 20 Aug 2018 16:16:59 +0800 Subject: [PATCH 1/3] Distinguish cgroup OOM from dmesg. --- src/rest-server/src/models/job.js | 3 +- .../templates/yarnContainerScript.mustache | 39 +++++++++++++++++-- 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/src/rest-server/src/models/job.js b/src/rest-server/src/models/job.js index e27617ba12..0f83b1b0ce 100644 --- a/src/rest-server/src/models/job.js +++ b/src/rest-server/src/models/job.js @@ -425,7 +425,8 @@ class Job { 'frameworkInfoWebhdfsUri': launcherConfig.frameworkInfoWebhdfsPath(data.jobName), 'taskData': data.taskRoles[idx], 'jobData': data, - 'inspectFormat': '{{.State.Pid}}', + 'inspectPidFormat': '{{.State.Pid}}', + 'inspectOOMKilledFormat': '{{.State.OOMKilled}}', 'jobEnvs': jobEnvs, 'azRDMA': azureEnv.azRDMA === 'false' ? false : true, 'reqAzRDMA': data.jobEnvs && data.jobEnvs.paiAzRDMA === true ? true : false, diff --git a/src/rest-server/src/templates/yarnContainerScript.mustache b/src/rest-server/src/templates/yarnContainerScript.mustache index 3fc6cc33b9..23f2b790c8 100644 --- a/src/rest-server/src/templates/yarnContainerScript.mustache +++ b/src/rest-server/src/templates/yarnContainerScript.mustache @@ -30,9 +30,37 @@ BASH_XTRACEFD=13 function exit_handler() { rc=$? - printf "[DEBUG] EXIT signal received in yarn container, performing clean up action...\n" + echo "Exited with $rc" + local handler="Yarn container exit handler" + local if_cgroup_oom="" + debug_log "$handler" "EXIT signal received in yarn container, performing clean up action..." + + debug_log "$handler" "trying to kill docker container $docker_name" + docker logs $docker_name + docker inspect $docker_name + pid=$(docker inspect --format={{{ inspectPidFormat }}} $docker_name 2>/dev/null) + if [ $pid -gt 0 ]; then + kill -9 $pid &&\ + debug_log "$handler" "docker caontainer $docker_name killed successfully." ||\ + debug_log "$handler" "tries to kill the container $docker_name but failed. Maybe it has already exited." + else + debug_log "$handler" "docker container $docker_name has already exited" + is_oom=$(docker inspect --format={{{ inspectOOMKilledFormat }}} $docker_name 2>/dev/null) + debug_log "$handler" "docker container $docker_name is exited by OOM? $is_oom" + if [ "$is_oom" = "true" ]; then + is_cgroup_oom=$(dmesg | grep "Memory cgroup out of memory: Kill $docker_pid") + debug_log "$handler" "docker container $docker_name is exited by cgroup OOM? $is_cgroup_oom" + if [ -n "$is_cgroup_oom" ]; then + rc=206 + fi + fi - printf "[DEBUG] Write exit code $rc to file /var/lib/hadoopdata/nm-local-dir/nmPrivate/$APP_ID/$CONTAINER_ID/$CONTAINER_ID.pid.exitcode.\n" + docker container rm $docker_name + pkill --parent $$ + + debug_log "$handler" "write exit code to file" + debug_log "$handler" "yarn container exit code: $rc" + debug_log "$handler" "exit code file path: /var/lib/hadoopdata/nm-local-dir/nmPrivate/$APP_ID/$CONTAINER_ID/$CONTAINER_ID.pid.exitcode" echo $rc > "/var/lib/hadoopdata/nm-local-dir/nmPrivate/$APP_ID/$CONTAINER_ID/$CONTAINER_ID.pid.exitcode" exit $rc @@ -266,7 +294,7 @@ docker pull {{ jobData.image }} \ ## network consumption docker run --name $docker_name \ --init \ - --rm \ + --detach \ --tty \ --privileged=false \ --oom-score-adj=1000 \ @@ -306,3 +334,8 @@ docker run --name $docker_name \ {{ jobData.image }} \ /bin/bash '/pai/bootstrap/docker_bootstrap.sh' +docker_pid=$(docker inspect --format "{{{ inspectPidFormat }}}" $docker_name) + +echo "Docker container pid is $docker_pid" + +docker attach --no-stdin $docker_name From c9141cb7af09dffe2bec45bf63a5b31ed6d2d7a7 Mon Sep 17 00:00:00 2001 From: Qixiang Cheng Date: Mon, 25 Feb 2019 14:28:17 +0800 Subject: [PATCH 2/3] Remove cgroup OOM detection Make all OOM cause exiting by 5 --- src/rest-server/src/templates/yarnContainerScript.mustache | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/rest-server/src/templates/yarnContainerScript.mustache b/src/rest-server/src/templates/yarnContainerScript.mustache index 23f2b790c8..06c8cb1454 100644 --- a/src/rest-server/src/templates/yarnContainerScript.mustache +++ b/src/rest-server/src/templates/yarnContainerScript.mustache @@ -32,7 +32,6 @@ function exit_handler() rc=$? echo "Exited with $rc" local handler="Yarn container exit handler" - local if_cgroup_oom="" debug_log "$handler" "EXIT signal received in yarn container, performing clean up action..." debug_log "$handler" "trying to kill docker container $docker_name" @@ -48,10 +47,7 @@ function exit_handler() is_oom=$(docker inspect --format={{{ inspectOOMKilledFormat }}} $docker_name 2>/dev/null) debug_log "$handler" "docker container $docker_name is exited by OOM? $is_oom" if [ "$is_oom" = "true" ]; then - is_cgroup_oom=$(dmesg | grep "Memory cgroup out of memory: Kill $docker_pid") - debug_log "$handler" "docker container $docker_name is exited by cgroup OOM? $is_cgroup_oom" - if [ -n "$is_cgroup_oom" ]; then - rc=206 + rc=5 fi fi From 1d67e9b59c8d446584ed7c24d73b309fc8a1d4e6 Mon Sep 17 00:00:00 2001 From: George Cheng Date: Mon, 25 Feb 2019 16:38:09 +0800 Subject: [PATCH 3/3] Exit 55 when OOM --- src/rest-server/src/templates/yarnContainerScript.mustache | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rest-server/src/templates/yarnContainerScript.mustache b/src/rest-server/src/templates/yarnContainerScript.mustache index 06c8cb1454..2db339f1aa 100644 --- a/src/rest-server/src/templates/yarnContainerScript.mustache +++ b/src/rest-server/src/templates/yarnContainerScript.mustache @@ -47,7 +47,7 @@ function exit_handler() is_oom=$(docker inspect --format={{{ inspectOOMKilledFormat }}} $docker_name 2>/dev/null) debug_log "$handler" "docker container $docker_name is exited by OOM? $is_oom" if [ "$is_oom" = "true" ]; then - rc=5 + rc=55 fi fi