diff --git a/src/rest-server/src/models/job.js b/src/rest-server/src/models/job.js
index 8786f1f4d0..33027d90b0 100644
--- a/src/rest-server/src/models/job.js
+++ b/src/rest-server/src/models/job.js
@@ -423,7 +423,8 @@ class Job {
         'frameworkInfoWebhdfsUri': launcherConfig.frameworkInfoWebhdfsPath(data.jobName),
         'taskData': data.taskRoles[idx],
         'jobData': data,
-        'inspectFormat': '{{.State.Pid}}',
+        'inspectPidFormat': '{{.State.Pid}}',
+        'inspectOOMKilledFormat': '{{.State.OOMKilled}}',
         'jobEnvs': jobEnvs,
       });
     return yarnContainerScript;
diff --git a/src/rest-server/src/templates/yarnContainerScript.mustache b/src/rest-server/src/templates/yarnContainerScript.mustache
index dc3f1e59ec..0f6be1c7d9 100644
--- a/src/rest-server/src/templates/yarnContainerScript.mustache
+++ b/src/rest-server/src/templates/yarnContainerScript.mustache
@@ -38,9 +38,38 @@ function debug_log()
 function exit_handler()
 {
   rc=$?
+  echo "Exited with $rc"
   local handler="Yarn container exit handler"
+  local is_cgroup_oom=""
   debug_log "$handler" "EXIT signal received in yarn container, performing clean up action..."
 
+  debug_log "$handler" "trying to kill docker container $docker_name"
+  docker logs $docker_name
+  docker inspect $docker_name
+  pid=$(docker inspect --format={{{ inspectPidFormat }}} $docker_name 2>/dev/null)
+  # A non-positive pid is treated as "already exited"; default an empty result
+  # (inspect failed, stderr discarded) to 0 so the numeric test cannot error out.
+  if [ "${pid:-0}" -gt 0 ]; then
+    kill -9 $pid &&\
+      debug_log "$handler" "docker caontainer $docker_name killed successfully." ||\
+      debug_log "$handler" "tries to kill the container $docker_name but failed. Maybe it has already exited."
+  else
+    debug_log "$handler" "docker container $docker_name has already exited"
+    is_oom=$(docker inspect --format={{{ inspectOOMKilledFormat }}} $docker_name 2>/dev/null)
+    debug_log "$handler" "docker container $docker_name is exited by OOM? $is_oom"
+    if [ "$is_oom" = "true" ]; then
+      # NOTE(review): this pattern must match the host kernel's OOM log wording - TODO confirm.
+      is_cgroup_oom=$(dmesg | grep "Memory cgroup out of memory: Kill $docker_pid")
+      debug_log "$handler" "docker container $docker_name is exited by cgroup OOM? $is_cgroup_oom"
+      if [ -n "$is_cgroup_oom" ]; then
+        rc=206
+      fi
+    fi
+  fi
+
+  docker container rm $docker_name
+  pkill --parent $$
+
   debug_log "$handler" "write exit code to file"
   debug_log "$handler" "yarn container exit code: $rc"
   debug_log "$handler" "exit code file path: /var/lib/hadoopdata/nm-local-dir/nmPrivate/$APP_ID/$CONTAINER_ID/$CONTAINER_ID.pid.exitcode"
@@ -263,7 +292,7 @@ docker pull {{{ jobData.image }}} \
 ## network consumption
 docker run --name $docker_name \
   --init \
-  --rm \
+  --detach \
   --tty \
   --privileged=false \
   --oom-score-adj=1000 \
@@ -294,3 +323,8 @@ docker run --name $docker_name \
   {{{ jobData.image }}} \
   /bin/bash '/pai/bootstrap/docker_bootstrap.sh'
 
+docker_pid=$(docker inspect --format "{{{ inspectPidFormat }}}" $docker_name)
+
+echo "Docker container pid is $docker_pid"
+
+docker attach --no-stdin $docker_name