Skip to content
This repository has been archived by the owner on Jun 6, 2024. It is now read-only.

Commit

Permalink
Distinguish cgroup OOM from dmesg.
Browse files Browse the repository at this point in the history
  • Loading branch information
Gerhut committed Dec 3, 2018
1 parent 1e2202c commit a812230
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 2 deletions.
3 changes: 2 additions & 1 deletion src/rest-server/src/models/job.js
Original file line number Diff line number Diff line change
Expand Up @@ -423,7 +423,8 @@ class Job {
'frameworkInfoWebhdfsUri': launcherConfig.frameworkInfoWebhdfsPath(data.jobName),
'taskData': data.taskRoles[idx],
'jobData': data,
'inspectFormat': '{{.State.Pid}}',
'inspectPidFormat': '{{.State.Pid}}',
'inspectOOMKilledFormat': '{{.State.OOMKilled}}',
'jobEnvs': jobEnvs,
});
return yarnContainerScript;
Expand Down
32 changes: 31 additions & 1 deletion src/rest-server/src/templates/yarnContainerScript.mustache
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,34 @@ function debug_log()
function exit_handler()
{
rc=$?
echo "Exited with $rc"
local handler="Yarn container exit handler"
local if_cgroup_oom=""
debug_log "$handler" "EXIT signal received in yarn container, performing clean up action..."

debug_log "$handler" "trying to kill docker container $docker_name"
docker logs $docker_name
docker inspect $docker_name
pid=$(docker inspect --format={{{ inspectPidFormat }}} $docker_name 2>/dev/null)
if [ $pid -gt 0 ]; then
kill -9 $pid &&\
debug_log "$handler" "docker caontainer $docker_name killed successfully." ||\
debug_log "$handler" "tries to kill the container $docker_name but failed. Maybe it has already exited."
else
debug_log "$handler" "docker container $docker_name has already exited"
is_oom=$(docker inspect --format={{{ inspectOOMKilledFormat }}} $docker_name 2>/dev/null)
debug_log "$handler" "docker container $docker_name is exited by OOM? $is_oom"
if [ "$is_oom" = "true" ]; then
is_cgroup_oom=$(dmesg | grep "Memory cgroup out of memory: Kill $docker_pid")
debug_log "$handler" "docker container $docker_name is exited by cgroup OOM? $is_cgroup_oom"
if [ -n "$is_cgroup_oom" ]; then
rc=206
fi
fi

docker container rm $docker_name
pkill --parent $$

debug_log "$handler" "write exit code to file"
debug_log "$handler" "yarn container exit code: $rc"
debug_log "$handler" "exit code file path: /var/lib/hadoopdata/nm-local-dir/nmPrivate/$APP_ID/$CONTAINER_ID/$CONTAINER_ID.pid.exitcode"
Expand Down Expand Up @@ -263,7 +288,7 @@ docker pull {{{ jobData.image }}} \
## network consumption
docker run --name $docker_name \
--init \
--rm \
--detach \
--tty \
--privileged=false \
--oom-score-adj=1000 \
Expand Down Expand Up @@ -294,3 +319,8 @@ docker run --name $docker_name \
{{{ jobData.image }}} \
/bin/bash '/pai/bootstrap/docker_bootstrap.sh'

# Capture the PID that `docker inspect` reports for the container's main
# process, so it can be logged and referenced later in this script.
# NOTE(review): exit_handler appears to grep dmesg for "$docker_pid" to
# detect a cgroup OOM kill -- keep this variable name in sync with it.
docker_pid=$(docker inspect --format "{{{ inspectPidFormat }}}" "$docker_name")

echo "Docker container pid is $docker_pid"

# Attach to the container's output only (no stdin) so this script blocks
# until the container exits.
# NOTE(review): presumably this must stay the last command so that its exit
# status is what exit_handler reads via $? -- confirm against the trap setup.
docker attach --no-stdin "$docker_name"

0 comments on commit a812230

Please sign in to comment.