Skip to content
This repository has been archived by the owner on Jun 6, 2024. It is now read-only.

Fix auto retries when out of memory. #1108

Merged
merged 3 commits into from
Mar 1, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion src/rest-server/src/models/job.js
Original file line number Diff line number Diff line change
Expand Up @@ -425,7 +425,8 @@ class Job {
'frameworkInfoWebhdfsUri': launcherConfig.frameworkInfoWebhdfsPath(data.jobName),
'taskData': data.taskRoles[idx],
'jobData': data,
'inspectFormat': '{{.State.Pid}}',
'inspectPidFormat': '{{.State.Pid}}',
'inspectOOMKilledFormat': '{{.State.OOMKilled}}',
'jobEnvs': jobEnvs,
'azRDMA': azureEnv.azRDMA === 'false' ? false : true,
'reqAzRDMA': data.jobEnvs && data.jobEnvs.paiAzRDMA === true ? true : false,
Expand Down
35 changes: 32 additions & 3 deletions src/rest-server/src/templates/yarnContainerScript.mustache
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,33 @@ BASH_XTRACEFD=13
function exit_handler()
{
rc=$?
printf "[DEBUG] EXIT signal received in yarn container, performing clean up action...\n"
echo "Exited with $rc"
local handler="Yarn container exit handler"
debug_log "$handler" "EXIT signal received in yarn container, performing clean up action..."

debug_log "$handler" "trying to kill docker container $docker_name"
docker logs $docker_name
docker inspect $docker_name
pid=$(docker inspect --format={{{ inspectPidFormat }}} $docker_name 2>/dev/null)
if [ $pid -gt 0 ]; then
kill -9 $pid &&\
debug_log "$handler" "docker caontainer $docker_name killed successfully." ||\
debug_log "$handler" "tries to kill the container $docker_name but failed. Maybe it has already exited."
else
debug_log "$handler" "docker container $docker_name has already exited"
is_oom=$(docker inspect --format={{{ inspectOOMKilledFormat }}} $docker_name 2>/dev/null)
debug_log "$handler" "docker container $docker_name is exited by OOM? $is_oom"
if [ "$is_oom" = "true" ]; then
rc=55
fi
fi

printf "[DEBUG] Write exit code $rc to file /var/lib/hadoopdata/nm-local-dir/nmPrivate/$APP_ID/$CONTAINER_ID/$CONTAINER_ID.pid.exitcode.\n"
docker container rm $docker_name
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Gerhut
We can't guarantee that exit_handler will be executed; all of the code here is best-effort, so the container might be left on the host.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You're right, but it is an abnormal state when exit_handler is not executed. Removing the leftover legacy containers should rely on the cleaner.

pkill --parent $$

debug_log "$handler" "write exit code to file"
debug_log "$handler" "yarn container exit code: $rc"
debug_log "$handler" "exit code file path: /var/lib/hadoopdata/nm-local-dir/nmPrivate/$APP_ID/$CONTAINER_ID/$CONTAINER_ID.pid.exitcode"
echo $rc > "/var/lib/hadoopdata/nm-local-dir/nmPrivate/$APP_ID/$CONTAINER_ID/$CONTAINER_ID.pid.exitcode"

exit $rc
Expand Down Expand Up @@ -266,7 +290,7 @@ docker pull {{ jobData.image }} \
## network consumption
docker run --name $docker_name \
--init \
--rm \
--detach \
--tty \
--privileged=false \
--oom-score-adj=1000 \
Expand Down Expand Up @@ -306,3 +330,8 @@ docker run --name $docker_name \
{{ jobData.image }} \
/bin/bash '/pai/bootstrap/docker_bootstrap.sh'

# Ask `docker inspect` for the job container's PID. The --format string is
# injected at template-render time from the inspectPidFormat variable.
docker_pid=$(docker inspect --format "{{{ inspectPidFormat }}}" $docker_name)

# Print the PID for diagnostics.
echo "Docker container pid is $docker_pid"

# Attach to the container without forwarding stdin.
# NOTE(review): presumably this keeps the script in the foreground, streaming
# the container's output until it exits, because the `docker run` above is
# started with --detach — confirm.
docker attach --no-stdin $docker_name