Skip to content

Commit

Permalink
fix: Add write permission to job output dirs for remote and step deco…
Browse files Browse the repository at this point in the history
…rator running on non-root job user (aws#4325)
  • Loading branch information
qidewenwhen authored Dec 15, 2023
1 parent 9269053 commit 53b9471
Show file tree
Hide file tree
Showing 7 changed files with 298 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from __future__ import absolute_import

import argparse
import getpass
import sys
import os
import shutil
Expand All @@ -38,6 +39,7 @@
REMOTE_FUNCTION_WORKSPACE = "sm_rf_user_ws"
BASE_CHANNEL_PATH = "/opt/ml/input/data"
FAILURE_REASON_PATH = "/opt/ml/output/failure"
JOB_OUTPUT_DIRS = ["/opt/ml/output", "/opt/ml/model", "/tmp"]
PRE_EXECUTION_SCRIPT_NAME = "pre_exec.sh"
JOB_REMOTE_FUNCTION_WORKSPACE = "sagemaker_remote_function_workspace"
SCRIPT_AND_DEPENDENCIES_CHANNEL_NAME = "pre_exec_script_and_dependencies"
Expand All @@ -63,6 +65,17 @@ def main(sys_args=None):

RuntimeEnvironmentManager()._validate_python_version(client_python_version, conda_env)

user = getpass.getuser()
if user != "root":
log_message = (
"The job is running on non-root user: %s. Adding write permissions to the "
"following job output directories: %s."
)
logger.info(log_message, user, JOB_OUTPUT_DIRS)
RuntimeEnvironmentManager().change_dir_permission(
dirs=JOB_OUTPUT_DIRS, new_permission="777"
)

if pipeline_execution_id:
_bootstrap_runtime_env_for_pipeline_step(
client_python_version, func_step_workspace, conda_env, dependency_settings
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,32 @@ def run_pre_exec_script(self, pre_exec_script_path: str):
pre_exec_script_path,
)

def change_dir_permission(self, dirs: list, new_permission: str):
"""Change the permission of given directories
Args:
dirs (list[str]): A list of directories for permission update.
new_permission (str): The new permission for the given directories.
"""

_ERROR_MSG_PREFIX = "Failed to change directory permissions due to: "
command = ["sudo", "chmod", "-R", new_permission] + dirs
logger.info("Executing '%s'.", " ".join(command))

try:
subprocess.run(command, check=True, stderr=subprocess.PIPE)
except subprocess.CalledProcessError as called_process_err:
err_msg = called_process_err.stderr.decode("utf-8")
raise RuntimeEnvironmentError(f"{_ERROR_MSG_PREFIX} {err_msg}")
except FileNotFoundError as file_not_found_err:
if "[Errno 2] No such file or directory: 'sudo'" in str(file_not_found_err):
raise RuntimeEnvironmentError(
f"{_ERROR_MSG_PREFIX} {file_not_found_err}. "
"Please contact the image owner to install 'sudo' in the job container "
"and provide sudo privilege to the container user."
)
raise RuntimeEnvironmentError(file_not_found_err)

def _is_file_exists(self, dependencies):
"""Check whether the dependencies file exists at the given location.
Expand Down
5 changes: 5 additions & 0 deletions tests/integ/sagemaker/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,12 @@
"RUN curl 'https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip' -o 'awscliv2.zip' \
&& unzip awscliv2.zip \
&& ./aws/install\n\n"
"RUN apt install sudo\n"
"RUN useradd -ms /bin/bash integ-test-user\n"
# Add the user to sudo group
"RUN usermod -aG sudo integ-test-user\n"
# Ensure passwords are not required for sudo group users
"RUN echo '%sudo ALL= (ALL) NOPASSWD:ALL' >> /etc/sudoers\n"
"USER integ-test-user\n"
"WORKDIR /home/integ-test-user\n"
"COPY {source_archive} ./\n"
Expand Down
19 changes: 19 additions & 0 deletions tests/integ/sagemaker/remote_function/test_decorator.py
Original file line number Diff line number Diff line change
Expand Up @@ -747,6 +747,25 @@ def cuberoot(x):
assert cuberoot(27) == 3


def test_with_user_and_workdir_set_in_the_image_client_error_case(
sagemaker_session, dummy_container_with_user_and_workdir, cpu_instance_type
):
client_error_message = "Testing client error in job."

@remote(
role=ROLE,
image_uri=dummy_container_with_user_and_workdir,
instance_type=cpu_instance_type,
sagemaker_session=sagemaker_session,
)
def my_func():
raise RuntimeError(client_error_message)

with pytest.raises(RuntimeError) as error:
my_func()
assert client_error_message in str(error)


@pytest.mark.skip
def test_decorator_with_spark_job(sagemaker_session, cpu_instance_type):
@remote(
Expand Down
46 changes: 46 additions & 0 deletions tests/integ/sagemaker/workflow/test_step_decorator.py
Original file line number Diff line number Diff line change
Expand Up @@ -860,6 +860,52 @@ def cuberoot(x):
pass


def test_with_user_and_workdir_set_in_the_image_client_error_case(
sagemaker_session, role, pipeline_name, region_name, dummy_container_with_user_and_workdir
):
# This test aims to ensure client error in step decorated function
# can be successfully surfaced and the job can be failed.
os.environ["AWS_DEFAULT_REGION"] = region_name
client_error_message = "Testing client error in job."

@step(
role=role,
image_uri=dummy_container_with_user_and_workdir,
instance_type=INSTANCE_TYPE,
)
def my_func():
raise RuntimeError(client_error_message)

step_a = my_func()

pipeline = Pipeline(
name=pipeline_name,
steps=[step_a],
sagemaker_session=sagemaker_session,
)

try:
_, execution_steps = create_and_execute_pipeline(
pipeline=pipeline,
pipeline_name=pipeline_name,
region_name=region_name,
role=role,
no_of_steps=1,
last_step_name=get_step(step_a).name,
execution_parameters=dict(),
step_status="Failed",
)
assert (
f"ClientError: AlgorithmError: RuntimeError('{client_error_message}')"
in execution_steps[0]["FailureReason"]
)
finally:
try:
pipeline.delete()
except Exception:
pass


def test_step_level_serialization(
sagemaker_session, role, pipeline_name, region_name, dummy_container_without_error
):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@
# language governing permissions and limitations under the License.
from __future__ import absolute_import


from mock import patch
from mock.mock import MagicMock

from sagemaker.remote_function.runtime_environment.runtime_environment_manager import (
RuntimeEnvironmentError,
_DependencySettings,
Expand Down Expand Up @@ -78,14 +79,22 @@ def args_for_step():
"sagemaker.remote_function.runtime_environment.bootstrap_runtime_environment."
"_bootstrap_runtime_env_for_remote_function"
)
def test_main_success_remote_job(
@patch("getpass.getuser", MagicMock(return_value="root"))
@patch(
"sagemaker.remote_function.runtime_environment.runtime_environment_manager."
"RuntimeEnvironmentManager.change_dir_permission"
)
def test_main_success_remote_job_with_root_user(
change_dir_permission,
bootstrap_remote,
run_pre_exec_script,
bootstrap_runtime,
validate_python,
_exit_process,
):
bootstrap.main(args_for_remote())

change_dir_permission.assert_not_called()
validate_python.assert_called_once_with(TEST_PYTHON_VERSION, TEST_JOB_CONDA_ENV)
bootstrap_remote.assert_called_once_with(
TEST_PYTHON_VERSION,
Expand Down Expand Up @@ -114,14 +123,21 @@ def test_main_success_remote_job(
"sagemaker.remote_function.runtime_environment.bootstrap_runtime_environment."
"_bootstrap_runtime_env_for_pipeline_step"
)
def test_main_success_pipeline_step(
@patch("getpass.getuser", MagicMock(return_value="root"))
@patch(
"sagemaker.remote_function.runtime_environment.runtime_environment_manager."
"RuntimeEnvironmentManager.change_dir_permission"
)
def test_main_success_pipeline_step_with_root_user(
change_dir_permission,
bootstrap_step,
run_pre_exec_script,
bootstrap_runtime,
validate_python,
_exit_process,
):
bootstrap.main(args_for_step())
change_dir_permission.assert_not_called()
validate_python.assert_called_once_with(TEST_PYTHON_VERSION, TEST_JOB_CONDA_ENV)
bootstrap_step.assert_called_once_with(
TEST_PYTHON_VERSION,
Expand Down Expand Up @@ -150,14 +166,25 @@ def test_main_success_pipeline_step(
"sagemaker.remote_function.runtime_environment.bootstrap_runtime_environment."
"_bootstrap_runtime_env_for_remote_function"
)
def test_main_failure_remote_job(
bootstrap_runtime, run_pre_exec_script, write_failure, _exit_process, validate_python
@patch("getpass.getuser", MagicMock(return_value="root"))
@patch(
"sagemaker.remote_function.runtime_environment.runtime_environment_manager."
"RuntimeEnvironmentManager.change_dir_permission"
)
def test_main_failure_remote_job_with_root_user(
change_dir_permission,
bootstrap_runtime,
run_pre_exec_script,
write_failure,
_exit_process,
validate_python,
):
runtime_err = RuntimeEnvironmentError("some failure reason")
bootstrap_runtime.side_effect = runtime_err

bootstrap.main(args_for_remote())

change_dir_permission.assert_not_called()
validate_python.assert_called_once_with(TEST_PYTHON_VERSION, TEST_JOB_CONDA_ENV)
run_pre_exec_script.assert_not_called()
bootstrap_runtime.assert_called()
Expand All @@ -181,21 +208,125 @@ def test_main_failure_remote_job(
"sagemaker.remote_function.runtime_environment.bootstrap_runtime_environment."
"_bootstrap_runtime_env_for_pipeline_step"
)
def test_main_failure_pipeline_step(
bootstrap_runtime, run_pre_exec_script, write_failure, _exit_process, validate_python
@patch("getpass.getuser", MagicMock(return_value="root"))
@patch(
"sagemaker.remote_function.runtime_environment.runtime_environment_manager."
"RuntimeEnvironmentManager.change_dir_permission"
)
def test_main_failure_pipeline_step_with_root_user(
change_dir_permission,
bootstrap_runtime,
run_pre_exec_script,
write_failure,
_exit_process,
validate_python,
):
runtime_err = RuntimeEnvironmentError("some failure reason")
bootstrap_runtime.side_effect = runtime_err

bootstrap.main(args_for_step())

change_dir_permission.assert_not_called()
validate_python.assert_called_once_with(TEST_PYTHON_VERSION, TEST_JOB_CONDA_ENV)
run_pre_exec_script.assert_not_called()
bootstrap_runtime.assert_called()
write_failure.assert_called_with(str(runtime_err))
_exit_process.assert_called_with(1)


@patch("sys.exit")
@patch(
"sagemaker.remote_function.runtime_environment.runtime_environment_manager."
"RuntimeEnvironmentManager._validate_python_version"
)
@patch(
"sagemaker.remote_function.runtime_environment.runtime_environment_manager."
"RuntimeEnvironmentManager.bootstrap"
)
@patch(
"sagemaker.remote_function.runtime_environment.runtime_environment_manager."
"RuntimeEnvironmentManager.run_pre_exec_script"
)
@patch(
"sagemaker.remote_function.runtime_environment.bootstrap_runtime_environment."
"_bootstrap_runtime_env_for_remote_function"
)
@patch("getpass.getuser", MagicMock(return_value="non_root"))
@patch(
"sagemaker.remote_function.runtime_environment.runtime_environment_manager."
"RuntimeEnvironmentManager.change_dir_permission"
)
def test_main_remote_job_with_non_root_user(
change_dir_permission,
bootstrap_remote,
run_pre_exec_script,
bootstrap_runtime,
validate_python,
_exit_process,
):
bootstrap.main(args_for_remote())

change_dir_permission.assert_called_once_with(
dirs=bootstrap.JOB_OUTPUT_DIRS, new_permission="777"
)
validate_python.assert_called_once_with(TEST_PYTHON_VERSION, TEST_JOB_CONDA_ENV)
bootstrap_remote.assert_called_once_with(
TEST_PYTHON_VERSION,
TEST_JOB_CONDA_ENV,
_DependencySettings(TEST_DEPENDENCY_FILE_NAME),
)
run_pre_exec_script.assert_not_called()
bootstrap_runtime.assert_not_called()
_exit_process.assert_called_with(0)


@patch("sys.exit")
@patch(
"sagemaker.remote_function.runtime_environment.runtime_environment_manager."
"RuntimeEnvironmentManager._validate_python_version"
)
@patch(
"sagemaker.remote_function.runtime_environment.runtime_environment_manager."
"RuntimeEnvironmentManager.bootstrap"
)
@patch(
"sagemaker.remote_function.runtime_environment.runtime_environment_manager."
"RuntimeEnvironmentManager.run_pre_exec_script"
)
@patch(
"sagemaker.remote_function.runtime_environment.bootstrap_runtime_environment."
"_bootstrap_runtime_env_for_pipeline_step"
)
@patch("getpass.getuser", MagicMock(return_value="non_root"))
@patch(
"sagemaker.remote_function.runtime_environment.runtime_environment_manager."
"RuntimeEnvironmentManager.change_dir_permission"
)
def test_main_pipeline_step_with_non_root_user(
change_dir_permission,
bootstrap_step,
run_pre_exec_script,
bootstrap_runtime,
validate_python,
_exit_process,
):
bootstrap.main(args_for_step())

change_dir_permission.assert_called_once_with(
dirs=bootstrap.JOB_OUTPUT_DIRS, new_permission="777"
)
validate_python.assert_called_once_with(TEST_PYTHON_VERSION, TEST_JOB_CONDA_ENV)
bootstrap_step.assert_called_once_with(
TEST_PYTHON_VERSION,
FUNC_STEP_WORKSPACE,
TEST_JOB_CONDA_ENV,
None,
)
run_pre_exec_script.assert_not_called()
bootstrap_runtime.assert_not_called()
_exit_process.assert_called_with(0)


@patch("shutil.unpack_archive")
@patch("os.getcwd", return_value=CURR_WORKING_DIR)
@patch("os.path.exists", return_value=True)
Expand Down
Loading

0 comments on commit 53b9471

Please sign in to comment.