diff --git a/README.md b/README.md index 7bc80092..ba0c855c 100644 --- a/README.md +++ b/README.md @@ -125,15 +125,6 @@ all zones. --num-slices=4 --spot ``` -* Cluster Create for Pathways: - Pathways compatible cluster can be created using `--enable-pathways` - ```shell - python3 xpk.py cluster create \ - --cluster xpk-pw-test \ - --num-slices=4 --on-demand \ - --tpu-type=v5litepod-16 \ - --enable-pathways - ``` * Cluster Create can be called again with the same `--cluster name` to modify the number of slices or retry failed steps. @@ -211,36 +202,6 @@ all zones. --tpu-type=v5litepod-16 ``` -* Workload Create for Pathways: - Pathways workload can be submitted using `--use-pathways` on a Pathways enabled cluster (created with `--enable-pathways`) - - Pathways workload example: - ```shell - python3 xpk.py workload create \ - --workload xpk-pw-test \ - --num-slices=1 \ - --tpu-type=v5litepod-16 \ - --use-pathways \ - --cluster xpk-pw-test \ - --docker-name='user-workload' \ - --docker-image= \ - --command='bash /usr/pathways/ifrt/maxtext_entrypoint.sh base_output_directory= dataset_path= per_device_batch_size=1 enable_checkpointing=false enable_profiler=false remat_policy=full global_parameter_scale=4 steps=300 max_target_length=2048 use_iota_embed=true reuse_example_batch=1 dataset_type=synthetic attention=flash gcs_metrics=True run_name=$(USER)-pw-xpk-test-1' - ``` - - Regular workload can also be submitted on a Pathways enabled cluster (created with `--enable-pathways`) - - Pathways workload example: - ```shell - python3 xpk.py workload create \ - --workload xpk-regular-test \ - --num-slices=1 \ - --tpu-type=v5litepod-16 \ - --cluster xpk-pw-test \ - --docker-name='user-workload' \ - --docker-image= \ - --command='python3 MaxText/train.py MaxText/configs/base.yml base_output_directory= dataset_path= per_device_batch_size=1 enable_checkpointing=false enable_profiler=false remat_policy=full global_parameter_scale=4 steps=300 max_target_length=2048 use_iota_embed=true 
reuse_example_batch=1 dataset_type=synthetic attention=flash gcs_metrics=True run_name=$(USER)-pw-xpk-test-1' - ``` - ### Set `max-restarts` for production jobs * `--max-restarts `: By default, this is 0. This will restart the job "" @@ -354,6 +315,49 @@ checkpointing so the job restarts near where it was interrupted. python3 xpk.py workload list \ --cluster xpk-test --filter-by-job=$USER ``` +## Pathways on XPK + +* Cluster Create for Pathways: + A Pathways-compatible cluster can be created using `--enable-pathways` + ```shell + python3 xpk.py cluster create \ + --cluster xpk-pw-test \ + --num-slices=4 --on-demand \ + --tpu-type=v5litepod-16 \ + --enable-pathways + ``` + +* Workload Create for Pathways: + A Pathways workload can be submitted using `--use-pathways` on a Pathways-enabled cluster (created with `--enable-pathways`) + + Pathways workload example: + ```shell + python3 xpk.py workload create \ + --workload xpk-pw-test \ + --num-slices=1 \ + --tpu-type=v5litepod-16 \ + --cluster xpk-pw-test \ + --use-pathways \ + --server-image= \ + --proxy-server-image= \ + --docker-name='user-workload' \ + --docker-image= \ + --command='bash /usr/pathways/ifrt/maxtext_entrypoint.sh base_output_directory= dataset_path= per_device_batch_size=1 enable_checkpointing=false enable_profiler=false remat_policy=full global_parameter_scale=4 steps=300 max_target_length=2048 use_iota_embed=true reuse_example_batch=1 dataset_type=synthetic attention=flash gcs_metrics=True run_name=$(USER)-pw-xpk-test-1' + ``` + + A regular workload can also be submitted (by omitting `--use-pathways`) on a Pathways-enabled cluster (i.e., a cluster created with `--enable-pathways`) + + Regular workload example: + ```shell + python3 xpk.py workload create \ + --workload xpk-regular-test \ + --num-slices=1 \ + --tpu-type=v5litepod-16 \ + --cluster xpk-pw-test \ + --docker-name='user-workload' \ + --docker-image= \ + --command='python3 MaxText/train.py MaxText/configs/base.yml base_output_directory= 
dataset_path= per_device_batch_size=1 enable_checkpointing=false enable_profiler=false remat_policy=full global_parameter_scale=4 steps=300 max_target_length=2048 use_iota_embed=true reuse_example_batch=1 dataset_type=synthetic attention=flash gcs_metrics=True run_name=$(USER)-pw-xpk-test-1' + ``` ## Inspector * Inspector provides debug info to understand cluster health, and why workloads are not running. diff --git a/xpk.py b/xpk.py index f1b03cda..7e74c33e 100644 --- a/xpk.py +++ b/xpk.py @@ -3200,6 +3200,35 @@ def setup_docker_image(args) -> tuple[int, str]: return 0, docker_image +def check_use_pathways(args) -> bool: + """ + Both --proxy-server-image and --server-image need to be set if --use-pathways is set. + Neither can be provided if --use-pathways is not provided. + Args: + args: user provided arguments for running the command. + + Returns: + bool: Whether the expected images are provided with --use-pathways. + """ + return args.use_pathways == (args.proxy_server_image is not None and args.server_image is not None) + +def validate_pathways_docker_images(args) -> bool: + """Validates the existence of Pathways server and proxy docker images in the project. + + Args: + args: user provided arguments for running the command. + + Returns: + bool: whether the Pathways proxy and server images are valid. + """ + proxy_server_return_code = validate_docker_image(args.proxy_server_image, args) + server_return_code = validate_docker_image(args.server_image, args) + if proxy_server_return_code > 0 or server_return_code > 0: + return False + else: + return True + + def get_main_and_sidecar_container(args, system, docker_image) -> str: """Generate yaml for main and sidecar container. 
Args: @@ -3379,7 +3408,6 @@ def get_pathways_proxy_args(args) -> str: - --pathways_ifrt_proxy_server_resource_manager={args.workload}-rm-0-0.{args.workload}:38677 - --pathways_ifrt_proxy_server_port=38676 - --pathways_tmp_dir_pattern={args.pathways_gcs_location} - - --pathways_xprof_trace_enable_bulk_upload=true - --pathways_plaque_network=gcp""" if args.use_pathways: return yaml.format(args=args) @@ -3721,6 +3749,15 @@ def workload_create(args) -> int: if setup_docker_image_code != 0: xpk_exit(setup_docker_image_code) + if not check_use_pathways(args): + xpk_print('--proxy-server-image and --server-image need to be provided', + 'with --use-pathways.') + xpk_exit(1) + + if args.use_pathways and not validate_pathways_docker_images(args): + xpk_print('Please check the proxy-server-image or server-image as advised!') + xpk_exit(1) + add_env_config(args) debugging_dashboard_id = None @@ -4857,17 +4894,15 @@ def directory_path_type(value): workload_pathways_workload_arguments.add_argument( '--proxy-server-image', type=str, - default='gcr.io/cloud-tpu-v2-images/pathways/pathways-demo:proxy_server', help=( - 'Please provide the proxy server image for Pathways.' + 'Please provide the proxy server image for Pathways. This argument needs to be used with --use-pathways.' ), ) workload_pathways_workload_arguments.add_argument( '--server-image', type=str, - default='gcr.io/cloud-tpu-v2-images/pathways/pathways-demo:server', help=( - 'Please provide the server image for Pathways.' + 'Please provide the server image for Pathways. This argument needs to be used with --use-pathways.' ), ) workload_pathways_workload_arguments.add_argument(