From d4600322773b1e7903a2c9466508a7ba3b9ca9f4 Mon Sep 17 00:00:00 2001 From: Christopher Cooper Date: Thu, 24 Oct 2024 16:56:46 -0700 Subject: [PATCH 1/3] [UX] remove all uses of deprecated `sky spot` --- docs/source/examples/managed-jobs.rst | 2 +- docs/source/reference/faq.rst | 2 +- examples/managed_job_with_storage.yaml | 2 +- llm/axolotl/axolotl-spot.yaml | 2 +- llm/axolotl/readme.md | 2 +- llm/falcon/README.md | 12 ++++++------ llm/vicuna-llama-2/README.md | 2 +- llm/vicuna/README.md | 4 ++-- tests/backward_compatibility_tests.sh | 4 ++-- 9 files changed, 16 insertions(+), 16 deletions(-) diff --git a/docs/source/examples/managed-jobs.rst b/docs/source/examples/managed-jobs.rst index a47b4345b9f..cdee85207b7 100644 --- a/docs/source/examples/managed-jobs.rst +++ b/docs/source/examples/managed-jobs.rst @@ -93,7 +93,7 @@ We can launch it with the following: setup: | # Fill in your wandb key: copy from https://wandb.ai/authorize # Alternatively, you can use `--env WANDB_API_KEY=$WANDB_API_KEY` - # to pass the key in the command line, during `sky spot launch`. + # to pass the key in the command line, during `sky jobs launch`. echo export WANDB_API_KEY=[YOUR-WANDB-API-KEY] >> ~/.bashrc pip install -e . diff --git a/docs/source/reference/faq.rst b/docs/source/reference/faq.rst index 5a966a0014f..5a89b23c447 100644 --- a/docs/source/reference/faq.rst +++ b/docs/source/reference/faq.rst @@ -38,7 +38,7 @@ How to ensure my workdir's ``.git`` is synced up for managed spot jobs? Currently, there is a difference in whether ``.git`` is synced up depending on the command used: - For regular ``sky launch``, the workdir's ``.git`` is synced up by default. -- For managed spot jobs ``sky spot launch``, the workdir's ``.git`` is excluded by default. 
In the second case, to ensure the workdir's ``.git`` is synced up for managed spot jobs, you can explicitly add a file mount to sync it up: diff --git a/examples/managed_job_with_storage.yaml b/examples/managed_job_with_storage.yaml index 61244c16ba0..677e2c8ed6d 100644 --- a/examples/managed_job_with_storage.yaml +++ b/examples/managed_job_with_storage.yaml @@ -3,7 +3,7 @@ # Runs a task that uses cloud buckets for uploading and accessing files. # # Usage: -# sky spot launch -c spot-storage examples/managed_job_with_storage.yaml +# sky jobs launch -c spot-storage examples/managed_job_with_storage.yaml # sky down spot-storage resources: diff --git a/llm/axolotl/axolotl-spot.yaml b/llm/axolotl/axolotl-spot.yaml index b22a8ae3fce..0e04ba11992 100644 --- a/llm/axolotl/axolotl-spot.yaml +++ b/llm/axolotl/axolotl-spot.yaml @@ -4,7 +4,7 @@ # HF_TOKEN=abc BUCKET= sky launch -c axolotl-spot axolotl-spot.yaml --env HF_TOKEN --env BUCKET -i30 --down # # Managed spot (auto-recovery; for full runs): -# HF_TOKEN=abc BUCKET= sky spot launch -n axolotl-spot axolotl-spot.yaml --env HF_TOKEN --env BUCKET +# HF_TOKEN=abc BUCKET= sky jobs launch -n axolotl-spot axolotl-spot.yaml --env HF_TOKEN --env BUCKET name: axolotl diff --git a/llm/axolotl/readme.md b/llm/axolotl/readme.md index 0cc06b98723..eb80231aa93 100644 --- a/llm/axolotl/readme.md +++ b/llm/axolotl/readme.md @@ -22,5 +22,5 @@ ssh -L 8888:localhost:8888 axolotl-spot Launch managed spot instances (auto-recovery; for full runs): ``` -HF_TOKEN=abc BUCKET= sky spot launch -n axolotl-spot axolotl-spot.yaml --env HF_TOKEN --env BUCKET +HF_TOKEN=abc BUCKET= sky jobs launch -n axolotl-spot axolotl-spot.yaml --env HF_TOKEN --env BUCKET ``` diff --git a/llm/falcon/README.md b/llm/falcon/README.md index 6eb480d9ea8..1f40dc9f524 100644 --- a/llm/falcon/README.md +++ b/llm/falcon/README.md @@ -1,6 +1,6 @@ # Finetuning Falcon with SkyPilot -This README contains instructions on how to use SkyPilot to finetune Falcon-7B and Falcon-40B, an 
open-source LLM that rivals many current closed-source models, including ChatGPT. +This README contains instructions on how to use SkyPilot to finetune Falcon-7B and Falcon-40B, an open-source LLM that rivals many current closed-source models, including ChatGPT. * [Blog post](https://huggingface.co/blog/falcon) * [Repo](https://huggingface.co/tiiuae/falcon-40b) @@ -16,10 +16,10 @@ sky check See the Falcon SkyPilot YAML for [training](train.yaml). Serving is currently a work in progress and a YAML will be provided for that soon! We are also working on adding an evaluation step to evaluate the model you finetuned compared to the base model. ## Running Falcon on SkyPilot -Finetuning `Falcon-7B` and `Falcon-40B` require GPUs with 80GB memory, +Finetuning `Falcon-7B` and `Falcon-40B` require GPUs with 80GB memory, but `Falcon-7b-sharded` requires only 40GB memory. Thus, * If your GPU has 40 GB memory or less (e.g., Nvidia A100): use `ybelkada/falcon-7b-sharded-bf16`. -* If your GPU has 80 GB memory (e.g., Nvidia A100-80GB): you can also use `tiiuae/falcon-7b` and `tiiuae/falcon-40b`. +* If your GPU has 80 GB memory (e.g., Nvidia A100-80GB): you can also use `tiiuae/falcon-7b` and `tiiuae/falcon-40b`. Try `sky show-gpus --all` for supported GPUs. @@ -32,13 +32,13 @@ Steps for training on your cloud(s): 1. In [train.yaml](train.yaml), set the following variables in `envs`: - Replace the `OUTPUT_BUCKET_NAME` with a unique name. SkyPilot will create this bucket for you to store the model weights. - - Replace the `WANDB_API_KEY` to your own key. - - Replace the `MODEL_NAME` with your desired base model. + - Replace the `WANDB_API_KEY` to your own key. + - Replace the `MODEL_NAME` with your desired base model. 2. **Training the Falcon model using spot instances**: ```bash -sky spot launch -n falcon falcon.yaml +sky jobs launch --use-spot -n falcon falcon.yaml ``` Currently, such `A100-80GB:1` spot instances are only available on AWS and GCP. 
diff --git a/llm/vicuna-llama-2/README.md b/llm/vicuna-llama-2/README.md index 24caa525a56..891cb301123 100644 --- a/llm/vicuna-llama-2/README.md +++ b/llm/vicuna-llama-2/README.md @@ -122,7 +122,7 @@ sky launch --no-use-spot ... [SkyPilot Managed Spot](https://skypilot.readthedocs.io/en/latest/examples/spot-jobs.html) is a library built on top of SkyPilot that helps users run jobs on spot instances without worrying about interruptions. That is the tool used by the LMSYS organization to train the first version of Vicuna (more details can be found in their [launch blog post](https://lmsys.org/blog/2023-03-30-vicuna/) and [example](https://github.com/skypilot-org/skypilot/tree/master/llm/vicuna)). With this, the training cost can be reduced from $1000 to **\$300**. -To use SkyPilot Managed Spot, you can simply replace `sky launch` with `sky spot launch` in the above command: +To use SkyPilot Managed Spot, you can simply replace `sky launch` with `sky jobs launch` in the above command: ```bash sky spot launch -n vicuna train.yaml \ diff --git a/llm/vicuna/README.md b/llm/vicuna/README.md index b511eb7f4b0..6d9f46127d4 100644 --- a/llm/vicuna/README.md +++ b/llm/vicuna/README.md @@ -63,14 +63,14 @@ Steps for training on your cloud(s): 2. **Training the Vicuna-7B model on 8 A100 GPUs (80GB memory) using spot instances**: ```bash # Launch it on managed spot to save 3x cost -sky spot launch -n vicuna train.yaml +sky jobs launch -n vicuna train.yaml ``` Note: if you would like to see the training curve on W&B, you can add `--env WANDB_API_KEY` to the above command, which will propagate your local W&B API key in the environment variable to the job. [Optional] Train a larger 13B model ``` # Train a 13B model instead of the default 7B -sky spot launch -n vicuna-7b train.yaml --env MODEL_SIZE=13 +sky jobs launch -n vicuna-7b train.yaml --env MODEL_SIZE=13 # Use *unmanaged* spot instances (i.e., preemptions won't get auto-recovered). 
# Unmanaged spot provides a better interactive development experience but is vulnerable to spot preemptions. diff --git a/tests/backward_compatibility_tests.sh b/tests/backward_compatibility_tests.sh index 4f83c379ccf..276fda899dd 100644 --- a/tests/backward_compatibility_tests.sh +++ b/tests/backward_compatibility_tests.sh @@ -167,8 +167,8 @@ MANAGED_JOB_JOB_NAME=${CLUSTER_NAME}-${uuid:0:4} if [ "$start_from" -le 7 ]; then conda activate sky-back-compat-master rm -r ~/.sky/wheels || true -sky spot launch -d --cloud ${CLOUD} -y --cpus 2 --num-nodes 2 -n ${MANAGED_JOB_JOB_NAME}-7-0 "echo hi; sleep 1000" -sky spot launch -d --cloud ${CLOUD} -y --cpus 2 --num-nodes 2 -n ${MANAGED_JOB_JOB_NAME}-7-1 "echo hi; sleep 400" +sky jobs launch -d --cloud ${CLOUD} -y --cpus 2 --num-nodes 2 -n ${MANAGED_JOB_JOB_NAME}-7-0 "echo hi; sleep 1000" +sky jobs launch -d --cloud ${CLOUD} -y --cpus 2 --num-nodes 2 -n ${MANAGED_JOB_JOB_NAME}-7-1 "echo hi; sleep 400" conda activate sky-back-compat-current rm -r ~/.sky/wheels || true s=$(sky jobs queue | grep ${MANAGED_JOB_JOB_NAME}-7 | grep "RUNNING" | wc -l) From 0d1d25b16b12bddbc9627af1383da0d8e2a70404 Mon Sep 17 00:00:00 2001 From: Christopher Cooper Date: Fri, 25 Oct 2024 17:11:02 -0700 Subject: [PATCH 2/3] Apply suggestions from code review Co-authored-by: Romil Bhardwaj --- docs/source/reference/faq.rst | 2 +- llm/vicuna-llama-2/README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/reference/faq.rst b/docs/source/reference/faq.rst index 5a89b23c447..1ade656b44b 100644 --- a/docs/source/reference/faq.rst +++ b/docs/source/reference/faq.rst @@ -38,7 +38,7 @@ How to ensure my workdir's ``.git`` is synced up for managed spot jobs? Currently, there is a difference in whether ``.git`` is synced up depending on the command used: - For regular ``sky launch``, the workdir's ``.git`` is synced up by default. -- For managed spot jobs ``sky jobs launch``, the workdir's ``.git`` is excluded by default. 
+- For managed jobs ``sky jobs launch``, the workdir's ``.git`` is excluded by default. In the second case, to ensure the workdir's ``.git`` is synced up for managed spot jobs, you can explicitly add a file mount to sync it up: diff --git a/llm/vicuna-llama-2/README.md b/llm/vicuna-llama-2/README.md index 891cb301123..7bd5c077a3a 100644 --- a/llm/vicuna-llama-2/README.md +++ b/llm/vicuna-llama-2/README.md @@ -125,7 +125,7 @@ sky launch --no-use-spot ... To use SkyPilot Managed Spot, you can simply replace `sky launch` with `sky jobs launch` in the above command: ```bash -sky spot launch -n vicuna train.yaml \ +sky jobs launch -n vicuna train.yaml \ --env ARTIFACT_BUCKET_NAME= \ --env WANDB_API_KEY= ``` From 2c71e571ef06cff98b7f684512afe0d1492bbc89 Mon Sep 17 00:00:00 2001 From: Christopher Cooper Date: Fri, 25 Oct 2024 17:28:00 -0700 Subject: [PATCH 3/3] fix other mentions of "spot jobs" --- llm/vicuna-llama-2/README.md | 4 ++-- sky/cli.py | 2 +- sky/jobs/controller.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/llm/vicuna-llama-2/README.md b/llm/vicuna-llama-2/README.md index 7bd5c077a3a..e392b231e64 100644 --- a/llm/vicuna-llama-2/README.md +++ b/llm/vicuna-llama-2/README.md @@ -120,9 +120,9 @@ sky launch --no-use-spot ... ### Reducing costs by 3x with spot instances -[SkyPilot Managed Spot](https://skypilot.readthedocs.io/en/latest/examples/spot-jobs.html) is a library built on top of SkyPilot that helps users run jobs on spot instances without worrying about interruptions. That is the tool used by the LMSYS organization to train the first version of Vicuna (more details can be found in their [launch blog post](https://lmsys.org/blog/2023-03-30-vicuna/) and [example](https://github.com/skypilot-org/skypilot/tree/master/llm/vicuna)). With this, the training cost can be reduced from $1000 to **\$300**. 
+[SkyPilot Managed Jobs](https://skypilot.readthedocs.io/en/latest/examples/managed-jobs.html) is a library built on top of SkyPilot that helps users run jobs on spot instances without worrying about interruptions. That is the tool used by the LMSYS organization to train the first version of Vicuna (more details can be found in their [launch blog post](https://lmsys.org/blog/2023-03-30-vicuna/) and [example](https://github.com/skypilot-org/skypilot/tree/master/llm/vicuna)). With this, the training cost can be reduced from $1000 to **\$300**. -To use SkyPilot Managed Spot, you can simply replace `sky launch` with `sky jobs launch` in the above command: +To use SkyPilot Managed Spot Jobs, you can simply replace `sky launch` with `sky jobs launch` in the above command: ```bash sky jobs launch -n vicuna train.yaml \ diff --git a/sky/cli.py b/sky/cli.py index 6e0587cc117..db1befb04a3 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -3519,7 +3519,7 @@ def jobs(): default=None, type=str, hidden=True, - help=('Alias for --name, the name of the spot job.')) + help=('Alias for --name, the name of the managed job.')) @click.option('--job-recovery', default=None, type=str, diff --git a/sky/jobs/controller.py b/sky/jobs/controller.py index f3cd81576e2..1faa5dfbe31 100644 --- a/sky/jobs/controller.py +++ b/sky/jobs/controller.py @@ -215,7 +215,7 @@ def _run_one_task(self, task_id: int, task: 'sky.Task') -> bool: end_time=end_time, callback_func=callback_func) logger.info( - f'Spot job {self._job_id} (task: {task_id}) SUCCEEDED. ' + f'Managed job {self._job_id} (task: {task_id}) SUCCEEDED. ' f'Cleaning up the cluster {cluster_name}.') # Only clean up the cluster, not the storages, because tasks may # share storages.