Skip to content

Commit

Permalink
[perf-test] update 100 RayJob perf tests to use PyTorch trainer and R…
Browse files Browse the repository at this point in the history
…ay Data examples (#2149)
  • Loading branch information
andrewsykim authored May 17, 2024
1 parent 49c38be commit 2715544
Show file tree
Hide file tree
Showing 10 changed files with 128 additions and 39 deletions.
7 changes: 3 additions & 4 deletions benchmark/perf-tests/100-raycluster/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ steps:
Params:
action: start
labelSelector: app.kubernetes.io/created-by = kuberay-operator
threshold: 5m
threshold: 30m
- Identifier: WaitForControlledPodsRunning
Method: WaitForControlledPodsRunning
Params:
Expand All @@ -33,13 +33,13 @@ steps:
- basename: raycluster
objectTemplatePath: raycluster.yaml
templateFillMap:
Replicas: 1
Replicas: 3
- name: Wait for RayClusters ready
measurements:
- Identifier: WaitForRayCluster
Method: Exec
Params:
timeout: 10m
timeout: 30m
command:
- "bash"
- "100-raycluster/wait-for-rayclusters.sh"
Expand All @@ -49,7 +49,6 @@ steps:
Method: WaitForControlledPodsRunning
Params:
action: gather
operationTimeout: 10m
- name: Measure pod startup latency
measurements:
- Identifier: PodStartupLatency
Expand Down
2 changes: 2 additions & 0 deletions benchmark/perf-tests/100-raycluster/raycluster.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ apiVersion: ray.io/v1
kind: RayCluster
metadata:
name: {{.Name}}
labels:
perf-test: ray-cluster
spec:
rayVersion: '2.9.3'
headGroupSpec:
Expand Down
13 changes: 7 additions & 6 deletions benchmark/perf-tests/100-raycluster/results/junit.xml
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>
<testsuite name="ClusterLoaderV2" tests="0" failures="0" errors="0" time="58.451">
<testcase name="kuberay overall (github.com/ray-project/kuberary/benchmark/perf-tests/100-raycluster/config.yaml)" classname="ClusterLoaderV2" time="58.449897441"></testcase>
<testcase name="kuberay: [step: 01] Start measurements [00] - PodStartupLatency" classname="ClusterLoaderV2" time="0.271670737"></testcase>
<testcase name="kuberay: [step: 01] Start measurements [01] - WaitForControlledPodsRunning" classname="ClusterLoaderV2" time="0.673679139"></testcase>
<testcase name="kuberay: [step: 02] Creating Ray clusters" classname="ClusterLoaderV2" time="1.112338422"></testcase>
<testcase name="kuberay: [step: 03] Wait for pods to be running [00] - WaitForControlledPodsRunning" classname="ClusterLoaderV2" time="5.199491235"></testcase>
<testcase name="kuberay: [step: 04] Measure pod startup latency [00] - PodStartupLatency" classname="ClusterLoaderV2" time="0.477699884"></testcase>
<testcase name="kuberay overall (github.com/ray-project/kuberary/benchmark/perf-tests/100-raycluster/config.yaml)" classname="ClusterLoaderV2" time="135.242997357"/>
<testcase name="kuberay: [step: 01] Start measurements [00] - PodStartupLatency" classname="ClusterLoaderV2" time="0.106477876"/>
<testcase name="kuberay: [step: 01] Start measurements [01] - WaitForControlledPodsRunning" classname="ClusterLoaderV2" time="0.307346494"/>
<testcase name="kuberay: [step: 02] Creating Ray clusters" classname="ClusterLoaderV2" time="1.057693754"/>
<testcase name="kuberay: [step: 03] Wait for RayClusters ready [00] - WaitForRayCluster" classname="ClusterLoaderV2" time="83.460650317"/>
<testcase name="kuberay: [step: 04] Wait for pods to be running [00] - WaitForControlledPodsRunning" classname="ClusterLoaderV2" time="5.065870284"/>
<testcase name="kuberay: [step: 05] Measure pod startup latency [00] - PodStartupLatency" classname="ClusterLoaderV2" time="0.191868969"/>
</testsuite>
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ expect_succeeded=100
echo "waiting for $expect_succeeded RayClusters to be completed successfully"

while true; do
num_succeeded=$(kubectl get raycluster -A -o jsonpath='{range .items[*]}{.metadata.name} {.status.state}{"\n"}' | grep -c ready)
num_succeeded=$(kubectl get raycluster -l perf-test=ray-cluster -A -o jsonpath='{range .items[*]}{.metadata.name} {.status.state}{"\n"}' | grep -c ready)
echo "$num_succeeded RayClusters ready..."

if [[ "$num_succeeded" == "$expect_succeeded" ]]; then
Expand Down
26 changes: 20 additions & 6 deletions benchmark/perf-tests/100-rayjob/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ steps:
Params:
action: start
labelSelector: app.kubernetes.io/created-by = kuberay-operator
threshold: 5m
threshold: 30m
- Identifier: WaitForControlledPodsRunning
Method: WaitForControlledPodsRunning
Params:
Expand All @@ -27,23 +27,37 @@ steps:
Params:
action: start
labelSelector: app.kubernetes.io/created-by = kuberay-operator
threshold: 5m
- name: Creating RayJobs
threshold: 10m
- name: Creating RayJobs for PyTorch MNIST fine-tuning
phases:
- namespaceRange:
min: 1
max: 10
replicasPerNamespace: 10
tuningSet: Uniform100qps
objectBundle:
- basename: rayjob
objectTemplatePath: rayjob.yaml
- basename: pytorch-mnist
objectTemplatePath: pytorch-mnist-rayjob.yaml
templateFillMap:
Image: "rayproject/ray:2.9.3" # replace with image built from images/ray-pytorch
- name: Creating RayJobs for Ray Data Image Resizing
phases:
- namespaceRange:
min: 1
max: 10
replicasPerNamespace: 10
tuningSet: Uniform100qps
objectBundle:
- basename: ray-data-image-resize
objectTemplatePath: ray-data-image-resize.yaml
templateFillMap:
Image: "rayproject/ray:2.9.3" # replace with image built from images/ray-pytorch
- name: Wait for RayJobs complete
measurements:
- Identifier: WaitForRayJob
Method: Exec
Params:
timeout: 10m
timeout: 30m
command:
- "bash"
- "100-rayjob/wait-for-rayjobs.sh"
Expand Down
53 changes: 53 additions & 0 deletions benchmark/perf-tests/100-rayjob/pytorch-mnist-rayjob.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
apiVersion: ray.io/v1
kind: RayJob
metadata:
name: {{.Name}}
labels:
perf-test: rayjob-pytorch-mnist
spec:
shutdownAfterJobFinishes: true
entrypoint: python ray_train_pytorch_mnist.py
runtimeEnvYAML: |
env_vars:
NUM_WORKERS: "2"
CPUS_PER_WORKER: "1"
OMP_NUM_THREADS: "1" # Set OMP_NUM_THREADS to avoid KeyError race condition.
rayClusterSpec:
rayVersion: '2.9.3'
headGroupSpec:
rayStartParams: {}
template:
spec:
containers:
- name: ray-head
image: {{.Image}}
ports:
- containerPort: 6379
name: gcs-server
- containerPort: 8265
name: dashboard
- containerPort: 10001
name: client
resources:
limits:
memory: "4Gi"
requests:
cpu: "1"
memory: "4Gi"
workerGroupSpecs:
- replicas: 2
minReplicas: 1
maxReplicas: 5
groupName: worker-group
rayStartParams: {}
template:
spec:
containers:
- name: ray-worker
image: {{.Image}}
resources:
limits:
memory: "4Gi"
requests:
cpu: "1"
memory: "4Gi"
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,20 @@ apiVersion: ray.io/v1
kind: RayJob
metadata:
name: {{.Name}}
labels:
perf-test: ray-data-image-resize
spec:
entrypoint: python -c "import ray; ray.init(); print(ray.cluster_resources())"
shutdownAfterJobFinishes: true
entrypoint: python ray_data_image_resize.py
rayClusterSpec:
rayVersion: '2.9.3'
headGroupSpec:
rayStartParams:
dashboard-host: '0.0.0.0'
rayStartParams: {}
template:
spec:
containers:
- name: ray-head
image: rayproject/ray:2.9.3
image: {{.Image}}
ports:
- containerPort: 6379
name: gcs-server
Expand All @@ -24,22 +25,24 @@ spec:
name: client
resources:
limits:
cpu: "1"
memory: "10Gi"
requests:
cpu: "100m"
cpu: "2"
memory: "10Gi"
workerGroupSpecs:
- replicas: 1
- replicas: 2
minReplicas: 1
maxReplicas: 5
groupName: small-group
groupName: worker-group
rayStartParams: {}
template:
spec:
containers:
- name: ray-worker
image: rayproject/ray:2.9.3
image: {{.Image}}
resources:
limits:
cpu: "1"
memory: "4Gi"
requests:
cpu: "100m"
cpu: "2"
memory: "4Gi"
21 changes: 11 additions & 10 deletions benchmark/perf-tests/100-rayjob/results/junit.xml
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
<?xml version="1.0" encoding="UTF-8"?>
<testsuite name="ClusterLoaderV2" tests="0" failures="0" errors="0" time="284.804">
<testcase name="kuberay overall (100-rayjob/config.yaml)" classname="ClusterLoaderV2" time="284.80261448"></testcase>
<testcase name="kuberay: [step: 01] Start measurements [00] - PodStartupLatency" classname="ClusterLoaderV2" time="0.106826602"></testcase>
<testcase name="kuberay: [step: 01] Start measurements [01] - WaitForControlledPodsRunning" classname="ClusterLoaderV2" time="0.207335286"></testcase>
<testcase name="kuberay: [step: 01] Start measurements [02] - JobLifecycleLatency" classname="ClusterLoaderV2" time="0.106730692"></testcase>
<testcase name="kuberay: [step: 02] Creating RayJobs" classname="ClusterLoaderV2" time="1.059487968"></testcase>
<testcase name="kuberay: [step: 03] Wait for RayJobs complete [00] - WaitForRayJob" classname="ClusterLoaderV2" time="217.399873864"></testcase>
<testcase name="kuberay: [step: 04] Wait for pods to be running [00] - WaitForControlledPodsRunning" classname="ClusterLoaderV2" time="5.011879337"></testcase>
<testcase name="kuberay: [step: 05] Measure pod startup latency [00] - PodStartupLatency" classname="ClusterLoaderV2" time="0.04856601"></testcase>
<testcase name="kuberay: [step: 06] Measure job finished [00] - JobLifecycleLatency" classname="ClusterLoaderV2" time="1.001760404"></testcase>
<testsuite name="ClusterLoaderV2" tests="0" failures="0" errors="0" time="848.021">
<testcase name="kuberay overall (github.com/ray-project/kuberary/benchmark/perf-tests/100-rayjob/config.yaml)" classname="ClusterLoaderV2" time="712.77299477"/>
<testcase name="kuberay: [step: 01] Start measurements [00] - PodStartupLatency" classname="ClusterLoaderV2" time="0.103862542"/>
<testcase name="kuberay: [step: 01] Start measurements [01] - WaitForControlledPodsRunning" classname="ClusterLoaderV2" time="0.103965395"/>
<testcase name="kuberay: [step: 01] Start measurements [02] - JobLifecycleLatency" classname="ClusterLoaderV2" time="0.103900327"/>
<testcase name="kuberay: [step: 02] Creating RayJobs for PyTorch MNIST fine-tuning" classname="ClusterLoaderV2" time="1.057688327"/>
<testcase name="kuberay: [step: 03] Creating RayJobs for Ray Data Image Resizing" classname="ClusterLoaderV2" time="1.056577084"/>
<testcase name="kuberay: [step: 04] Wait for RayJobs complete [00] - WaitForRayJob" classname="ClusterLoaderV2" time="674.096711566"/>
<testcase name="kuberay: [step: 05] Wait for pods to be running [00] - WaitForControlledPodsRunning" classname="ClusterLoaderV2" time="5.014664289"/>
<testcase name="kuberay: [step: 06] Measure pod startup latency [00] - PodStartupLatency" classname="ClusterLoaderV2" time="0.291397836"/>
<testcase name="kuberay: [step: 07] Measure job finished [00] - JobLifecycleLatency" classname="ClusterLoaderV2" time="1.002516531"/>
</testsuite>
13 changes: 12 additions & 1 deletion benchmark/perf-tests/100-rayjob/wait-for-rayjobs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,18 @@ expect_succeeded=100
echo "waiting for $expect_succeeded RayJobs to be completed successfully"

while true; do
num_succeeded=$(kubectl get rayjob -A -o jsonpath='{range .items[*]}{.metadata.name} {.status.jobStatus}{"\n"}' | grep -c SUCCEEDED)
num_succeeded=$(kubectl get rayjob -A -l perf-test=rayjob-pytorch-mnist -o jsonpath='{range .items[*]}{.metadata.name} {.status.jobStatus}{"\n"}' | grep -c SUCCEEDED)
echo "$num_succeeded RayJobs completed..."

if [[ "$num_succeeded" == "$expect_succeeded" ]]; then
break;
fi

sleep 5
done

while true; do
num_succeeded=$(kubectl get rayjob -A -l perf-test=ray-data-image-resize -o jsonpath='{range .items[*]}{.metadata.name} {.status.jobStatus}{"\n"}' | grep -c SUCCEEDED)
echo "$num_succeeded RayJobs completed..."

if [[ "$num_succeeded" == "$expect_succeeded" ]]; then
Expand Down
5 changes: 5 additions & 0 deletions benchmark/perf-tests/images/ray-pytorch/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Image for the 100-RayJob perf tests: the stock Ray 2.9.3 image plus the
# Python packages and sample scripts that the RayJob entrypoints invoke
# (`python ray_train_pytorch_mnist.py`, `python ray_data_image_resize.py`).
FROM rayproject/ray:2.9.3

# Python dependencies installed on top of the base image (versions are not pinned).
RUN pip install torch torchvision numpy
# Download the sample scripts into the image's current working directory so the
# entrypoints can reference them by bare filename.
# NOTE(review): both URLs point at the `master` branch, so the script contents
# can change between image builds — confirm whether a pinned ref is wanted.
RUN wget https://raw.githubusercontent.com/ray-project/kuberay/master/ray-operator/config/samples/pytorch-mnist/ray_train_pytorch_mnist.py
RUN wget https://raw.githubusercontent.com/ray-project/kuberay/master/ray-operator/config/samples/ray-data-image-resize/ray_data_image_resize.py

0 comments on commit 2715544

Please sign in to comment.