From 8a413fa07cd9fda3cc1e926d2098596fbade4e05 Mon Sep 17 00:00:00 2001
From: limingzhe 00427178
Date: Mon, 1 Apr 2019 17:14:15 +0800
Subject: [PATCH] add tensorflow & openmpi example

---
Reviewer notes (free-form text placed between the "---" separator and the
first "diff --git" line; not part of the commit message or of either file):

* example/openmpi-hello.yaml declares 1 "mpimaster" replica and 2 "mpiworker"
  replicas with minAvailable: 3. The master builds a comma-separated host
  list from /etc/volcano/mpiworker.host, starts sshd, and runs mpiexec with
  -np 2, redirecting stdout to /home/re. The workers only run sshd with -D.
  The master's TaskCompleted event triggers CompleteJob.
* example/tensorflow-benchmark.yaml declares 2 "ps" and 3 "worker" replicas
  with minAvailable: 5. Both tasks build "host:2222" lists from
  /etc/volcano/ps.host and /etc/volcano/worker.host and start
  tf_cnn_benchmarks.py with --job_name set to the task's own role and
  --task_index=${VK_TASK_INDEX}. At job level PodEvicted maps to RestartJob
  and PodFailed to RestartTask; the worker's TaskCompleted event triggers
  CompleteJob.
* NOTE(review): the /etc/volcano/*.host files and VK_TASK_INDEX are
  presumably supplied by the job plugins listed under "plugins" -- confirm
  against the plugin documentation.
* NOTE(review): both images are pulled from the IP-addressed registry
  100.125.5.235:20202 with the "default-secret" pull secret -- confirm that
  registry and secret exist wherever these examples are meant to run.
* NOTE(review): line breaks and YAML indentation below were reconstructed
  from a whitespace-collapsed copy of this patch. Hunk line counts match the
  headers (55 and 62), but the "index" blob ids may not -- verify against the
  original commit before relying on them.

 example/openmpi-hello.yaml        | 55 +++++++++++++++++++++++++++
 example/tensorflow-benchmark.yaml | 62 +++++++++++++++++++++++++++++++
 2 files changed, 117 insertions(+)
 create mode 100644 example/openmpi-hello.yaml
 create mode 100644 example/tensorflow-benchmark.yaml

diff --git a/example/openmpi-hello.yaml b/example/openmpi-hello.yaml
new file mode 100644
index 0000000000..04522c2c77
--- /dev/null
+++ b/example/openmpi-hello.yaml
@@ -0,0 +1,55 @@
+apiVersion: batch.volcano.sh/v1alpha1
+kind: Job
+metadata:
+  name: openmpi-hello
+spec:
+  minAvailable: 3
+  schedulerName: kube-batch
+  plugins:
+    ssh: []
+    env: []
+  tasks:
+    - replicas: 1
+      name: mpimaster
+      policies:
+        - event: TaskCompleted
+          action: CompleteJob
+      template:
+        spec:
+          imagePullSecrets:
+            - name: default-secret
+          containers:
+            - command:
+                - /bin/sh
+                - -c
+                - |
+                  MPI_HOST=`cat /etc/volcano/mpiworker.host | tr "\n" ","`;
+                  mkdir -p /var/run/sshd; /usr/sbin/sshd;
+                  mpiexec --allow-run-as-root --host ${MPI_HOST} -np 2 mpi_hello_world > /home/re
+              image: 100.125.5.235:20202/l00427178/openmpi-hello:3.28
+              name: mpimaster
+              ports:
+                - containerPort: 22
+                  name: mpijob-port
+              workingDir: /home
+          restartPolicy: OnFailure
+    - replicas: 2
+      name: mpiworker
+      template:
+        spec:
+          imagePullSecrets:
+            - name: default-secret
+          containers:
+            - command:
+                - /bin/sh
+                - -c
+                - |
+                  mkdir -p /var/run/sshd; /usr/sbin/sshd -D;
+              image: 100.125.5.235:20202/l00427178/openmpi-hello:3.28
+              name: mpiworker
+              ports:
+                - containerPort: 22
+                  name: mpijob-port
+              workingDir: /home
+          restartPolicy: OnFailure
+
diff --git a/example/tensorflow-benchmark.yaml b/example/tensorflow-benchmark.yaml
new file mode 100644
index 0000000000..7d969c6efc
--- /dev/null
+++ b/example/tensorflow-benchmark.yaml
@@ -0,0 +1,62 @@
+apiVersion: batch.volcano.sh/v1alpha1
+kind: Job
+metadata:
+  name: tensorflow-benchmark
+spec:
+  minAvailable: 5
+  schedulerName: kube-batch
+  plugins:
+    env: []
+  policies:
+    - event: PodEvicted
+      action: RestartJob
+    - event: PodFailed
+      action: RestartTask
+  tasks:
+    - replicas: 2
+      name: ps
+      template:
+        spec:
+          imagePullSecrets:
+            - name: default-secret
+          containers:
+            - command:
+                - sh
+                - -c
+                - |
+                  PS_HOST=`cat /etc/volcano/ps.host | sed 's/$/&:2222/g' | tr "\n" ","`;
+                  WORKER_HOST=`cat /etc/volcano/worker.host | sed 's/$/&:2222/g' | tr "\n" ","`;
+                  python tf_cnn_benchmarks.py --batch_size=32 --model=resnet50 --variable_update=parameter_server --flush_stdout=true --num_gpus=1 --local_parameter_device=cpu --device=cpu --data_format=NHWC --job_name=ps --task_index=${VK_TASK_INDEX} --ps_hosts=${PS_HOST} --worker_hosts=${WORKER_HOST}
+              image: 100.125.5.235:20202/l00427178/tf-benchmarks-cpu:v20171202-bdab599-dirty-284af3
+              name: tensorflow
+              ports:
+                - containerPort: 2222
+                  name: tfjob-port
+              resources: {}
+              workingDir: /opt/tf-benchmarks/scripts/tf_cnn_benchmarks
+          restartPolicy: OnFailure
+    - replicas: 3
+      name: worker
+      policies:
+        - event: TaskCompleted
+          action: CompleteJob
+      template:
+        spec:
+          imagePullSecrets:
+            - name: default-secret
+          containers:
+            - command:
+                - sh
+                - -c
+                - |
+                  PS_HOST=`cat /etc/volcano/ps.host | sed 's/$/&:2222/g' | tr "\n" ","`;
+                  WORKER_HOST=`cat /etc/volcano/worker.host | sed 's/$/&:2222/g' | tr "\n" ","`;
+                  python tf_cnn_benchmarks.py --batch_size=32 --model=resnet50 --variable_update=parameter_server --flush_stdout=true --num_gpus=1 --local_parameter_device=cpu --device=cpu --data_format=NHWC --job_name=worker --task_index=${VK_TASK_INDEX} --ps_hosts=${PS_HOST} --worker_hosts=${WORKER_HOST}
+              image: 100.125.5.235:20202/l00427178/tf-benchmarks-cpu:v20171202-bdab599-dirty-284af3
+              name: tensorflow
+              ports:
+                - containerPort: 2222
+                  name: tfjob-port
+              resources: {}
+              workingDir: /opt/tf-benchmarks/scripts/tf_cnn_benchmarks
+          restartPolicy: OnFailure
\ No newline at end of file