Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat: add support for distributed serving type #1187

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions charts/distributed-serving/.helmignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
# OS metadata files
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*~
# Various IDEs
.project
.idea/
*.tmproj
3 changes: 3 additions & 0 deletions charts/distributed-serving/CHANGLOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
### 0.1.0

* init distributed-serving chart
5 changes: 5 additions & 0 deletions charts/distributed-serving/Chart.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Helm chart metadata for the distributed-serving chart.
apiVersion: v1
# appVersion is quoted so YAML does not parse it as the float 1.0.
appVersion: "1.0"
description: A Helm chart for distributed-serving
name: distributed-serving
version: 0.1.0
32 changes: 32 additions & 0 deletions charts/distributed-serving/templates/_helpers.tpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
{{/* vim: set filetype=mustache: */}}
{{/*
Expand the name of the chart.
Defaults to .Chart.Name unless overridden via .Values.nameOverride.
Truncated to 63 chars (Kubernetes DNS label limit); a trailing "-"
left over from truncation is stripped.
*/}}
{{- define "distributed-serving.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}}
{{- end -}}

{{/*
Create a default fully qualified app name.
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
If release name contains chart name it will be used as a full name.
An explicit .Values.fullnameOverride, when set, wins over both.
*/}}
{{- define "distributed-serving.fullname" -}}
{{- if .Values.fullnameOverride -}}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}}
{{- else -}}
{{- $name := default .Chart.Name .Values.nameOverride -}}
{{- if contains $name .Release.Name -}}
{{- .Release.Name | trunc 63 | trimSuffix "-" -}}
{{- else -}}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}}
{{- end -}}
{{- end -}}
{{- end -}}

{{/*
Create chart name and version as used by the chart label.
"+" is replaced with "_" because label values may not contain "+".
*/}}
{{- define "distributed-serving.chart" -}}
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}}
{{- end -}}
63 changes: 63 additions & 0 deletions charts/distributed-serving/templates/configmap.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
{{- /*
ConfigMap for the distributed-serving chart. Provides, per serving
replica:
  * hostfile-<replica>: one DNS name per pod (master pod first, then
    each of .Values.workers worker pods).
  * master.rayInit / worker.rayInit: bash bootstrap scripts that start
    a Ray head / worker node, wait for the cluster to form, then exec
    the user-supplied command passed as $1.
*/ -}}
{{- $releaseName := .Release.Name }}
{{- $namespace := .Release.Namespace }}
{{- $workerNum := .Values.workers -}}
apiVersion: v1
kind: ConfigMap
metadata:
  name: {{ $releaseName }}-cm
  labels:
    app: {{ template "distributed-serving.name" $ }}
    chart: {{ template "distributed-serving.chart" $ }}
    release: {{ $releaseName }}
    heritage: {{ .Release.Service }}
    createdBy: "DistributedServing"
data:
  {{- range $replica := until (int .Values.replicas) }}
  hostfile-{{ $replica }}: |-
    {{ $releaseName }}.{{ $releaseName }}-{{ $replica }}.{{ $namespace }}
    {{- range $i := until (int $workerNum) }}
    {{ $releaseName }}.{{ $releaseName }}-{{ $replica }}-{{ $i }}.{{ $namespace }}
    {{- end }}
  {{- end }}
  master.rayInit: |-
    #!/bin/bash

    ray_port=6379
    ray_init_timeout=300
    # Expected total node count (head + workers), injected by the pod spec.
    ray_cluster_size=$WORLD_SIZE
    master_command=$1

    # Start the Ray head node; workers join at $MASTER_ADDR:$ray_port.
    ray start --head --port=$ray_port

    # Poll every 5s until all nodes have joined, or give up after
    # $ray_init_timeout seconds.
    for (( i=0; i < $ray_init_timeout; i+=5 )); do
      # Default to 0 if the probe fails/prints nothing, so the numeric
      # test below cannot abort with "unary operator expected".
      active_nodes=$(python3 -c 'import ray; ray.init(); print(sum(node["Alive"] for node in ray.nodes()))')
      if [ "${active_nodes:-0}" -eq "$ray_cluster_size" ]; then
        echo "All ray workers are active and the ray cluster is initialized successfully."
        # Intentionally unquoted: the command string is word-split into
        # a program and its arguments.
        $master_command
        exit 0
      fi
      echo "Wait for all ray workers to be active. $active_nodes/$ray_cluster_size is active"
      sleep 5
    done
    echo "Waiting for all ray workers to be active timed out."
    exit 1
  worker.rayInit: |-
    #!/bin/bash

    ray_port=6379
    ray_init_timeout=300
    # Head-node address, injected by the pod spec.
    ray_address=$MASTER_ADDR
    worker_command=$1

    # Retry joining the head node every 5s until it is reachable, or
    # give up after $ray_init_timeout seconds.
    for (( i=0; i < $ray_init_timeout; i+=5 )); do
      if ray start --address="$ray_address:$ray_port"; then
        echo "Worker: Ray runtime started with head address $ray_address:$ray_port"
        # Intentionally unquoted: the command string is word-split into
        # a program and its arguments.
        $worker_command
        exit 0
      fi
      echo "Waiting until the ray worker is active..."
      sleep 5
    done
    echo "Ray worker starts timeout, head address: $ray_address:$ray_port"
    exit 1
Loading