Skip to content

Commit

Permalink
Merge pull request #3874 from Monokaix/network-topo
Browse files Browse the repository at this point in the history
Network topology scheduling implementations of volcano scheduler
  • Loading branch information
volcano-sh-bot authored Dec 26, 2024
2 parents 087d990 + c01ead5 commit abbcd96
Show file tree
Hide file tree
Showing 31 changed files with 1,707 additions and 120 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ generate-code:
manifests: controller-gen
go mod vendor
# volcano crd base
$(CONTROLLER_GEN) $(CRD_OPTIONS) paths="./vendor/volcano.sh/apis/pkg/apis/scheduling/v1beta1;./vendor/volcano.sh/apis/pkg/apis/batch/v1alpha1;./vendor/volcano.sh/apis/pkg/apis/bus/v1alpha1;./vendor/volcano.sh/apis/pkg/apis/nodeinfo/v1alpha1" output:crd:artifacts:config=config/crd/volcano/bases
$(CONTROLLER_GEN) $(CRD_OPTIONS) paths="./vendor/volcano.sh/apis/pkg/apis/scheduling/v1beta1;./vendor/volcano.sh/apis/pkg/apis/batch/v1alpha1;./vendor/volcano.sh/apis/pkg/apis/bus/v1alpha1;./vendor/volcano.sh/apis/pkg/apis/nodeinfo/v1alpha1;./vendor/volcano.sh/apis/pkg/apis/topology/v1alpha1" output:crd:artifacts:config=config/crd/volcano/bases
# generate volcano job crd yaml without description to avoid yaml size limit when using `kubectl apply`
$(CONTROLLER_GEN) $(CRD_OPTIONS_EXCLUDE_DESCRIPTION) paths="./vendor/volcano.sh/apis/pkg/apis/batch/v1alpha1" output:crd:artifacts:config=config/crd/volcano/bases
# jobflow crd base
Expand Down
12 changes: 12 additions & 0 deletions config/crd/jobflow/bases/flow.volcano.sh_jobtemplates.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,18 @@ spec:
format: int32
minimum: 1
type: integer
networkTopology:
properties:
highestTierAllowed:
default: 1
type: integer
mode:
default: hard
enum:
- hard
- soft
type: string
type: object
plugins:
additionalProperties:
items:
Expand Down
12 changes: 12 additions & 0 deletions config/crd/volcano/bases/batch.volcano.sh_jobs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,18 @@ spec:
format: int32
minimum: 1
type: integer
networkTopology:
properties:
highestTierAllowed:
default: 1
type: integer
mode:
default: hard
enum:
- hard
- soft
type: string
type: object
plugins:
additionalProperties:
items:
Expand Down
18 changes: 18 additions & 0 deletions config/crd/volcano/bases/scheduling.volcano.sh_podgroups.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,24 @@ spec:
if there's not enough resources to start each task, the scheduler
will not start anyone.
type: object
networkTopology:
description: NetworkTopology defines the NetworkTopology config, this
field works in conjunction with network topology feature and hyperNode
CRD.
properties:
highestTierAllowed:
default: 1
description: HighestTierAllowed specifies the highest tier that
a job allowed to cross when scheduling.
type: integer
mode:
default: hard
description: Mode specifies the mode of the network topology constrain.
enum:
- hard
- soft
type: string
type: object
priorityClassName:
description: |-
If specified, indicates the PodGroup's priority. "system-node-critical" and
Expand Down
177 changes: 177 additions & 0 deletions config/crd/volcano/bases/topology.volcano.sh_hypernodes.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
---
# CustomResourceDefinition for the cluster-scoped HyperNode resource
# (group topology.volcano.sh, version v1alpha1).
#
# This manifest is controller-gen output (see the version annotation below).
# Hand edits are overwritten on regeneration, so any schema or validation
# change made here must also be applied to the kubebuilder markers on the
# topology/v1alpha1 API types.
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
  annotations:
    controller-gen.kubebuilder.io/version: v0.16.4
  name: hypernodes.topology.volcano.sh
spec:
  group: topology.volcano.sh
  names:
    kind: HyperNode
    listKind: HyperNodeList
    plural: hypernodes
    shortNames:
    - hn
    singular: hypernode
  scope: Cluster
  versions:
  - additionalPrinterColumns:
    - jsonPath: .spec.tier
      name: Tier
      type: string
    - jsonPath: .status.nodeCount
      name: NodeCount
      type: integer
    - jsonPath: .metadata.creationTimestamp
      name: Age
      type: date
    name: v1alpha1
    schema:
      openAPIV3Schema:
        description: HyperNode represents a collection of nodes sharing similar network
          topology or performance characteristics.
        properties:
          apiVersion:
            description: |-
              APIVersion defines the versioned schema of this representation of an object.
              Servers should convert recognized schemas to the latest internal value, and
              may reject unrecognized values.
              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
            type: string
          kind:
            description: |-
              Kind is a string value representing the REST resource this object represents.
              Servers may infer this from the endpoint the client submits requests to.
              Cannot be updated.
              In CamelCase.
              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
            type: string
          metadata:
            type: object
          spec:
            description: Spec defines the desired configuration of the HyperNode.
            properties:
              members:
                description: Members defines a list of node groups or individual nodes
                  included in the HyperNode.
                items:
                  description: MemberSpec represents a specific node or a hyperNodes
                    in the hyperNode.
                  properties:
                    selector:
                      description: Selector defines the selection rules for this member.
                      properties:
                        exactMatch:
                          description: ExactMatch defines the exact match criteria
                            (required when Type is "Exact").
                          properties:
                            name:
                              description: Name specifies the exact name of the node
                                to match.
                              type: string
                          type: object
                        regexMatch:
                          description: RegexMatch defines the regex match criteria
                            (required when Type is "Regex").
                          properties:
                            pattern:
                              description: Pattern defines the regex pattern to match
                                node names.
                              type: string
                          type: object
                      type: object
                      # exactMatch and regexMatch are both optional, so presence
                      # is tested with has(): reading an unset field directly
                      # (e.g. `self.regexMatch != null`) makes the CEL rule fail
                      # with an evaluation error instead of yielding true/false.
                      # Together the two rules require exactly one matcher.
                      x-kubernetes-validations:
                      - message: Either ExactMatch or RegexMatch must be specified
                        rule: has(self.exactMatch) || has(self.regexMatch)
                      - message: ExactMatch and RegexMatch cannot be specified together
                        rule: '!(has(self.exactMatch) && has(self.regexMatch))'
                    type:
                      description: Type specifies the member type.
                      enum:
                      - Node
                      - HyperNode
                      type: string
                  required:
                  - type
                  type: object
                type: array
              tier:
                description: Tier categorizes the performance level of the HyperNode.
                type: string
            required:
            - tier
            type: object
          status:
            description: Status provides the current state of the HyperNode.
            properties:
              conditions:
                description: Conditions provide details about the current state of
                  the HyperNode.
                items:
                  description: Condition contains details for one aspect of the current
                    state of this API Resource.
                  properties:
                    lastTransitionTime:
                      description: |-
                        lastTransitionTime is the last time the condition transitioned from one status to another.
                        This should be when the underlying condition changed.  If that is not known, then using the time when the API field changed is acceptable.
                      format: date-time
                      type: string
                    message:
                      description: |-
                        message is a human readable message indicating details about the transition.
                        This may be an empty string.
                      maxLength: 32768
                      type: string
                    observedGeneration:
                      description: |-
                        observedGeneration represents the .metadata.generation that the condition was set based upon.
                        For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date
                        with respect to the current state of the instance.
                      format: int64
                      minimum: 0
                      type: integer
                    reason:
                      description: |-
                        reason contains a programmatic identifier indicating the reason for the condition's last transition.
                        Producers of specific condition types may define expected values and meanings for this field,
                        and whether the values are considered a guaranteed API.
                        The value should be a CamelCase string.
                        This field may not be empty.
                      maxLength: 1024
                      minLength: 1
                      pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$
                      type: string
                    status:
                      description: status of the condition, one of True, False, Unknown.
                      enum:
                      - "True"
                      - "False"
                      - Unknown
                      type: string
                    type:
                      description: type of condition in CamelCase or in foo.example.com/CamelCase.
                      maxLength: 316
                      pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$
                      type: string
                  required:
                  - lastTransitionTime
                  - message
                  - reason
                  - status
                  - type
                  type: object
                type: array
              nodeCount:
                description: NodeCount is the total number of nodes currently in the
                  HyperNode.
                format: int64
                minimum: 0
                type: integer
            type: object
        type: object
    served: true
    storage: true
    subresources:
      status: {}
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ require (
sigs.k8s.io/controller-runtime v0.13.0
sigs.k8s.io/yaml v1.4.0
stathat.com/c/consistent v1.0.0
volcano.sh/apis v1.10.0-alpha.0.0.20241016111016-bb93758bd51f
volcano.sh/apis v1.10.0-alpha.0.0.20241218081838-e5d361b6bfbe
)

require (
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -510,5 +510,5 @@ sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E=
sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY=
stathat.com/c/consistent v1.0.0 h1:ezyc51EGcRPJUxfHGSgJjWzJdj3NiMU9pNfLNGiXV0c=
stathat.com/c/consistent v1.0.0/go.mod h1:QkzMWzcbB+yQBL2AttO6sgsQS/JSTapcDISJalmCDS0=
volcano.sh/apis v1.10.0-alpha.0.0.20241016111016-bb93758bd51f h1:wqvGQgzYCPJSS07xE1LZbJ/Mxb1f/xFWThnII6BzMhg=
volcano.sh/apis v1.10.0-alpha.0.0.20241016111016-bb93758bd51f/go.mod h1:XHIjTlHDMZTLRg2Y2JAkj85iP0iiet2tv+HfPQZrsHs=
volcano.sh/apis v1.10.0-alpha.0.0.20241218081838-e5d361b6bfbe h1:iHd1Xt36a7S47IFksuF0h9W9J4LKzhBEz0C9XbkBvB8=
volcano.sh/apis v1.10.0-alpha.0.0.20241218081838-e5d361b6bfbe/go.mod h1:XHIjTlHDMZTLRg2Y2JAkj85iP0iiet2tv+HfPQZrsHs=
2 changes: 2 additions & 0 deletions hack/generate-yaml.sh
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ tail -n +2 ${VOLCANO_CRD_DIR}/bases/bus.volcano.sh_commands.yaml > ${HELM_VOLCAN
tail -n +2 ${VOLCANO_CRD_DIR}/bases/scheduling.volcano.sh_podgroups.yaml > ${HELM_VOLCANO_CRD_DIR}/bases/scheduling.volcano.sh_podgroups.yaml
tail -n +2 ${VOLCANO_CRD_DIR}/bases/scheduling.volcano.sh_queues.yaml > ${HELM_VOLCANO_CRD_DIR}/bases/scheduling.volcano.sh_queues.yaml
tail -n +2 ${VOLCANO_CRD_DIR}/bases/nodeinfo.volcano.sh_numatopologies.yaml > ${HELM_VOLCANO_CRD_DIR}/bases/nodeinfo.volcano.sh_numatopologies.yaml
tail -n +2 ${VOLCANO_CRD_DIR}/bases/topology.volcano.sh_hypernodes.yaml > ${HELM_VOLCANO_CRD_DIR}/bases/topology.volcano.sh_hypernodes.yaml

# sync jobflow bases
tail -n +2 ${JOBFLOW_CRD_DIR}/bases/flow.volcano.sh_jobflows.yaml > ${HELM_JOBFLOW_CRD_DIR}/bases/flow.volcano.sh_jobflows.yaml
Expand Down Expand Up @@ -136,6 +137,7 @@ ${HELM_BIN_DIR}/helm template ${VK_ROOT}/installer/helm/chart/volcano --namespac
-s templates/scheduling_v1beta1_podgroup.yaml \
-s templates/scheduling_v1beta1_queue.yaml \
-s templates/nodeinfo_v1alpha1_numatopologies.yaml \
-s templates/topology_v1alpha1_hypernodes.yaml \
-s templates/webhooks.yaml \
>> ${DEPLOYMENT_FILE}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,18 @@ spec:
format: int32
minimum: 1
type: integer
networkTopology:
properties:
highestTierAllowed:
default: 1
type: integer
mode:
default: hard
enum:
- hard
- soft
type: string
type: object
plugins:
additionalProperties:
items:
Expand Down
12 changes: 12 additions & 0 deletions installer/helm/chart/volcano/crd/bases/batch.volcano.sh_jobs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,18 @@ spec:
format: int32
minimum: 1
type: integer
networkTopology:
properties:
highestTierAllowed:
default: 1
type: integer
mode:
default: hard
enum:
- hard
- soft
type: string
type: object
plugins:
additionalProperties:
items:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,24 @@ spec:
if there's not enough resources to start each task, the scheduler
will not start anyone.
type: object
networkTopology:
description: NetworkTopology defines the NetworkTopology config, this
field works in conjunction with network topology feature and hyperNode
CRD.
properties:
highestTierAllowed:
default: 1
description: HighestTierAllowed specifies the highest tier that
a job allowed to cross when scheduling.
type: integer
mode:
default: hard
description: Mode specifies the mode of the network topology constrain.
enum:
- hard
- soft
type: string
type: object
priorityClassName:
description: |-
If specified, indicates the PodGroup's priority. "system-node-critical" and
Expand Down
Loading

0 comments on commit abbcd96

Please sign in to comment.