feat: Added new option enableProvidedByTopology (#780)
We are reintroducing a feature originally present in v2.10.0 to prevent
pods from getting stuck in the `pending` state in clusters with
non-cloud nodes. This feature is now optional and can be enabled via the
Helm Chart. By default, it remains disabled to avoid compatibility
issues with Nomad clusters, which have a different CSI spec
implementation.

Learn more about it in #400.
lukasmetzner authored Nov 19, 2024
1 parent a9d8505 commit 7e72431
Showing 15 changed files with 115 additions and 26 deletions.
4 changes: 4 additions & 0 deletions chart/templates/controller/deployment.yaml
@@ -167,6 +167,10 @@ spec:
key: {{ .Values.controller.hcloudToken.existingSecret.key }}
{{- end }}
{{- end }}
{{- if .Values.global.enableProvidedByTopology}}
- name: ENABLE_PROVIDED_BY_TOPOLOGY
value: "t"
{{ end }}
{{- if .Values.controller.extraEnvVars }}
{{- include "common.tplvalues.render" (dict "value" .Values.controller.extraEnvVars "context" $) | nindent 12 }}
{{- end }}
8 changes: 8 additions & 0 deletions chart/templates/core/storageclass.yaml
@@ -1,3 +1,4 @@
{{ $global := .Values.global }}
{{- if .Values.storageClasses }}
{{- range $key, $val := .Values.storageClasses }}
kind: StorageClass
@@ -10,6 +11,13 @@ provisioner: csi.hetzner.cloud
volumeBindingMode: WaitForFirstConsumer
allowVolumeExpansion: true
reclaimPolicy: {{ $val.reclaimPolicy | quote }}
{{- if $global.enableProvidedByTopology }}
allowedTopologies:
- matchLabelExpressions:
- key: instance.hetzner.cloud/provided-by
values:
- "cloud"
{{- end}}
---
{{- end }}
{{- end }}
4 changes: 4 additions & 0 deletions chart/templates/node/daemonset.yaml
@@ -121,6 +121,10 @@ spec:
{{- end }}
- name: ENABLE_METRICS
value: {{if .Values.metrics.enabled}}"true"{{ else }}"false"{{end}}
{{- if .Values.global.enableProvidedByTopology}}
- name: ENABLE_PROVIDED_BY_TOPOLOGY
value: "t"
{{ end }}
{{- if .Values.node.extraEnvVars }}
{{- include "common.tplvalues.render" (dict "value" .Values.node.extraEnvVars "context" $) | nindent 12 }}
{{- end }}
9 changes: 9 additions & 0 deletions chart/values.yaml
@@ -11,6 +11,14 @@ global:
## - myRegistryKeySecretName
##
imagePullSecrets: []
## @param global.enableProvidedByTopology Enables a workaround for an upstream Kubernetes issue where nodes without the CSI plugin are still considered for scheduling.
## ref: https://github.com/kubernetes-csi/external-provisioner/issues/544
## Warning: Once enabled, this feature cannot be easily disabled.
## It automatically adds required nodeAffinities to each volume and the topology keys to `csinode` objects.
## If the feature is later disabled, the topology keys are removed from the `csinode` objects, leaving volumes with required affinities that cannot be satisfied.
## Note: After enabling this feature, the workaround for the Kubernetes upstream issue only works on newly created volumes, as old volumes are not updated with the required node affinity.
##
enableProvidedByTopology: false

## @section Common parameters
##
@@ -648,6 +656,7 @@ node:
## kubeletDir: /var/lib/k0s/kubelet
## For microk8s:
## kubeletDir: /var/snap/microk8s/common/var/lib/kubelet
##
kubeletDir: /var/lib/kubelet

## @section Other Parameters
4 changes: 4 additions & 0 deletions cmd/aio/main.go
@@ -63,10 +63,13 @@ func main() {
volumeResizeService := volumes.NewLinuxResizeService(logger.With("component", "linux-resize-service"))
volumeStatsService := volumes.NewLinuxStatsService(logger.With("component", "linux-stats-service"))

enableProvidedByTopology := app.GetEnableProvidedByTopology()

nodeService := driver.NewNodeService(
logger.With("component", "driver-node-service"),
strconv.FormatInt(serverID, 10),
serverLocation,
enableProvidedByTopology,
volumeMountService,
volumeResizeService,
volumeStatsService,
@@ -84,6 +87,7 @@
logger.With("component", "driver-controller-service"),
volumeService,
serverLocation,
enableProvidedByTopology,
)

// common
3 changes: 3 additions & 0 deletions cmd/controller/main.go
@@ -54,6 +54,8 @@ func main() {
location = server.Datacenter.Location.Name
}

enableProvidedByTopology := app.GetEnableProvidedByTopology()

volumeService := volumes.NewIdempotentService(
logger.With("component", "idempotent-volume-service"),
api.NewVolumeService(
@@ -65,6 +67,7 @@
logger.With("component", "driver-controller-service"),
volumeService,
location,
enableProvidedByTopology,
)

identityService := driver.NewIdentityService(
3 changes: 3 additions & 0 deletions cmd/node/main.go
@@ -53,10 +53,13 @@ func main() {
volumeStatsService := volumes.NewLinuxStatsService(logger.With("component", "linux-stats-service"))
identityService := driver.NewIdentityService(logger.With("component", "driver-identity-service"))

enableProvidedByTopology := app.GetEnableProvidedByTopology()

nodeService := driver.NewNodeService(
logger.With("component", "driver-node-service"),
strconv.FormatInt(serverID, 10),
serverLocation,
enableProvidedByTopology,
volumeMountService,
volumeResizeService,
volumeStatsService,
22 changes: 19 additions & 3 deletions docs/kubernetes/README.md
@@ -209,9 +209,7 @@ $ kubectl apply -f https://raw.githubusercontent.com/hetznercloud/csi-driver/v2.

## Integration with Root Servers

Root servers can be part of the cluster, but the CSI plugin doesn't work there.

Labels are needed to indicate whether a node is a cloud VM or a dedicated server from Robot. If you are using the `hcloud-cloud-controller-manager` version 1.21.0 or later, these labels are added automatically. Otherwise, you will need to label the nodes manually.
Root servers can be part of the cluster, but the CSI plugin doesn't work there. To ensure proper topology evaluation, labels are needed to indicate whether a node is a cloud VM or a dedicated server from Robot. If you are using the `hcloud-cloud-controller-manager` version 1.21.0 or later, these labels are added automatically. Otherwise, you will need to label the nodes manually.

### Adding labels manually

@@ -240,6 +238,24 @@ kubectl label nodes <node name> instance.hetzner.cloud/is-root-server=false
kubectl label nodes <node name> instance.hetzner.cloud/is-root-server=true
```

### Pods stuck in pending

The current behavior of the scheduler can cause Pods to be stuck in `Pending` when using the integration with Robot servers.

To address this behavior, you can set `enableProvidedByTopology` to `true` in the Helm Chart configuration. This setting prevents pods from being scheduled on nodes where Hetzner volumes are unavailable, specifically Robot servers. Enabling this option adds the `instance.hetzner.cloud/provided-by` label to the [allowed topologies](https://kubernetes.io/docs/concepts/storage/storage-classes/#allowed-topologies) section of the storage classes that are created. Additionally, the label is included in the `topologyKeys` section of `csinode` objects, and a node affinity is set up for each persistent volume. This workaround does not work with the [old label](#deprecated-old-label).

> [!WARNING]
> Once enabled, this feature cannot be easily disabled. It automatically adds required nodeAffinities to each volume and the topology keys to `csinode` objects. If the feature is later disabled, the topology keys are removed from the `csinode` objects, leaving volumes with required affinities that cannot be satisfied.

> [!NOTE]
> After enabling this feature, the workaround for the Kubernetes upstream issue only works on newly created volumes, as old volumes are not updated with the required node affinity.

```yaml
global:
  enableProvidedByTopology: true
```

Further information on the upstream issue can be found [here](https://github.com/kubernetes-csi/external-provisioner/issues/544).
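
A minimal client-go sketch for checking that the topology key was registered on a node, assuming in-cluster configuration, RBAC that allows reading `csinode` objects, and the node name as the first command-line argument (this code is illustrative and not part of the driver):

```go
package main

import (
	"context"
	"fmt"
	"os"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/rest"
)

func main() {
	cfg, err := rest.InClusterConfig()
	if err != nil {
		panic(err)
	}
	client, err := kubernetes.NewForConfig(cfg)
	if err != nil {
		panic(err)
	}

	// Fetch the CSINode object for the node name given on the command line.
	csiNode, err := client.StorageV1().CSINodes().Get(context.Background(), os.Args[1], metav1.GetOptions{})
	if err != nil {
		panic(err)
	}
	for _, drv := range csiNode.Spec.Drivers {
		if drv.Name == "csi.hetzner.cloud" {
			// With enableProvidedByTopology enabled, this list should contain
			// "instance.hetzner.cloud/provided-by" on cloud nodes.
			fmt.Println(drv.TopologyKeys)
		}
	}
}
```

The same information is available with `kubectl get csinode <node name> -o yaml`.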

## Versioning policy

We aim to support the latest three versions of Kubernetes. When a Kubernetes
9 changes: 9 additions & 0 deletions internal/app/app.go
@@ -49,6 +49,15 @@ func CreateLogger() *slog.Logger {
return logger
}

// GetEnableProvidedByTopology parses the ENABLE_PROVIDED_BY_TOPOLOGY environment variable and returns false by default.
func GetEnableProvidedByTopology() bool {
var enableProvidedByTopology bool
if featFlag, exists := os.LookupEnv("ENABLE_PROVIDED_BY_TOPOLOGY"); exists {
enableProvidedByTopology, _ = strconv.ParseBool(featFlag)
}
return enableProvidedByTopology
}

// CreateListener creates and binds the unix socket in location specified by the CSI_ENDPOINT environment variable.
func CreateListener() (net.Listener, error) {
endpoint := os.Getenv("CSI_ENDPOINT")
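
The Helm templates above set `ENABLE_PROVIDED_BY_TOPOLOGY` to `"t"`, which is one of the literals `strconv.ParseBool` accepts as true. A standalone sketch of the parsing behavior, including the silent fall-back to `false` when the value cannot be parsed (the error is discarded above):

```go
package main

import (
	"fmt"
	"strconv"
)

func main() {
	// strconv.ParseBool accepts "1", "t", "T", "TRUE", "true", "True",
	// "0", "f", "F", "FALSE", "false", "False"; anything else is an error.
	for _, v := range []string{"t", "true", "1", "f", "yes"} {
		b, err := strconv.ParseBool(v)
		// GetEnableProvidedByTopology ignores the error, so an unparseable
		// value such as "yes" effectively leaves the feature disabled.
		fmt.Printf("%-4s -> %v (err: %v)\n", v, b, err)
	}
}
```
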
31 changes: 20 additions & 11 deletions internal/driver/controller.go
@@ -18,20 +18,23 @@ import (
type ControllerService struct {
proto.UnimplementedControllerServer

logger *slog.Logger
volumeService volumes.Service
location string
logger *slog.Logger
volumeService volumes.Service
location string
enableProvidedByTopology bool
}

func NewControllerService(
logger *slog.Logger,
volumeService volumes.Service,
location string,
enableProvidedByTopology bool,
) *ControllerService {
return &ControllerService{
logger: logger,
volumeService: volumeService,
location: location,
logger: logger,
volumeService: volumeService,
location: location,
enableProvidedByTopology: enableProvidedByTopology,
}
}

@@ -88,16 +91,22 @@ func (s *ControllerService) CreateVolume(ctx context.Context, req *proto.CreateV
"volume-name", volume.Name,
)

topology := &proto.Topology{
Segments: map[string]string{
TopologySegmentLocation: volume.Location,
},
}

if s.enableProvidedByTopology {
topology.Segments[ProvidedByLabel] = "cloud"
}

resp := &proto.CreateVolumeResponse{
Volume: &proto.Volume{
VolumeId: strconv.FormatInt(volume.ID, 10),
CapacityBytes: volume.SizeBytes(),
AccessibleTopology: []*proto.Topology{
{
Segments: map[string]string{
TopologySegmentLocation: volume.Location,
},
},
topology,
},
VolumeContext: map[string]string{
"fsFormatOptions": req.Parameters["fsFormatOptions"],
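
For reference, a standalone sketch (with a hypothetical location `"fsn1"`) of the `AccessibleTopology` segments reported by `CreateVolume` with the flag off and on; `NodeGetInfo` in `internal/driver/node.go` adds the same `provided-by` segment on the node side:

```go
package main

import "fmt"

// segments mirrors the controller logic: the location segment is always
// reported, and the provided-by segment is added only when the flag is set.
func segments(enableProvidedByTopology bool, location string) map[string]string {
	s := map[string]string{"csi.hetzner.cloud/location": location}
	if enableProvidedByTopology {
		s["instance.hetzner.cloud/provided-by"] = "cloud"
	}
	return s
}

func main() {
	fmt.Println(segments(false, "fsn1")) // map[csi.hetzner.cloud/location:fsn1]
	fmt.Println(segments(true, "fsn1"))  // location plus instance.hetzner.cloud/provided-by:cloud
}
```
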
8 changes: 8 additions & 0 deletions internal/driver/controller_test.go
@@ -33,6 +33,7 @@ func newControllerServiceTestEnv() *controllerServiceTestEnv {
logger,
volumeService,
"testloc",
false,
),
volumeService: volumeService,
}
@@ -41,6 +42,8 @@
func TestControllerServiceCreateVolume(t *testing.T) {
env := newControllerServiceTestEnv()

env.service.enableProvidedByTopology = true

env.volumeService.CreateFunc = func(ctx context.Context, opts volumes.CreateOpts) (*csi.Volume, error) {
if opts.Name != "testvol" {
t.Errorf("unexpected name passed to volume service: %s", opts.Name)
@@ -94,6 +97,11 @@ func TestControllerServiceCreateVolume(t *testing.T) {
if loc := top.Segments[TopologySegmentLocation]; loc != "testloc" {
t.Errorf("unexpected location segment in topology: %s", loc)
}
if env.service.enableProvidedByTopology {
if provider := top.Segments[ProvidedByLabel]; provider != "cloud" {
t.Errorf("unexpected provider segment in topology: %s", provider)
}
}
} else {
t.Errorf("unexpected number of topologies: %d", len(resp.Volume.AccessibleTopology))
}
1 change: 1 addition & 0 deletions internal/driver/driver.go
@@ -9,4 +9,5 @@ const (
DefaultVolumeSize = MinVolumeSize

TopologySegmentLocation = PluginName + "/location"
ProvidedByLabel = "instance.hetzner.cloud/provided-by"
)
32 changes: 20 additions & 12 deletions internal/driver/node.go
@@ -15,29 +15,32 @@ import (
type NodeService struct {
proto.UnimplementedNodeServer

logger *slog.Logger
serverID string
serverLocation string
volumeMountService volumes.MountService
volumeResizeService volumes.ResizeService
volumeStatsService volumes.StatsService
logger *slog.Logger
serverID string
serverLocation string
enableProvidedByTopology bool
volumeMountService volumes.MountService
volumeResizeService volumes.ResizeService
volumeStatsService volumes.StatsService
}

func NewNodeService(
logger *slog.Logger,
serverID string,
serverLocation string,
enableProvidedByTopology bool,
volumeMountService volumes.MountService,
volumeResizeService volumes.ResizeService,
volumeStatsService volumes.StatsService,
) *NodeService {
return &NodeService{
logger: logger,
serverID: serverID,
serverLocation: serverLocation,
volumeMountService: volumeMountService,
volumeResizeService: volumeResizeService,
volumeStatsService: volumeStatsService,
logger: logger,
serverID: serverID,
serverLocation: serverLocation,
enableProvidedByTopology: enableProvidedByTopology,
volumeMountService: volumeMountService,
volumeResizeService: volumeResizeService,
volumeStatsService: volumeStatsService,
}
}

@@ -181,6 +184,11 @@ func (s *NodeService) NodeGetInfo(_ context.Context, _ *proto.NodeGetInfoRequest
},
},
}

if s.enableProvidedByTopology {
resp.AccessibleTopology.Segments[ProvidedByLabel] = "cloud"
}

return resp, nil
}

1 change: 1 addition & 0 deletions internal/driver/node_test.go
@@ -35,6 +35,7 @@ func newNodeServerTestEnv() nodeServiceTestEnv {
testutil.NewNopLogger(),
"1",
"loc",
false,
volumeMountService,
volumeResizeService,
volumeStatsService,
2 changes: 2 additions & 0 deletions internal/driver/sanity_test.go
@@ -46,6 +46,7 @@ func TestSanity(t *testing.T) {
logger.With("component", "driver-controller-service"),
volumeService,
"testloc",
false,
)

identityService := NewIdentityService(
@@ -56,6 +57,7 @@
logger.With("component", "driver-node-service"),
"123456",
"loc",
false,
volumeMountService,
volumeResizeService,
volumeStatsService,
