From 1d52a1590442540358b96330b4c4d4b2a5fe8b81 Mon Sep 17 00:00:00 2001 From: Matt Schallert Date: Sun, 28 Apr 2019 14:39:13 -0400 Subject: [PATCH] [api] expand node affinity; support multiple reqs Previously we supported node affinity by either 1) using the default heuristic of the isolation group name being the zone, or 2) by specifying a single key and a set of values to match an m3db pod to a node. This limits users from specifying more complex terms such as "nodes in zone A with instance type 'large'". This change allows users to specify multiple node affinity terms, as well as specifying none at all if they have no need for node affinity. This is a breaking change as users must specify the zone key in their cluster configs to get the same zone affinity as before, and this will be called out in the release notes. Eventually we'll add the ability to specify entire affinity terms per isolation-group, but those can be extremely verbose and this still provides a nice shortcut for most use cases. --- README.md | 36 ++- docs/api.md | 15 +- docs/getting_started/create_cluster.md | 48 +++- example/m3db-cluster-minikube.yaml | 266 ------------------ .../m3db-cluster-per-zone-storageclass.yaml | 18 +- example/m3db-cluster.yaml | 28 +- example/simple-cluster.yaml | 24 +- integration/e2e/basic_test.go | 4 +- integration/manifests/cluster-regional.yaml | 18 +- integration/manifests/cluster-zonal.yaml | 24 +- pkg/apis/m3dboperator/v1alpha1/cluster.go | 26 +- .../v1alpha1/zz_generated.deepcopy.go | 31 +- pkg/controller/controller.go | 12 +- pkg/controller/controller_test.go | 25 +- pkg/controller/fixtures/cluster-3-zones.yaml | 24 +- pkg/controller/fixtures/cluster-simple.yaml | 22 +- pkg/k8sops/fixtures/testM3DBCluster.yaml | 12 + pkg/k8sops/generators.go | 11 +- pkg/k8sops/generators_test.go | 4 +- pkg/k8sops/statefulset.go | 46 +-- pkg/k8sops/statefulset_test.go | 105 +++++-- 21 files changed, 382 insertions(+), 417 deletions(-) delete mode 100644 example/m3db-cluster-minikube.yaml diff --git a/README.md b/README.md index 923c1222..b13a7171 100644 --- a/README.md +++ b/README.md @@ -80,18 +80,38 @@ spec: - http://etcd-1.etcd:2379 - http://etcd-2.etcd:2379 isolationGroups: - - name: - numInstances: 1 - - name: - numInstances: 1 - - name: - numInstances: 1 + - name: group1 + numInstances: 1 + nodeAffinityTerms: + - key: failure-domain.beta.kubernetes.io/zone + values: + - + - name: group2 + numInstances: 1 + nodeAffinityTerms: + - key: failure-domain.beta.kubernetes.io/zone + values: + - + - name: group3 + numInstances: 1 + nodeAffinityTerms: + - key: failure-domain.beta.kubernetes.io/zone + values: + - podIdentityConfig: - sources: - - PodUID + sources: [] namespaces: - name: metrics-10s:2d preset: 10s:2d + dataDirVolumeClaimTemplate: + metadata: + name: m3db-data + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 100Gi ``` ### Resizing a Cluster diff --git a/docs/api.md b/docs/api.md index cabcbe61..a283572c 100644 --- a/docs/api.md +++ b/docs/api.md @@ -10,6 +10,7 @@ This document enumerates the Custom Resource Definitions used by the M3DB Operat * [M3DBCluster](#m3dbcluster) * [M3DBClusterList](#m3dbclusterlist) * [M3DBStatus](#m3dbstatus) +* [NodeAffinityTerm](#nodeaffinityterm) * [IndexOptions](#indexoptions) * [Namespace](#namespace) * [NamespaceOptions](#namespaceoptions) @@ -63,8 +64,7 @@ IsolationGroup defines the name of zone as well attributes for the zone configur | Field | Description | Scheme | Required | | ----- | ----------- | ------ | -------- | | name | 
Name is the value that will be used in StatefulSet labels, pod labels, and M3DB placement \"isolationGroup\" fields. | string | true | -| nodeAffinityKey | NodeAffinityKey is the node label that will be used in corresponding StatefulSet match expression to assign pods to nodes. Defaults to \"failure-domain.beta.kubernetes.io/zone\". | string | false | -| nodeAffinityValues | NodeSelectorValues is the node label value that will be used to assign pods to nodes. Defaults to the isolation group's name, but can be overridden to allow multiple IsolationGroups to be assigned to the same zone. | []string | false | +| nodeAffinityTerms | NodeAffinityTerms is an array of NodeAffinityTerm requirements, which are ANDed together to indicate what nodes an isolation group can be assigned to. | [][NodeAffinityTerm](#nodeaffinityterm) | false | | numInstances | NumInstances defines the number of instances. | int32 | true | | storageClassName | StorageClassName is the name of the StorageClass to use for this isolation group. This allows ensuring that PVs will be created in the same zone as the pinned statefulset on Kubernetes < 1.12 (when topology aware volume scheduling was introduced). Only has effect if the clusters `dataDirVolumeClaimTemplate` is non-nil. If set, the volume claim template will have its storageClassName field overridden per-isolationgroup. If unset the storageClassName of the volumeClaimTemplate will be used. | string | false | @@ -107,6 +107,17 @@ M3DBStatus contains the current state the M3DB cluster along with a human readab [Back to TOC](#table-of-contents) +## NodeAffinityTerm + +NodeAffinityTerm represents a node label and a set of label values, any of which can be matched to assign a pod to a node. + +| Field | Description | Scheme | Required | +| ----- | ----------- | ------ | -------- | +| key | Key is the label of the node. | string | true | +| values | Values is an array of values, any of which a node can have for a pod to be assigned to it. | []string | true | + +[Back to TOC](#table-of-contents) + ## IndexOptions IndexOptions defines parameters for indexing. 
diff --git a/docs/getting_started/create_cluster.md b/docs/getting_started/create_cluster.md index 070f81cb..bc7278e1 100644 --- a/docs/getting_started/create_cluster.md +++ b/docs/getting_started/create_cluster.md @@ -47,12 +47,24 @@ spec: - http://etcd-1.etcd:2379 - http://etcd-2.etcd:2379 isolationGroups: - - name: us-east1-b - numInstances: 1 - - name: us-east1-c - numInstances: 1 - - name: us-east1-d - numInstances: 1 + - name: group1 + numInstances: 1 + nodeAffinityTerms: + - key: failure-domain.beta.kubernetes.io/zone + values: + - + - name: group2 + numInstances: 1 + nodeAffinityTerms: + - key: failure-domain.beta.kubernetes.io/zone + values: + - + - name: group3 + numInstances: 1 + nodeAffinityTerms: + - key: failure-domain.beta.kubernetes.io/zone + values: + - podIdentityConfig: sources: - PodUID @@ -114,12 +126,24 @@ spec: replicationFactor: 3 numberOfShards: 256 isolationGroups: - - name: us-east1-b - numInstances: 1 - - name: us-east1-c - numInstances: 1 - - name: us-east1-d - numInstances: 1 + - name: group1 + numInstances: 1 + nodeAffinityTerms: + - key: failure-domain.beta.kubernetes.io/zone + values: + - + - name: group2 + numInstances: 1 + nodeAffinityTerms: + - key: failure-domain.beta.kubernetes.io/zone + values: + - + - name: group3 + numInstances: 1 + nodeAffinityTerms: + - key: failure-domain.beta.kubernetes.io/zone + values: + - etcdEndpoints: - http://etcd-0.etcd:2379 - http://etcd-1.etcd:2379 diff --git a/example/m3db-cluster-minikube.yaml b/example/m3db-cluster-minikube.yaml deleted file mode 100644 index 658d763f..00000000 --- a/example/m3db-cluster-minikube.yaml +++ /dev/null @@ -1,266 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: m3-configuration -data: - m3.yml: |+ - coordinator: - listenAddress: - type: "config" - value: "0.0.0.0:7201" - metrics: - scope: - prefix: "coordinator" - prometheus: - handlerPath: /metrics - listenAddress: 0.0.0.0:7203 - sanitization: prometheus - samplingRate: 1.0 - extended: none - - db: - logging: - level: info - - metrics: - prometheus: - handlerPath: /metrics - sanitization: prometheus - samplingRate: 1.0 - extended: detailed - - listenAddress: 0.0.0.0:9000 - clusterListenAddress: 0.0.0.0:9001 - httpNodeListenAddress: 0.0.0.0:9002 - httpClusterListenAddress: 0.0.0.0:9003 - debugListenAddress: 0.0.0.0:9004 - - hostID: - resolver: hostname - - client: - writeConsistencyLevel: majority - readConsistencyLevel: unstrict_majority - writeTimeout: 10s - fetchTimeout: 15s - connectTimeout: 20s - writeRetry: - initialBackoff: 500ms - backoffFactor: 3 - maxRetries: 2 - jitter: true - fetchRetry: - initialBackoff: 500ms - backoffFactor: 2 - maxRetries: 3 - jitter: true - backgroundHealthCheckFailLimit: 4 - backgroundHealthCheckFailThrottleFactor: 0.5 - - gcPercentage: 100 - - writeNewSeriesAsync: true - writeNewSeriesLimitPerSecond: 1048576 - writeNewSeriesBackoffDuration: 2ms - - bootstrap: - bootstrappers: - - filesystem - - commitlog - - peers - - uninitialized_topology - fs: - numProcessorsPerCPU: 0.125 - - commitlog: - flushMaxBytes: 524288 - flushEvery: 1s - queue: - calculationType: fixed - size: 2097152 - blockSize: 10m - - fs: - filePathPrefix: /var/lib/m3db - writeBufferSize: 65536 - dataReadBufferSize: 65536 - infoReadBufferSize: 128 - seekReadBufferSize: 4096 - throughputLimitMbps: 100.0 - throughputCheckEvery: 128 - - repair: - enabled: false - interval: 2h - offset: 30m - jitter: 1h - throttle: 2m - checkInterval: 1m - - pooling: - blockAllocSize: 16 - type: simple - seriesPool: - size: 262144 - lowWatermark: 
0.7 - highWatermark: 1.0 - blockPool: - size: 262144 - lowWatermark: 0.7 - highWatermark: 1.0 - encoderPool: - size: 262144 - lowWatermark: 0.7 - highWatermark: 1.0 - closersPool: - size: 104857 - lowWatermark: 0.7 - highWatermark: 1.0 - contextPool: - size: 262144 - lowWatermark: 0.7 - highWatermark: 1.0 - segmentReaderPool: - size: 16384 - lowWatermark: 0.7 - highWatermark: 1.0 - iteratorPool: - size: 2048 - lowWatermark: 0.7 - highWatermark: 1.0 - fetchBlockMetadataResultsPool: - size: 65536 - capacity: 32 - lowWatermark: 0.7 - highWatermark: 1.0 - fetchBlocksMetadataResultsPool: - size: 32 - capacity: 4096 - lowWatermark: 0.7 - highWatermark: 1.0 - hostBlockMetadataSlicePool: - size: 131072 - capacity: 3 - lowWatermark: 0.7 - highWatermark: 1.0 - blockMetadataPool: - size: 65536 - lowWatermark: 0.7 - highWatermark: 1.0 - blockMetadataSlicePool: - size: 65536 - capacity: 32 - lowWatermark: 0.7 - highWatermark: 1.0 - blocksMetadataPool: - size: 65536 - lowWatermark: 0.7 - highWatermark: 1.0 - blocksMetadataSlicePool: - size: 32 - capacity: 4096 - lowWatermark: 0.7 - highWatermark: 1.0 - identifierPool: - size: 262144 - lowWatermark: 0.7 - highWatermark: 1.0 - bytesPool: - buckets: - - capacity: 16 - size: 524288 - lowWatermark: 0.7 - highWatermark: 1.0 - - capacity: 32 - size: 262144 - lowWatermark: 0.7 - highWatermark: 1.0 - - capacity: 64 - size: 131072 - lowWatermark: 0.7 - highWatermark: 1.0 - - capacity: 128 - size: 65536 - lowWatermark: 0.7 - highWatermark: 1.0 - - capacity: 256 - size: 65536 - lowWatermark: 0.7 - highWatermark: 1.0 - - capacity: 1440 - size: 16384 - lowWatermark: 0.7 - highWatermark: 1.0 - - capacity: 4096 - size: 8192 - lowWatermark: 0.7 - highWatermark: 1.0 - config: - service: - env: default_env - zone: embedded - service: m3db - cacheDir: /var/lib/m3kv - etcdClusters: - - zone: embedded - endpoints: - - http://etcd-0.etcd:2379 - - http://etcd-1.etcd:2379 - - http://etcd-2.etcd:2379 ---- -apiVersion: operator.m3db.io/v1alpha1 -kind: M3DBCluster -metadata: - name: m3db-cluster -spec: - image: quay.io/m3db/m3dbnode:latest - replicationFactor: 1 - numberOfShards: 8 - isolationGroups: - - name: us-west1-a - numInstances: 1 - - name: us-west1-b - numInstances: 1 - serviceConfigurations: - - name: m3dbnode - labels: - - name: app - value: m3dbnode - selectors: - - name: app - value: m3dbnode - ports: - - number: 9000 - name: client - - number: 9001 - name: cluster - - number: 9002 - name: http-node - - number: 9003 - name: http-cluster - - number: 9004 - name: debug - - number: 7201 - name: coordinator - - number: 7203 - name: metrics - - name: m3coordinator - labels: - - name: app - value: m3coordinator - selectors: - - name: app - value: m3dbnode # allow m3coordinator to communicate with the db - ports: - - number: 7201 - name: coordinator - - number: 7203 - name: metrics - clusterIP: true - resources: - requests: - memory: 1Gi - cpu: "250m" - limits: - cpu: "250m" - memory: 2Gi diff --git a/example/m3db-cluster-per-zone-storageclass.yaml b/example/m3db-cluster-per-zone-storageclass.yaml index c40c6457..313c5955 100644 --- a/example/m3db-cluster-per-zone-storageclass.yaml +++ b/example/m3db-cluster-per-zone-storageclass.yaml @@ -35,15 +35,27 @@ spec: replicationFactor: 3 numberOfShards: 1024 isolationGroups: - - name: us-east1-b + - name: zone-b numInstances: 1 storageClassName: fast-east1-b - - name: us-east1-c + nodeAffinityTerms: + - key: failure-domain.beta.kubernetes.io/zone + values: + - us-east1-b + - name: zone-c numInstances: 1 storageClassName: 
fast-east1-c - - name: us-east1-d + nodeAffinityTerms: + - key: failure-domain.beta.kubernetes.io/zone + values: + - us-east1-c + - name: zone-d numInstances: 1 storageClassName: fast-east1-d + nodeAffinityTerms: + - key: failure-domain.beta.kubernetes.io/zone + values: + - us-east1-d etcdEndpoints: - http://etcd-0.etcd:2379 - http://etcd-1.etcd:2379 diff --git a/example/m3db-cluster.yaml b/example/m3db-cluster.yaml index 09b3481e..c8a9858e 100644 --- a/example/m3db-cluster.yaml +++ b/example/m3db-cluster.yaml @@ -7,12 +7,24 @@ spec: replicationFactor: 3 numberOfShards: 256 isolationGroups: - - name: us-east1-b - numInstances: 1 - - name: us-east1-c - numInstances: 1 - - name: us-east1-d - numInstances: 1 + - name: group1 + numInstances: 1 + nodeAffinityTerms: + - key: failure-domain.beta.kubernetes.io/zone + values: + - us-east1-b + - name: group2 + numInstances: 1 + nodeAffinityTerms: + - key: failure-domain.beta.kubernetes.io/zone + values: + - us-east1-c + - name: group3 + numInstances: 1 + nodeAffinityTerms: + - key: failure-domain.beta.kubernetes.io/zone + values: + - us-east1-d namespaces: - name: metrics-10s:2d preset: 10s:2d @@ -25,5 +37,5 @@ spec: memory: 4Gi cpu: '1' limits: - memory: 12Gi - cpu: '4' + memory: 32Gi + cpu: '8' diff --git a/example/simple-cluster.yaml b/example/simple-cluster.yaml index 4e8bf787..46c63b83 100644 --- a/example/simple-cluster.yaml +++ b/example/simple-cluster.yaml @@ -11,12 +11,24 @@ spec: - http://etcd-1.etcd:2379 - http://etcd-2.etcd:2379 isolationGroups: - - name: us-east1-b - numInstances: 1 - - name: us-east1-c - numInstances: 1 - - name: us-east1-d - numInstances: 1 + - name: group1 + numInstances: 1 + nodeAffinityTerms: + - key: failure-domain.beta.kubernetes.io/zone + values: + - us-east1-b + - name: group2 + numInstances: 1 + nodeAffinityTerms: + - key: failure-domain.beta.kubernetes.io/zone + values: + - us-east1-c + - name: group3 + numInstances: 1 + nodeAffinityTerms: + - key: failure-domain.beta.kubernetes.io/zone + values: + - us-east1-d namespaces: - name: metrics-10s:2d preset: 10s:2d diff --git a/integration/e2e/basic_test.go b/integration/e2e/basic_test.go index c4106b41..089ba75c 100644 --- a/integration/e2e/basic_test.go +++ b/integration/e2e/basic_test.go @@ -72,7 +72,7 @@ func TestCreateCluster_Regional(t *testing.T) { } sort.Strings(isoGroups) - assert.Equal(t, []string{"us-east1-b", "us-east1-c", "us-east1-d"}, isoGroups) + assert.Equal(t, []string{"group1", "group2", "group3"}, isoGroups) return true, nil }) @@ -111,7 +111,7 @@ func TestCreateCluster_Zonal(t *testing.T) { } sort.Strings(isoGroups) - assert.Equal(t, []string{"rep0", "rep1", "rep2"}, isoGroups) + assert.Equal(t, []string{"group1", "group2", "group3"}, isoGroups) return true, nil }) diff --git a/integration/manifests/cluster-regional.yaml b/integration/manifests/cluster-regional.yaml index 5fc8f426..ffb78e82 100644 --- a/integration/manifests/cluster-regional.yaml +++ b/integration/manifests/cluster-regional.yaml @@ -11,12 +11,24 @@ spec: - http://etcd-1.etcd:2379 - http://etcd-2.etcd:2379 isolationGroups: - - name: us-east1-b + - name: group1 numInstances: 1 - - name: us-east1-c + nodeAffinityTerms: + - key: failure-domain.beta.kubernetes.io/zone + values: + - us-east1-b + - name: group2 numInstances: 1 - - name: us-east1-d + nodeAffinityTerms: + - key: failure-domain.beta.kubernetes.io/zone + values: + - us-east1-c + - name: group3 numInstances: 1 + nodeAffinityTerms: + - key: failure-domain.beta.kubernetes.io/zone + values: + - us-east1-d namespaces: - name: 
metrics-10s:2d preset: 10s:2d diff --git a/integration/manifests/cluster-zonal.yaml b/integration/manifests/cluster-zonal.yaml index 2fb187b2..e7b405be 100644 --- a/integration/manifests/cluster-zonal.yaml +++ b/integration/manifests/cluster-zonal.yaml @@ -11,18 +11,24 @@ spec: - http://etcd-1.etcd:2379 - http://etcd-2.etcd:2379 isolationGroups: - - name: rep0 + - name: group1 numInstances: 1 - nodeAffinityValues: - - us-east1-b - - name: rep1 + nodeAffinityTerms: + - key: failure-domain.beta.kubernetes.io/zone + values: + - us-east1-b + - name: group2 numInstances: 1 - nodeAffinityValues: - - us-east1-b - - name: rep2 + nodeAffinityTerms: + - key: failure-domain.beta.kubernetes.io/zone + values: + - us-east1-b + - name: group3 numInstances: 1 - nodeAffinityValues: - - us-east1-b + nodeAffinityTerms: + - key: failure-domain.beta.kubernetes.io/zone + values: + - us-east1-b namespaces: - name: metrics-10s:2d preset: 10s:2d diff --git a/pkg/apis/m3dboperator/v1alpha1/cluster.go b/pkg/apis/m3dboperator/v1alpha1/cluster.go index d5ffb0fc..13f88a7a 100644 --- a/pkg/apis/m3dboperator/v1alpha1/cluster.go +++ b/pkg/apis/m3dboperator/v1alpha1/cluster.go @@ -233,23 +233,27 @@ type ClusterSpec struct { PriorityClassName string `json:"priorityClassName,omitempty"` } +// NodeAffinityTerm represents a node label and a set of label values, any of +// which can be matched to assign a pod to a node. +type NodeAffinityTerm struct { + // Key is the label of the node. + Key string `json:"key"` + + // Values is an array of values, any of which a node can have for a pod to be + // assigned to it. + Values []string `json:"values"` +} + // IsolationGroup defines the name of zone as well attributes for the zone configuration type IsolationGroup struct { // Name is the value that will be used in StatefulSet labels, pod labels, and // M3DB placement "isolationGroup" fields. Name string `json:"name"` - // NodeAffinityKey is the node label that will be used in corresponding - // StatefulSet match expression to assign pods to nodes. Defaults to - // "failure-domain.beta.kubernetes.io/zone". - // +optional - NodeAffinityKey string `json:"nodeAffinityKey,omitempty"` - - // NodeSelectorValues is the node label value that will be used to assign pods - // to nodes. Defaults to the isolation group's name, but can be overridden to - // allow multiple IsolationGroups to be assigned to the same zone. - // +optional - NodeAffinityValues []string `json:"nodeAffinityValues,omitempty"` + // NodeAffinityTerms is an array of NodeAffinityTerm requirements, which are + // ANDed together to indicate what nodes an isolation group can be assigned + // to. + NodeAffinityTerms []NodeAffinityTerm `json:"nodeAffinityTerms,omitempty"` // NumInstances defines the number of instances. NumInstances int32 `json:"numInstances"` diff --git a/pkg/apis/m3dboperator/v1alpha1/zz_generated.deepcopy.go b/pkg/apis/m3dboperator/v1alpha1/zz_generated.deepcopy.go index 0545b68c..7f0f5e31 100644 --- a/pkg/apis/m3dboperator/v1alpha1/zz_generated.deepcopy.go +++ b/pkg/apis/m3dboperator/v1alpha1/zz_generated.deepcopy.go @@ -139,10 +139,12 @@ func (in *IndexOptions) DeepCopy() *IndexOptions { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *IsolationGroup) DeepCopyInto(out *IsolationGroup) { *out = *in - if in.NodeAffinityValues != nil { - in, out := &in.NodeAffinityValues, &out.NodeAffinityValues - *out = make([]string, len(*in)) - copy(*out, *in) + if in.NodeAffinityTerms != nil { + in, out := &in.NodeAffinityTerms, &out.NodeAffinityTerms + *out = make([]NodeAffinityTerm, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } } return } @@ -300,6 +302,27 @@ func (in *NamespaceOptions) DeepCopy() *NamespaceOptions { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *NodeAffinityTerm) DeepCopyInto(out *NodeAffinityTerm) { + *out = *in + if in.Values != nil { + in, out := &in.Values, &out.Values + *out = make([]string, len(*in)) + copy(*out, *in) + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeAffinityTerm. +func (in *NodeAffinityTerm) DeepCopy() *NodeAffinityTerm { + if in == nil { + return nil + } + out := new(NodeAffinityTerm) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *PodIdentity) DeepCopyInto(out *PodIdentity) { *out = *in diff --git a/pkg/controller/controller.go b/pkg/controller/controller.go index 71189dbc..5bb25eab 100644 --- a/pkg/controller/controller.go +++ b/pkg/controller/controller.go @@ -54,6 +54,7 @@ import ( "k8s.io/client-go/util/workqueue" "github.com/kubernetes/utils/pointer" + pkgerrors "github.com/pkg/errors" "github.com/uber-go/tally" "go.uber.org/zap" ) @@ -65,9 +66,8 @@ const ( ) var ( - errOrphanedPod = errors.New("pod does not belong to an m3db cluster") - errInvalidNumIsoGroups = errors.New("must specify number of isolationgroups equal to replication factor") - errEmptyNodeAffinityVals = errors.New("NodeAffinityValues cannot be empty if NodeAffinityKey set") + errOrphanedPod = errors.New("pod does not belong to an m3db cluster") + errInvalidNumIsoGroups = errors.New("must specify number of isolationgroups equal to replication factor") ) // Controller object @@ -782,16 +782,12 @@ func validateIsolationGroups(cluster *myspec.M3DBCluster) error { groups := cluster.Spec.IsolationGroups if cluster.Spec.ReplicationFactor != int32(len(groups)) { - return errInvalidNumIsoGroups + return pkgerrors.WithMessagef(errInvalidNumIsoGroups, "replication factor is %d but number of isogroups is %d", cluster.Spec.ReplicationFactor, len(groups)) } names := make(map[string]struct{}, len(groups)) for _, g := range groups { names[g.Name] = struct{}{} - - if g.NodeAffinityKey != "" && len(g.NodeAffinityValues) == 0 { - return errEmptyNodeAffinityVals - } } if len(names) != len(groups) { diff --git a/pkg/controller/controller_test.go b/pkg/controller/controller_test.go index ac56c71b..cffe3ef3 100644 --- a/pkg/controller/controller_test.go +++ b/pkg/controller/controller_test.go @@ -98,7 +98,7 @@ func TestGetChildStatefulSets(t *testing.T) { cluster: newMeta("cluster1", map[string]string{"foo": "bar"}), sets: []*metav1.ObjectMeta{ newMeta("set1", map[string]string{ - "foo": "bar", + "foo": "bar", "operator.m3db.io/app": "m3db", "operator.m3db.io/cluster": "cluster1", }), @@ -109,7 +109,7 @@ func TestGetChildStatefulSets(t *testing.T) { cluster: newMeta("cluster1", map[string]string{"foo": "bar"}), sets: []*metav1.ObjectMeta{ newMeta("set1", map[string]string{ - "foo": "bar", + "foo": "bar", "operator.m3db.io/app": "m3db", 
"operator.m3db.io/cluster": "cluster2", }), @@ -398,27 +398,6 @@ func TestValidateIsolationGroups(t *testing.T) { }, doExpErr: true, }, - { - rf: 1, - groups: []myspec.IsolationGroup{ - { - Name: "foo", - NodeAffinityKey: "key", - }, - }, - expErr: errEmptyNodeAffinityVals, - doExpErr: true, - }, - { - rf: 1, - groups: []myspec.IsolationGroup{ - { - Name: "foo", - NodeAffinityKey: "key", - NodeAffinityValues: []string{"v1"}, - }, - }, - }, } for _, test := range tests { diff --git a/pkg/controller/fixtures/cluster-3-zones.yaml b/pkg/controller/fixtures/cluster-3-zones.yaml index 84e60654..165867c6 100644 --- a/pkg/controller/fixtures/cluster-3-zones.yaml +++ b/pkg/controller/fixtures/cluster-3-zones.yaml @@ -9,12 +9,24 @@ spec: replicationFactor: 3 numberOfShards: 8 isolationGroups: - - name: us-fake1-a - numInstances: 3 - - name: us-fake1-b - numInstances: 3 - - name: us-fake1-c - numInstances: 3 + - name: us-fake1-a + numInstances: 3 + nodeAffinityTerms: + - key: failure-domain.beta.kubernetes.io/zone + values: + - us-fake1-a + - name: us-fake1-b + numInstances: 3 + nodeAffinityTerms: + - key: failure-domain.beta.kubernetes.io/zone + values: + - us-fake1-b + - name: us-fake1-c + numInstances: 3 + nodeAffinityTerms: + - key: failure-domain.beta.kubernetes.io/zone + values: + - us-fake1-c namespaces: - name: metrics-10s:2d preset: 10s:2d diff --git a/pkg/controller/fixtures/cluster-simple.yaml b/pkg/controller/fixtures/cluster-simple.yaml index 7fc636d8..694f8b59 100644 --- a/pkg/controller/fixtures/cluster-simple.yaml +++ b/pkg/controller/fixtures/cluster-simple.yaml @@ -6,11 +6,27 @@ metadata: namespace: fake spec: image: fake.fake/fake/m3dbnode:latest - replicationFactor: 2 + replicationFactor: 3 numberOfShards: 8 isolationGroups: - - name: us-fake1-a - numInstances: 1 + - name: group1 + numInstances: 1 + nodeAffinityTerms: + - key: failure-domain.beta.kubernetes.io/zone + values: + - us-fake1-b + - name: group2 + numInstances: 1 + nodeAffinityTerms: + - key: failure-domain.beta.kubernetes.io/zone + values: + - us-fake1-c + - name: group3 + numInstances: 1 + nodeAffinityTerms: + - key: failure-domain.beta.kubernetes.io/zone + values: + - us-fake1-d namespaces: - name: metrics-10s:2d preset: 10s:2d diff --git a/pkg/k8sops/fixtures/testM3DBCluster.yaml b/pkg/k8sops/fixtures/testM3DBCluster.yaml index 04280ab0..dcc68c45 100644 --- a/pkg/k8sops/fixtures/testM3DBCluster.yaml +++ b/pkg/k8sops/fixtures/testM3DBCluster.yaml @@ -11,10 +11,22 @@ spec: isolationGroups: - name: us-fake1-a numInstances: 1 + nodeAffinityTerms: + - key: zone + values: + - zone-a - name: us-fake1-b numInstances: 1 + nodeAffinityTerms: + - key: zone + values: + - zone-a - name: us-fake1-c numInstances: 1 + nodeAffinityTerms: + - key: zone + values: + - zone-a namespaces: - name: metrics-10s:2d preset: 10s:2d diff --git a/pkg/k8sops/generators.go b/pkg/k8sops/generators.go index 82f7d91e..fde658e9 100644 --- a/pkg/k8sops/generators.go +++ b/pkg/k8sops/generators.go @@ -34,6 +34,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "github.com/kubernetes/utils/pointer" + pkgerrors "github.com/pkg/errors" ) const ( @@ -110,8 +111,8 @@ func GenerateStatefulSet( isolationGroupName string, instanceAmount int32, ) (*appsv1.StatefulSet, error) { - // TODO(schallert): always sort zones alphabetically. + // TODO(schallert): always sort zones alphabetically. 
stsID := -1 var isolationGroup myspec.IsolationGroup for i, g := range cluster.Spec.IsolationGroups { @@ -126,16 +127,20 @@ func GenerateStatefulSet( return nil, fmt.Errorf("could not find isogroup '%s' in spec", isolationGroupName) } + clusterSpec := cluster.Spec clusterName := cluster.GetName() ssName := StatefulSetName(clusterName, stsID) - clusterSpec := cluster.Spec + affinity, err := GenerateStatefulSetAffinity(isolationGroup) + if err != nil { + return nil, pkgerrors.Wrap(err, "error generating statefulset affinity") + } statefulSet := NewBaseStatefulSet(ssName, isolationGroupName, cluster, instanceAmount) m3dbContainer := &statefulSet.Spec.Template.Spec.Containers[0] m3dbContainer.Resources = clusterSpec.ContainerResources m3dbContainer.Ports = generateContainerPorts() - statefulSet.Spec.Template.Spec.Affinity = GenerateStatefulSetAffinity(isolationGroup) + statefulSet.Spec.Template.Spec.Affinity = affinity statefulSet.Spec.Template.Spec.Tolerations = cluster.Spec.Tolerations // Set owner ref so sts will be GC'd when the cluster is deleted diff --git a/pkg/k8sops/generators_test.go b/pkg/k8sops/generators_test.go index ce28ac6b..e6904f7e 100644 --- a/pkg/k8sops/generators_test.go +++ b/pkg/k8sops/generators_test.go @@ -143,9 +143,9 @@ func TestGenerateStatefulSet(t *testing.T) { { MatchExpressions: []v1.NodeSelectorRequirement{ { - Key: FailureDomainZoneKey, + Key: "zone", Operator: v1.NodeSelectorOpIn, - Values: []string{isolationGroup}, + Values: []string{"zone-a"}, }, }, }, diff --git a/pkg/k8sops/statefulset.go b/pkg/k8sops/statefulset.go index 4ae93ff6..79612f95 100644 --- a/pkg/k8sops/statefulset.go +++ b/pkg/k8sops/statefulset.go @@ -21,6 +21,7 @@ package k8sops import ( + "errors" errorz "errors" "fmt" "strings" @@ -48,6 +49,11 @@ const ( podIdentityVolumeName = "pod-identity" ) +var ( + errEmptyNodeAffinityKey = errors.New("node affinity term key cannot be empty") + errEmptyNodeAffinityValues = errors.New("node affinity term values cannot be empty") +) + // MultiLabelSelector provides a ListOptions with a LabelSelector // given a map of strings func (k *k8sops) MultiLabelSelector(kvs map[string]string) metav1.ListOptions { @@ -335,15 +341,26 @@ func generateDownwardAPIVolumeMount() v1.VolumeMount { } // GenerateStatefulSetAffinity generates a node affinity requiring a strict match for -// given key and value. -func GenerateStatefulSetAffinity(isoGroup myspec.IsolationGroup) *v1.Affinity { - affinityKey := FailureDomainZoneKey - if k := isoGroup.NodeAffinityKey; k != "" { - affinityKey = k +// given key and values. 
+func GenerateStatefulSetAffinity(isoGroup myspec.IsolationGroup) (*v1.Affinity, error) { + if len(isoGroup.NodeAffinityTerms) == 0 { + return nil, nil } - affinityValues := []string{isoGroup.Name} - if len(isoGroup.NodeAffinityValues) > 0 { - affinityValues = isoGroup.NodeAffinityValues + + expressions := make([]v1.NodeSelectorRequirement, len(isoGroup.NodeAffinityTerms)) + for i, term := range isoGroup.NodeAffinityTerms { + if term.Key == "" { + return nil, errEmptyNodeAffinityKey + } + if len(term.Values) == 0 { + return nil, errEmptyNodeAffinityValues + } + + expressions[i] = v1.NodeSelectorRequirement{ + Key: term.Key, + Operator: v1.NodeSelectorOpIn, + Values: term.Values, + } } return &v1.Affinity{ @@ -351,18 +368,12 @@ func GenerateStatefulSetAffinity(isoGroup myspec.IsolationGroup) *v1.Affinity { RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{ NodeSelectorTerms: []v1.NodeSelectorTerm{ { - MatchExpressions: []v1.NodeSelectorRequirement{ - { - Key: affinityKey, - Operator: v1.NodeSelectorOpIn, - Values: affinityValues, - }, - }, + MatchExpressions: expressions, }, }, }, }, - } + }, nil } // GenerateOwnerRef generates an owner reference to a given m3db cluster. @@ -370,7 +381,6 @@ func GenerateOwnerRef(cluster *myspec.M3DBCluster) *metav1.OwnerReference { return metav1.NewControllerRef(cluster, schema.GroupVersionKind{ Group: myspec.SchemeGroupVersion.Group, Version: myspec.SchemeGroupVersion.Version, - // TODO(schallert): use a const here - Kind: "m3dbcluster", + Kind: "m3dbcluster", }) } diff --git a/pkg/k8sops/statefulset_test.go b/pkg/k8sops/statefulset_test.go index 3f2e1f58..ea8955a0 100644 --- a/pkg/k8sops/statefulset_test.go +++ b/pkg/k8sops/statefulset_test.go @@ -81,45 +81,110 @@ func TestGenerateDownwardAPIVolumePath(t *testing.T) { } func TestGenerateStatefulSetAffinity(t *testing.T) { + type expTerm struct { + key string + values []string + } tests := []struct { - isoGroup myspec.IsolationGroup - expKey string - expValues []string + isoGroup myspec.IsolationGroup + expErr error + expTerms []expTerm }{ { isoGroup: myspec.IsolationGroup{ Name: "group1", }, - expKey: FailureDomainZoneKey, - expValues: []string{"group1"}, }, { isoGroup: myspec.IsolationGroup{ - Name: "group2", - NodeAffinityKey: "foobar", + Name: "group2", + NodeAffinityTerms: []myspec.NodeAffinityTerm{ + { + Key: "foobar", + Values: []string{"group2"}, + }, + }, + }, + expTerms: []expTerm{ + { + key: "foobar", + values: []string{"group2"}, + }, + }, + }, + { + isoGroup: myspec.IsolationGroup{ + Name: "zone-and-inst-type", + NodeAffinityTerms: []myspec.NodeAffinityTerm{ + { + Key: "zone", + Values: []string{"zone-a"}, + }, + { + Key: "instance-type", + Values: []string{"large"}, + }, + }, + }, + expTerms: []expTerm{ + { + key: "zone", + values: []string{"zone-a"}, + }, + { + key: "instance-type", + values: []string{"large"}, + }, + }, + }, + { + isoGroup: myspec.IsolationGroup{ + Name: "group3", + NodeAffinityTerms: []myspec.NodeAffinityTerm{ + { + Key: "foobar", + }, + }, }, - expKey: "foobar", - expValues: []string{"group2"}, + expErr: errEmptyNodeAffinityValues, }, { isoGroup: myspec.IsolationGroup{ - Name: "group3", - NodeAffinityKey: "foobar", - NodeAffinityValues: []string{"baz", "bar"}, + Name: "group4", + NodeAffinityTerms: []myspec.NodeAffinityTerm{ + { + Values: []string{"group2"}, + }, + }, }, - expKey: "foobar", - expValues: []string{"baz", "bar"}, + expErr: errEmptyNodeAffinityKey, }, } for _, test := range tests { - affinity := GenerateStatefulSetAffinity(test.isoGroup) + 
affinity, err := GenerateStatefulSetAffinity(test.isoGroup) + if test.expErr != nil { + assert.Equal(t, test.expErr, err) + continue + } + + if len(test.expTerms) == 0 { + assert.Nil(t, affinity) + continue + } + terms := affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms assert.Len(t, terms, 1) - exprs := terms[0].MatchExpressions - assert.Len(t, exprs, 1) - expr := exprs[0] - assert.Equal(t, test.expKey, expr.Key) - assert.Equal(t, test.expValues, expr.Values) + + expTerms := make([]corev1.NodeSelectorRequirement, len(test.expTerms)) + for i, term := range test.expTerms { + expTerms[i] = corev1.NodeSelectorRequirement{ + Key: term.key, + Operator: corev1.NodeSelectorOpIn, + Values: term.values, + } + } + + assert.Equal(t, expTerms, terms[0].MatchExpressions) } }
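
For reference (not part of the patch), this is roughly the pod affinity the updated `GenerateStatefulSetAffinity` would render for a two-term isolation group such as the hypothetical zone-plus-instance-type sketch earlier: all expressions land in a single `nodeSelectorTerms` entry, so they are ANDed, and each term uses the `In` operator over its values. When an isolation group specifies no terms, the function returns a nil affinity and the pod spec is left without node affinity.

```yaml
# Sketch of the generated StatefulSet pod affinity for a two-term group.
affinity:
  nodeAffinity:
    requiredDuringSchedulingIgnoredDuringExecution:
      nodeSelectorTerms:
        - matchExpressions:              # single term => expressions are ANDed
            - key: failure-domain.beta.kubernetes.io/zone
              operator: In               # any listed value satisfies this expression
              values:
                - us-east1-b
            - key: beta.kubernetes.io/instance-type   # assumed node label from the earlier sketch
              operator: In
              values:
                - large
```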