diff --git a/docs/reference/cluster_manifest.md b/docs/reference/cluster_manifest.md index 75de35097..efc850aff 100644 --- a/docs/reference/cluster_manifest.md +++ b/docs/reference/cluster_manifest.md @@ -96,7 +96,19 @@ Those are parameters grouped directly under the `spec` key in the manifest. that should be assigned to the cluster pods. When not specified, the value is taken from the `pod_priority_class_name` operator parameter, if not set then the default priority class is taken. The priority class itself must be defined in advance. - + +* **enableShmVolume** + Start a database pod without limitations on shm memory. By default Docker + limits `/dev/shm` to `64M` (see e.g. the [docker + issue](https://github.com/docker-library/postgres/issues/416)), which might + not be enough if PostgreSQL uses parallel workers heavily. If this option is + present and its value is `true`, a new tmpfs volume will be mounted to the + target database pod to remove this limitation. If it's not present, the decision + about mounting a volume will be made based on the operator configuration + (`enable_shm_volume`, which is `true` by default). If it's present and its value + is `false`, then no volume will be mounted no matter how the operator was + configured (so you can override the operator configuration). + ## Postgres parameters Those parameters are grouped under the `postgresql` top-level key. @@ -112,6 +124,7 @@ Those parameters are grouped under the `postgresql` top-level key. cluster. Optional (Spilo automatically sets reasonable defaults for parameters like work_mem or max_connections). + ## Patroni parameters Those parameters are grouped under the `patroni` top-level key. See the [patroni diff --git a/docs/reference/operator_parameters.md b/docs/reference/operator_parameters.md index 47f67228c..3f96b450c 100644 --- a/docs/reference/operator_parameters.md +++ b/docs/reference/operator_parameters.md @@ -224,6 +224,14 @@ CRD-based configuration. 
* **set_memory_request_to_limit** Set `memory_request` to `memory_limit` for all Postgres clusters (the default value is also increased). This prevents certain cases of memory overcommitment at the cost of overprovisioning memory and potential scheduling problems for containers with high memory limits due to the lack of memory on Kubernetes cluster nodes. This affects all containers (Postgres, Scalyr sidecar, and other sidecars). The default is `false`. +* **enable_shm_volume** + Instruct the operator to start any new database pod without limitations on shm + memory. If this option is enabled, a new tmpfs volume will be mounted to the + target database pod to remove the shm memory limitation (see e.g. the [docker + issue](https://github.com/docker-library/postgres/issues/416)). This option + is global for an operator object, and can be overridden by the `enableShmVolume` + parameter from the Postgres manifest. The default is `true`. + ## Operator timeouts This set of parameters define various timeouts related to some operator diff --git a/manifests/complete-postgres-manifest.yaml b/manifests/complete-postgres-manifest.yaml index e0f76e4d4..c5f80f373 100644 --- a/manifests/complete-postgres-manifest.yaml +++ b/manifests/complete-postgres-manifest.yaml @@ -13,12 +13,13 @@ spec: - superuser - createdb enableMasterLoadBalancer: true - enableReplicaLoadBalancer: true + enableReplicaLoadBalancer: true allowedSourceRanges: # load balancers' source ranges for both master and replica services - 127.0.0.1/32 databases: foo: zalando #Expert section + enableShmVolume: true postgresql: version: "10" parameters: diff --git a/pkg/apis/acid.zalan.do/v1/postgresql_type.go b/pkg/apis/acid.zalan.do/v1/postgresql_type.go index aedb0512f..2a8f60f71 100644 --- a/pkg/apis/acid.zalan.do/v1/postgresql_type.go +++ b/pkg/apis/acid.zalan.do/v1/postgresql_type.go @@ -51,6 +51,7 @@ type PostgresSpec struct { Tolerations []v1.Toleration `json:"tolerations,omitempty"` Sidecars []Sidecar 
`json:"sidecars,omitempty"` PodPriorityClassName string `json:"pod_priority_class_name,omitempty"` + ShmVolume *bool `json:"enableShmVolume,omitempty"` } // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object diff --git a/pkg/apis/acid.zalan.do/v1/util_test.go b/pkg/apis/acid.zalan.do/v1/util_test.go index b6a27542c..01be31e88 100644 --- a/pkg/apis/acid.zalan.do/v1/util_test.go +++ b/pkg/apis/acid.zalan.do/v1/util_test.go @@ -499,7 +499,7 @@ func TestMarshal(t *testing.T) { t.Errorf("Marshal error: %v", err) } if !bytes.Equal(m, tt.marshal) { - t.Errorf("Marshal Postgresql expected: %q, got: %q", string(tt.marshal), string(m)) + t.Errorf("Marshal Postgresql \nexpected: %q, \ngot: %q", string(tt.marshal), string(m)) } } } @@ -507,11 +507,11 @@ func TestMarshal(t *testing.T) { func TestPostgresMeta(t *testing.T) { for _, tt := range unmarshalCluster { if a := tt.out.GetObjectKind(); a != &tt.out.TypeMeta { - t.Errorf("GetObjectKindMeta expected: %v, got: %v", tt.out.TypeMeta, a) + t.Errorf("GetObjectKindMeta \nexpected: %v, \ngot: %v", tt.out.TypeMeta, a) } if a := tt.out.GetObjectMeta(); reflect.DeepEqual(a, tt.out.ObjectMeta) { - t.Errorf("GetObjectMeta expected: %v, got: %v", tt.out.ObjectMeta, a) + t.Errorf("GetObjectMeta \nexpected: %v, \ngot: %v", tt.out.ObjectMeta, a) } } } diff --git a/pkg/cluster/k8sres.go b/pkg/cluster/k8sres.go index b775ee636..6a3a052bd 100644 --- a/pkg/cluster/k8sres.go +++ b/pkg/cluster/k8sres.go @@ -18,6 +18,7 @@ import ( acidv1 "github.com/zalando-incubator/postgres-operator/pkg/apis/acid.zalan.do/v1" "github.com/zalando-incubator/postgres-operator/pkg/spec" "github.com/zalando-incubator/postgres-operator/pkg/util" + "github.com/zalando-incubator/postgres-operator/pkg/util/config" "github.com/zalando-incubator/postgres-operator/pkg/util/constants" "k8s.io/apimachinery/pkg/labels" ) @@ -396,6 +397,16 @@ func generateSidecarContainers(sidecars []acidv1.Sidecar, return nil, nil } +// mountShmVolumeNeeded reports whether we're requested 
to mount an shm volume. +// A value set in the PostgreSQL manifest takes precedence over the operator config. +func mountShmVolumeNeeded(opConfig config.Config, pgSpec *acidv1.PostgresSpec) bool { + if pgSpec.ShmVolume != nil { + return *pgSpec.ShmVolume + } + + return opConfig.ShmVolume +} + func generatePodTemplate( namespace string, labels labels.Set, @@ -407,6 +418,7 @@ func generatePodTemplate( podServiceAccountName string, kubeIAMRole string, priorityClassName string, + shmVolume bool, ) (*v1.PodTemplateSpec, error) { terminateGracePeriodSeconds := terminateGracePeriod @@ -420,6 +432,10 @@ func generatePodTemplate( Tolerations: *tolerationsSpec, } + if shmVolume { + addShmVolume(&podSpec) + } + if nodeAffinity != nil { podSpec.Affinity = nodeAffinity } @@ -733,7 +749,12 @@ func (c *Cluster) generateStatefulSet(spec *acidv1.PostgresSpec) (*v1beta1.State volumeMounts := generateVolumeMounts() // generate the spilo container - spiloContainer := generateSpiloContainer(c.containerName(), &effectiveDockerImage, resourceRequirements, spiloEnvVars, volumeMounts) + spiloContainer := generateSpiloContainer(c.containerName(), + &effectiveDockerImage, + resourceRequirements, + spiloEnvVars, + volumeMounts, + ) // resolve conflicts between operator-global and per-cluster sidecards sideCars := c.mergeSidecars(spec.Sidecars) @@ -775,7 +796,8 @@ func (c *Cluster) generateStatefulSet(spec *acidv1.PostgresSpec) (*v1beta1.State int64(c.OpConfig.PodTerminateGracePeriod.Seconds()), c.OpConfig.PodServiceAccountName, c.OpConfig.KubeIAMRole, - effectivePodPriorityClassName); err != nil { + effectivePodPriorityClassName, + mountShmVolumeNeeded(c.OpConfig, spec)); err != nil { return nil, fmt.Errorf("could not generate pod template: %v", err) } @@ -882,6 +904,32 @@ func (c *Cluster) getNumberOfInstances(spec *acidv1.PostgresSpec) int32 { return newcur } +// To avoid issues with limited /dev/shm inside docker environment, when +// PostgreSQL can't allocate enough of dsa segments from it, we can +// mount 
an extra memory volume into the Postgres container (constants.PostgresContainerIdx) +// +// see https://docs.okd.io/latest/dev_guide/shared_memory.html +func addShmVolume(podSpec *v1.PodSpec) { + volumes := append(podSpec.Volumes, v1.Volume{ + Name: constants.ShmVolumeName, + VolumeSource: v1.VolumeSource{ + EmptyDir: &v1.EmptyDirVolumeSource{ + Medium: "Memory", + }, + }, + }) + + pgIdx := constants.PostgresContainerIdx + mounts := append(podSpec.Containers[pgIdx].VolumeMounts, + v1.VolumeMount{ + Name: constants.ShmVolumeName, + MountPath: constants.ShmVolumePath, + }) + + podSpec.Containers[pgIdx].VolumeMounts = mounts + podSpec.Volumes = volumes +} + func generatePersistentVolumeClaimTemplate(volumeSize, volumeStorageClass string) (*v1.PersistentVolumeClaim, error) { var storageClassName *string diff --git a/pkg/cluster/k8sres_test.go b/pkg/cluster/k8sres_test.go index 12e145c04..92946ab2b 100644 --- a/pkg/cluster/k8sres_test.go +++ b/pkg/cluster/k8sres_test.go @@ -1,8 +1,11 @@ package cluster import ( + "k8s.io/api/core/v1" + acidv1 "github.com/zalando-incubator/postgres-operator/pkg/apis/acid.zalan.do/v1" "github.com/zalando-incubator/postgres-operator/pkg/util/config" + "github.com/zalando-incubator/postgres-operator/pkg/util/constants" "github.com/zalando-incubator/postgres-operator/pkg/util/k8sutil" "testing" ) @@ -75,3 +78,54 @@ func TestCreateLoadBalancerLogic(t *testing.T) { } } } + +func TestShmVolume(t *testing.T) { + testName := "TestShmVolume" + tests := []struct { + subTest string + podSpec *v1.PodSpec + shmPos int + }{ + { + subTest: "empty PodSpec", + podSpec: &v1.PodSpec{ + Volumes: []v1.Volume{}, + Containers: []v1.Container{ + v1.Container{ + VolumeMounts: []v1.VolumeMount{}, + }, + }, + }, + shmPos: 0, + }, + { + subTest: "non empty PodSpec", + podSpec: &v1.PodSpec{ + Volumes: []v1.Volume{v1.Volume{}}, + Containers: []v1.Container{ + v1.Container{ + VolumeMounts: []v1.VolumeMount{ + v1.VolumeMount{}, + }, + }, + }, + }, + shmPos: 1, + }, + } + for _, tt := range tests { + addShmVolume(tt.podSpec) + + 
volumeName := tt.podSpec.Volumes[tt.shmPos].Name + volumeMountName := tt.podSpec.Containers[constants.PostgresContainerIdx].VolumeMounts[tt.shmPos].Name + + if volumeName != constants.ShmVolumeName { + t.Errorf("%s %s: Expected volume %s was not created, have %s instead", + testName, tt.subTest, constants.ShmVolumeName, volumeName) + } + if volumeMountName != constants.ShmVolumeName { + t.Errorf("%s %s: Expected mount %s was not created, have %s instead", + testName, tt.subTest, constants.ShmVolumeName, volumeMountName) + } + } +} diff --git a/pkg/util/config/config.go b/pkg/util/config/config.go index 2bd7924ad..d855e0a2a 100644 --- a/pkg/util/config/config.go +++ b/pkg/util/config/config.go @@ -38,6 +38,7 @@ type Resources struct { NodeReadinessLabel map[string]string `name:"node_readiness_label" default:""` MaxInstances int32 `name:"max_instances" default:"-1"` MinInstances int32 `name:"min_instances" default:"-1"` + ShmVolume bool `name:"enable_shm_volume" default:"true"` } // Auth describes authentication specific configuration parameters diff --git a/pkg/util/constants/kubernetes.go b/pkg/util/constants/kubernetes.go index 2604f124d..a4ea73e80 100644 --- a/pkg/util/constants/kubernetes.go +++ b/pkg/util/constants/kubernetes.go @@ -5,6 +5,7 @@ import "time" // General kubernetes-related constants const ( PostgresContainerName = "postgres" + PostgresContainerIdx = 0 K8sAPIPath = "/apis" StatefulsetDeletionInterval = 1 * time.Second StatefulsetDeletionTimeout = 30 * time.Second diff --git a/pkg/util/constants/postgresql.go b/pkg/util/constants/postgresql.go index 7556e8858..e39fd423f 100644 --- a/pkg/util/constants/postgresql.go +++ b/pkg/util/constants/postgresql.go @@ -10,4 +10,7 @@ const ( PostgresConnectRetryTimeout = 2 * time.Minute PostgresConnectTimeout = 15 * time.Second + + ShmVolumeName = "dshm" + ShmVolumePath = "/dev/shm" )