From e5ca72f4bec7e03c658f95487f0b10912f2b9525 Mon Sep 17 00:00:00 2001
From: Benjamin Schimke
Date: Fri, 30 Aug 2024 08:57:06 +0200
Subject: [PATCH] Make snap service start more robust. (#630)

---
 src/k8s/pkg/k8sd/app/app.go                   |  9 +++++++
 src/k8s/pkg/k8sd/app/hooks_bootstrap.go       | 26 ++++++++++++++++---
 src/k8s/pkg/k8sd/app/hooks_join.go            | 15 +++++++++--
 .../k8sd/controllers/node_configuration.go    | 18 ++++++++++---
 .../controllers/update_node_configuration.go  | 10 +++++--
 src/k8s/pkg/snap/util/services.go             |  4 +--
 6 files changed, 68 insertions(+), 14 deletions(-)

diff --git a/src/k8s/pkg/k8sd/app/app.go b/src/k8s/pkg/k8sd/app/app.go
index a1a93cc06..d6172d124 100644
--- a/src/k8s/pkg/k8sd/app/app.go
+++ b/src/k8s/pkg/k8sd/app/app.go
@@ -233,8 +233,15 @@ func (a *App) Run(ctx context.Context, customHooks *state.Hooks) error {
 	return nil
 }
 
+// markNodeReady will decrement the readyWg counter to signal that the node is ready.
+// The node is ready if:
+// - the microcluster database is accessible
+// - the kubernetes endpoint is reachable
 func (a *App) markNodeReady(ctx context.Context, s state.State) error {
+	log := log.FromContext(ctx).WithValues("startup", "waitForReady")
+
 	// wait for the database to be open
+	log.V(1).Info("Waiting for database to be open")
 	if err := control.WaitUntilReady(ctx, func() (bool, error) {
 		return s.Database().IsOpen(ctx) == nil, nil
 	}); err != nil {
@@ -242,6 +249,7 @@ func (a *App) markNodeReady(ctx context.Context, s state.State) error {
 	}
 
 	// check kubernetes endpoint
+	log.V(1).Info("Waiting for kubernetes endpoint")
 	if err := control.WaitUntilReady(ctx, func() (bool, error) {
 		client, err := a.snap.KubernetesNodeClient("")
 		if err != nil {
@@ -255,6 +263,7 @@ func (a *App) markNodeReady(ctx context.Context, s state.State) error {
 		return fmt.Errorf("failed to wait for kubernetes endpoint: %w", err)
 	}
 
+	log.V(1).Info("Marking node as ready")
 	a.readyWg.Done()
 	return nil
 }
diff --git a/src/k8s/pkg/k8sd/app/hooks_bootstrap.go b/src/k8s/pkg/k8sd/app/hooks_bootstrap.go
index 567e5df36..4144a0f84 100644
--- a/src/k8s/pkg/k8sd/app/hooks_bootstrap.go
+++ b/src/k8s/pkg/k8sd/app/hooks_bootstrap.go
@@ -20,6 +20,7 @@ import (
 	"github.com/canonical/k8s/pkg/log"
 	snaputil "github.com/canonical/k8s/pkg/snap/util"
 	"github.com/canonical/k8s/pkg/utils"
+	"github.com/canonical/k8s/pkg/utils/control"
 	"github.com/canonical/k8s/pkg/utils/experimental/snapdconfig"
 	"github.com/canonical/microcluster/v3/state"
 )
@@ -243,9 +244,15 @@ func (a *App) onBootstrapWorkerNode(ctx context.Context, s state.State, encodedT
 	}
 
 	// Start services
+	// This may fail if the node controllers try to restart the services at the same time, hence the retry.
log.Info("Starting worker services") - if err := snaputil.StartWorkerServices(ctx, snap); err != nil { - return fmt.Errorf("failed to start worker services: %w", err) + if err := control.RetryFor(ctx, 5, 5*time.Second, func() error { + if err := snaputil.StartWorkerServices(ctx, snap); err != nil { + return fmt.Errorf("failed to start worker services: %w", err) + } + return nil + }); err != nil { + return fmt.Errorf("failed after retry: %w", err) } return nil @@ -254,6 +261,8 @@ func (a *App) onBootstrapWorkerNode(ctx context.Context, s state.State, encodedT func (a *App) onBootstrapControlPlane(ctx context.Context, s state.State, bootstrapConfig apiv1.BootstrapConfig) (rerr error) { snap := a.Snap() + log := log.FromContext(ctx).WithValues("hook", "bootstrap") + cfg, err := types.ClusterConfigFromBootstrapConfig(bootstrapConfig) if err != nil { return fmt.Errorf("invalid bootstrap config: %w", err) @@ -440,14 +449,23 @@ func (a *App) onBootstrapControlPlane(ctx context.Context, s state.State, bootst } // Start services - if err := startControlPlaneServices(ctx, snap, cfg.Datastore.GetType()); err != nil { - return fmt.Errorf("failed to start services: %w", err) + // This may fail if the node controllers try to restart the services at the same time, hence the retry. + log.Info("Starting control-plane services") + if err := control.RetryFor(ctx, 5, 5*time.Second, func() error { + if err := startControlPlaneServices(ctx, snap, cfg.Datastore.GetType()); err != nil { + return fmt.Errorf("failed to start services: %w", err) + } + return nil + }); err != nil { + return fmt.Errorf("failed after retry: %w", err) } // Wait until Kube-API server is ready + log.Info("Waiting for kube-apiserver to become ready") if err := waitApiServerReady(ctx, snap); err != nil { return fmt.Errorf("kube-apiserver did not become ready in time: %w", err) } + log.Info("API server is ready - notify controllers") a.NotifyFeatureController( cfg.Network.GetEnabled(), diff --git a/src/k8s/pkg/k8sd/app/hooks_join.go b/src/k8s/pkg/k8sd/app/hooks_join.go index 5b182fd5e..0e164858f 100644 --- a/src/k8s/pkg/k8sd/app/hooks_join.go +++ b/src/k8s/pkg/k8sd/app/hooks_join.go @@ -9,7 +9,9 @@ import ( databaseutil "github.com/canonical/k8s/pkg/k8sd/database/util" "github.com/canonical/k8s/pkg/k8sd/pki" "github.com/canonical/k8s/pkg/k8sd/setup" + "github.com/canonical/k8s/pkg/log" "github.com/canonical/k8s/pkg/utils" + "github.com/canonical/k8s/pkg/utils/control" "github.com/canonical/k8s/pkg/utils/experimental/snapdconfig" "github.com/canonical/microcluster/v3/state" ) @@ -17,6 +19,8 @@ import ( // onPostJoin is called when a control plane node joins the cluster. // onPostJoin retrieves the cluster config from the database and configures local services. func (a *App) onPostJoin(ctx context.Context, s state.State, initConfig map[string]string) (rerr error) { + log := log.FromContext(ctx).WithValues("hook", "postJoin") + snap := a.Snap() // NOTE: Set the notBefore certificate time to the current time. @@ -195,8 +199,15 @@ func (a *App) onPostJoin(ctx context.Context, s state.State, initConfig map[stri } // Start services - if err := startControlPlaneServices(ctx, snap, cfg.Datastore.GetType()); err != nil { - return fmt.Errorf("failed to start services: %w", err) + // This may fail if the node controllers try to restart the services at the same time, hence the retry. 
+ log.Info("Starting control-plane services") + if err := control.RetryFor(ctx, 5, 5*time.Second, func() error { + if err := startControlPlaneServices(ctx, snap, cfg.Datastore.GetType()); err != nil { + return fmt.Errorf("failed to start services: %w", err) + } + return nil + }); err != nil { + return fmt.Errorf("failed after retry: %w", err) } // Wait until Kube-API server is ready diff --git a/src/k8s/pkg/k8sd/controllers/node_configuration.go b/src/k8s/pkg/k8sd/controllers/node_configuration.go index 6157ea061..ec55b6be7 100644 --- a/src/k8s/pkg/k8sd/controllers/node_configuration.go +++ b/src/k8s/pkg/k8sd/controllers/node_configuration.go @@ -11,6 +11,7 @@ import ( "github.com/canonical/k8s/pkg/log" "github.com/canonical/k8s/pkg/snap" snaputil "github.com/canonical/k8s/pkg/snap/util" + "github.com/canonical/k8s/pkg/utils/control" v1 "k8s.io/api/core/v1" ) @@ -42,11 +43,14 @@ func (c *NodeConfigurationController) retryNewK8sClient(ctx context.Context) (*k } func (c *NodeConfigurationController) Run(ctx context.Context, getRSAKey func(context.Context) (*rsa.PublicKey, error)) { + ctx = log.NewContext(ctx, log.FromContext(ctx).WithValues("controller", "node-configuration")) + log := log.FromContext(ctx) + + log.Info("Waiting for node to be ready") // wait for microcluster node to be ready c.waitReady() - ctx = log.NewContext(ctx, log.FromContext(ctx).WithValues("controller", "node-configuration")) - log := log.FromContext(ctx) + log.Info("Starting node configuration controller") for { client, err := c.retryNewK8sClient(ctx) @@ -107,8 +111,14 @@ func (c *NodeConfigurationController) reconcile(ctx context.Context, configMap * } if mustRestartKubelet { - if err := c.snap.RestartService(ctx, "kubelet"); err != nil { - return fmt.Errorf("failed to restart kubelet to apply node configuration: %w", err) + // This may fail if other controllers try to restart the services at the same time, hence the retry. + if err := control.RetryFor(ctx, 5, 5*time.Second, func() error { + if err := c.snap.RestartService(ctx, "kubelet"); err != nil { + return fmt.Errorf("failed to restart kubelet to apply node configuration: %w", err) + } + return nil + }); err != nil { + return fmt.Errorf("failed after retry: %w", err) } } diff --git a/src/k8s/pkg/k8sd/controllers/update_node_configuration.go b/src/k8s/pkg/k8sd/controllers/update_node_configuration.go index 6d8046d8b..8cbb4d296 100644 --- a/src/k8s/pkg/k8sd/controllers/update_node_configuration.go +++ b/src/k8s/pkg/k8sd/controllers/update_node_configuration.go @@ -56,9 +56,12 @@ func (c *UpdateNodeConfigurationController) retryNewK8sClient(ctx context.Contex // Run accepts a function that retrieves the current cluster configuration. // Run will loop everytime the TriggerCh is triggered. 
 func (c *UpdateNodeConfigurationController) Run(ctx context.Context, getClusterConfig func(context.Context) (types.ClusterConfig, error)) {
-	c.waitReady()
+	ctx = log.NewContext(ctx, log.FromContext(ctx).WithValues("controller", "update-node-configuration"))
+	log := log.FromContext(ctx)
 
-	log := log.FromContext(ctx).WithValues("controller", "update-node-configuration")
+	log.V(1).Info("Waiting for node to be ready")
+	c.waitReady()
+	log.V(1).Info("Starting update node configuration controller")
 
 	for {
 		select {
@@ -99,6 +102,9 @@ func (c *UpdateNodeConfigurationController) Run(ctx context.Context, getClusterC
 }
 
 func (c *UpdateNodeConfigurationController) reconcile(ctx context.Context, client *kubernetes.Client, config types.ClusterConfig) error {
+	log := log.FromContext(ctx)
+	log.V(1).Info("Reconciling node configuration")
+
 	keyPEM := config.Certificates.GetK8sdPrivateKey()
 	key, err := pkiutil.LoadRSAPrivateKey(keyPEM)
 	if err != nil && keyPEM != "" {
diff --git a/src/k8s/pkg/snap/util/services.go b/src/k8s/pkg/snap/util/services.go
index 9cd3a6f0d..f6ae56862 100644
--- a/src/k8s/pkg/snap/util/services.go
+++ b/src/k8s/pkg/snap/util/services.go
@@ -15,14 +15,14 @@ var (
 		"kubelet",
 		"kube-proxy",
 	}
-	// ControlPlaneServices contains all k8s services that run on a control plane except of k8sd.
+	// controlPlaneServices contains all k8s services that run on a control plane except for k8sd.
 	controlPlaneServices = []string{
 		"containerd",
-		"kube-apiserver",
 		"kube-controller-manager",
 		"kube-proxy",
 		"kube-scheduler",
 		"kubelet",
+		"kube-apiserver",
 	}
 )
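
Note for reviewers: the retry logic above relies on control.RetryFor from
pkg/utils/control, which this patch imports but does not show. Below is a
minimal sketch of a helper compatible with the call sites in this patch; the
signature and semantics are inferred from usage such as
control.RetryFor(ctx, 5, 5*time.Second, func() error { ... }), and the actual
implementation in the repository may differ.

	package control

	import (
		"context"
		"fmt"
		"time"
	)

	// RetryFor runs retryFunc up to retryCount times, waiting delayBetweenRetry
	// between attempts, and gives up early if the context is cancelled.
	// Sketch only: shape inferred from the call sites in this patch.
	func RetryFor(ctx context.Context, retryCount int, delayBetweenRetry time.Duration, retryFunc func() error) error {
		var err error
		for i := 0; i < retryCount; i++ {
			if err = retryFunc(); err == nil {
				return nil
			}
			select {
			case <-ctx.Done():
				return fmt.Errorf("context cancelled while retrying: %w", ctx.Err())
			case <-time.After(delayBetweenRetry):
			}
		}
		return fmt.Errorf("retry limit of %d exceeded: %w", retryCount, err)
	}

With the 5 attempts and 5-second delay used throughout this patch, a transient
"service restart already in progress" failure is absorbed for roughly 25
seconds before bootstrap or join gives up.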
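
Similarly, markNodeReady (app.go above) polls via control.WaitUntilReady. A
sketch of a compatible poller, in the same package as the RetryFor sketch
above: the func() (bool, error) contract comes from the call sites, while the
1-second poll interval is an assumption.

	// WaitUntilReady polls checkFunc until it reports ready (true), returns a
	// non-nil error, or the context is cancelled. Sketch only; the real helper
	// may use a different interval or accept extra options.
	func WaitUntilReady(ctx context.Context, checkFunc func() (bool, error)) error {
		for {
			ok, err := checkFunc()
			if err != nil {
				return fmt.Errorf("readiness check failed: %w", err)
			}
			if ok {
				return nil
			}
			select {
			case <-ctx.Done():
				return fmt.Errorf("context cancelled while waiting: %w", ctx.Err())
			case <-time.After(time.Second):
			}
		}
	}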