Skip to content

Commit

Permalink
Make snap service start more robust. (#630)
Browse files Browse the repository at this point in the history
  • Loading branch information
bschimke95 committed Aug 30, 2024
1 parent 2182037 commit e5ca72f
Show file tree
Hide file tree
Showing 6 changed files with 68 additions and 14 deletions.
9 changes: 9 additions & 0 deletions src/k8s/pkg/k8sd/app/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -233,15 +233,23 @@ func (a *App) Run(ctx context.Context, customHooks *state.Hooks) error {
return nil
}

// markNodeReady will decrement the readyWg counter to signal that the node is ready.
// The node is ready if:
// - the microcluster database is accessible
// - the kubernetes endpoint is reachable
func (a *App) markNodeReady(ctx context.Context, s state.State) error {
log := log.FromContext(ctx).WithValues("startup", "waitForReady")

// wait for the database to be open
log.V(1).Info("Waiting for database to be open")
if err := control.WaitUntilReady(ctx, func() (bool, error) {
return s.Database().IsOpen(ctx) == nil, nil
}); err != nil {
return fmt.Errorf("failed to wait for database to be open: %w", err)
}

// check kubernetes endpoint
log.V(1).Info("Waiting for kubernetes endpoint")
if err := control.WaitUntilReady(ctx, func() (bool, error) {
client, err := a.snap.KubernetesNodeClient("")
if err != nil {
Expand All @@ -255,6 +263,7 @@ func (a *App) markNodeReady(ctx context.Context, s state.State) error {
return fmt.Errorf("failed to wait for kubernetes endpoint: %w", err)
}

log.V(1).Info("Marking node as ready")
a.readyWg.Done()
return nil
}
26 changes: 22 additions & 4 deletions src/k8s/pkg/k8sd/app/hooks_bootstrap.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"github.com/canonical/k8s/pkg/log"
snaputil "github.com/canonical/k8s/pkg/snap/util"
"github.com/canonical/k8s/pkg/utils"
"github.com/canonical/k8s/pkg/utils/control"
"github.com/canonical/k8s/pkg/utils/experimental/snapdconfig"
"github.com/canonical/microcluster/v3/state"
)
Expand Down Expand Up @@ -243,9 +244,15 @@ func (a *App) onBootstrapWorkerNode(ctx context.Context, s state.State, encodedT
}

// Start services
// This may fail if the node controllers try to restart the services at the same time, hence the retry.
log.Info("Starting worker services")
if err := snaputil.StartWorkerServices(ctx, snap); err != nil {
return fmt.Errorf("failed to start worker services: %w", err)
if err := control.RetryFor(ctx, 5, 5*time.Second, func() error {
if err := snaputil.StartWorkerServices(ctx, snap); err != nil {
return fmt.Errorf("failed to start worker services: %w", err)
}
return nil
}); err != nil {
return fmt.Errorf("failed after retry: %w", err)
}

return nil
Expand All @@ -254,6 +261,8 @@ func (a *App) onBootstrapWorkerNode(ctx context.Context, s state.State, encodedT
func (a *App) onBootstrapControlPlane(ctx context.Context, s state.State, bootstrapConfig apiv1.BootstrapConfig) (rerr error) {
snap := a.Snap()

log := log.FromContext(ctx).WithValues("hook", "bootstrap")

cfg, err := types.ClusterConfigFromBootstrapConfig(bootstrapConfig)
if err != nil {
return fmt.Errorf("invalid bootstrap config: %w", err)
Expand Down Expand Up @@ -440,14 +449,23 @@ func (a *App) onBootstrapControlPlane(ctx context.Context, s state.State, bootst
}

// Start services
if err := startControlPlaneServices(ctx, snap, cfg.Datastore.GetType()); err != nil {
return fmt.Errorf("failed to start services: %w", err)
// This may fail if the node controllers try to restart the services at the same time, hence the retry.
log.Info("Starting control-plane services")
if err := control.RetryFor(ctx, 5, 5*time.Second, func() error {
if err := startControlPlaneServices(ctx, snap, cfg.Datastore.GetType()); err != nil {
return fmt.Errorf("failed to start services: %w", err)
}
return nil
}); err != nil {
return fmt.Errorf("failed after retry: %w", err)
}

// Wait until Kube-API server is ready
log.Info("Waiting for kube-apiserver to become ready")
if err := waitApiServerReady(ctx, snap); err != nil {
return fmt.Errorf("kube-apiserver did not become ready in time: %w", err)
}
log.Info("API server is ready - notify controllers")

a.NotifyFeatureController(
cfg.Network.GetEnabled(),
Expand Down
15 changes: 13 additions & 2 deletions src/k8s/pkg/k8sd/app/hooks_join.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,18 @@ import (
databaseutil "github.com/canonical/k8s/pkg/k8sd/database/util"
"github.com/canonical/k8s/pkg/k8sd/pki"
"github.com/canonical/k8s/pkg/k8sd/setup"
"github.com/canonical/k8s/pkg/log"
"github.com/canonical/k8s/pkg/utils"
"github.com/canonical/k8s/pkg/utils/control"
"github.com/canonical/k8s/pkg/utils/experimental/snapdconfig"
"github.com/canonical/microcluster/v3/state"
)

// onPostJoin is called when a control plane node joins the cluster.
// onPostJoin retrieves the cluster config from the database and configures local services.
func (a *App) onPostJoin(ctx context.Context, s state.State, initConfig map[string]string) (rerr error) {
log := log.FromContext(ctx).WithValues("hook", "postJoin")

snap := a.Snap()

// NOTE: Set the notBefore certificate time to the current time.
Expand Down Expand Up @@ -195,8 +199,15 @@ func (a *App) onPostJoin(ctx context.Context, s state.State, initConfig map[stri
}

// Start services
if err := startControlPlaneServices(ctx, snap, cfg.Datastore.GetType()); err != nil {
return fmt.Errorf("failed to start services: %w", err)
// This may fail if the node controllers try to restart the services at the same time, hence the retry.
log.Info("Starting control-plane services")
if err := control.RetryFor(ctx, 5, 5*time.Second, func() error {
if err := startControlPlaneServices(ctx, snap, cfg.Datastore.GetType()); err != nil {
return fmt.Errorf("failed to start services: %w", err)
}
return nil
}); err != nil {
return fmt.Errorf("failed after retry: %w", err)
}

// Wait until Kube-API server is ready
Expand Down
18 changes: 14 additions & 4 deletions src/k8s/pkg/k8sd/controllers/node_configuration.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"github.com/canonical/k8s/pkg/log"
"github.com/canonical/k8s/pkg/snap"
snaputil "github.com/canonical/k8s/pkg/snap/util"
"github.com/canonical/k8s/pkg/utils/control"
v1 "k8s.io/api/core/v1"
)

Expand Down Expand Up @@ -42,11 +43,14 @@ func (c *NodeConfigurationController) retryNewK8sClient(ctx context.Context) (*k
}

func (c *NodeConfigurationController) Run(ctx context.Context, getRSAKey func(context.Context) (*rsa.PublicKey, error)) {
ctx = log.NewContext(ctx, log.FromContext(ctx).WithValues("controller", "node-configuration"))
log := log.FromContext(ctx)

log.Info("Waiting for node to be ready")
// wait for microcluster node to be ready
c.waitReady()

ctx = log.NewContext(ctx, log.FromContext(ctx).WithValues("controller", "node-configuration"))
log := log.FromContext(ctx)
log.Info("Starting node configuration controller")

for {
client, err := c.retryNewK8sClient(ctx)
Expand Down Expand Up @@ -107,8 +111,14 @@ func (c *NodeConfigurationController) reconcile(ctx context.Context, configMap *
}

if mustRestartKubelet {
if err := c.snap.RestartService(ctx, "kubelet"); err != nil {
return fmt.Errorf("failed to restart kubelet to apply node configuration: %w", err)
// This may fail if other controllers try to restart the services at the same time, hence the retry.
if err := control.RetryFor(ctx, 5, 5*time.Second, func() error {
if err := c.snap.RestartService(ctx, "kubelet"); err != nil {
return fmt.Errorf("failed to restart kubelet to apply node configuration: %w", err)
}
return nil
}); err != nil {
return fmt.Errorf("failed after retry: %w", err)
}
}

Expand Down
10 changes: 8 additions & 2 deletions src/k8s/pkg/k8sd/controllers/update_node_configuration.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,12 @@ func (c *UpdateNodeConfigurationController) retryNewK8sClient(ctx context.Contex
// Run accepts a function that retrieves the current cluster configuration.
// Run will loop every time the TriggerCh is triggered.
func (c *UpdateNodeConfigurationController) Run(ctx context.Context, getClusterConfig func(context.Context) (types.ClusterConfig, error)) {
c.waitReady()
ctx = log.NewContext(ctx, log.FromContext(ctx).WithValues("controller", "update-node-configuration"))
log := log.FromContext(ctx)

log := log.FromContext(ctx).WithValues("controller", "update-node-configuration")
log.V(1).Info("Waiting for node to be ready")
c.waitReady()
log.V(1).Info("Starting update node configuration controller")

for {
select {
Expand Down Expand Up @@ -99,6 +102,9 @@ func (c *UpdateNodeConfigurationController) Run(ctx context.Context, getClusterC
}

func (c *UpdateNodeConfigurationController) reconcile(ctx context.Context, client *kubernetes.Client, config types.ClusterConfig) error {
log := log.FromContext(ctx)
log.V(1).Info("Reconciling node configuration")

keyPEM := config.Certificates.GetK8sdPrivateKey()
key, err := pkiutil.LoadRSAPrivateKey(keyPEM)
if err != nil && keyPEM != "" {
Expand Down
4 changes: 2 additions & 2 deletions src/k8s/pkg/snap/util/services.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,14 @@ var (
"kubelet",
"kube-proxy",
}
// ControlPlaneServices contains all k8s services that run on a control plane except of k8sd.
// controlPlaneServices contains all k8s services that run on a control plane, except for k8sd.
controlPlaneServices = []string{
"containerd",
"kube-apiserver",
"kube-controller-manager",
"kube-proxy",
"kube-scheduler",
"kubelet",
"kube-apiserver",
}
)

Expand Down

0 comments on commit e5ca72f

Please sign in to comment.