Skip to content

Commit

Permalink
fix: fall back to old scheme of getting talosconfig for older templates
Browse files Browse the repository at this point in the history
This PR contains a couple of fixes. If the tcp has an `init` node
config:
- mark controlplane as bootstrapped immediately.
- populate nodes kubeconfig using workload cluster nodes info instead of
using machines addresses.

Signed-off-by: Artem Chernyshev <artem.chernyshev@talos-systems.com>
  • Loading branch information
Unix4ever committed Jan 14, 2022
1 parent 89f793e commit efa0345
Show file tree
Hide file tree
Showing 4 changed files with 105 additions and 20 deletions.
80 changes: 79 additions & 1 deletion controllers/configs.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,15 @@ import (
"context"
"fmt"
"net"
"reflect"
"time"

cabptv1 "github.com/talos-systems/cluster-api-bootstrap-provider-talos/api/v1alpha3"
controlplanev1 "github.com/talos-systems/cluster-api-control-plane-provider-talos/api/v1alpha3"
talosclient "github.com/talos-systems/talos/pkg/machinery/client"
talosconfig "github.com/talos-systems/talos/pkg/machinery/client/config"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/tools/clientcmd"
Expand Down Expand Up @@ -75,11 +78,15 @@ func (r *TalosControlPlaneReconciler) kubeconfigForCluster(ctx context.Context,
}

// talosconfigForMachines will generate a talosconfig that uses *all* found addresses as the endpoints.
func (r *TalosControlPlaneReconciler) talosconfigForMachines(ctx context.Context, machines ...clusterv1.Machine) (*talosclient.Client, error) {
func (r *TalosControlPlaneReconciler) talosconfigForMachines(ctx context.Context, tcp *controlplanev1.TalosControlPlane, machines ...clusterv1.Machine) (*talosclient.Client, error) {
if len(machines) == 0 {
return nil, fmt.Errorf("at least one machine should be provided")
}

if !reflect.ValueOf(tcp.Spec.ControlPlaneConfig.InitConfig).IsZero() {
return r.talosconfigFromWorkloadCluster(ctx, client.ObjectKey{Namespace: tcp.GetNamespace(), Name: tcp.GetLabels()["cluster.x-k8s.io/cluster-name"]}, machines...)
}

addrList := []string{}

var t *talosconfig.Config
Expand Down Expand Up @@ -130,3 +137,74 @@ func (r *TalosControlPlaneReconciler) talosconfigForMachines(ctx context.Context

return talosclient.New(ctx, talosclient.WithEndpoints(addrList...), talosclient.WithConfig(t))
}

// talosconfigFromWorkloadCluster gets the talosconfig and populates its endpoints
// using the workload cluster nodes.
//
// For every machine it fetches the Node named by machine.Status.NodeRef through
// the workload cluster's Kubernetes API and collects that node's ExternalIP and
// InternalIP addresses. The credentials are read from the first TalosConfig in
// the machine's namespace that carries an owner reference of kind "Machine"
// with the machine's name; once a config has been loaded the lookup is not
// repeated for the remaining machines.
//
// An error is returned when no machines are given, a machine has no nodeRef,
// the workload cluster client, a Node or the TalosConfig list cannot be
// obtained, no addresses have been collected, no matching TalosConfig exists,
// or the stored talosconfig cannot be parsed.
func (r *TalosControlPlaneReconciler) talosconfigFromWorkloadCluster(ctx context.Context, cluster client.ObjectKey, machines ...clusterv1.Machine) (*talosclient.Client, error) {
	if len(machines) == 0 {
		return nil, fmt.Errorf("at least one machine should be provided")
	}

	clientset, err := r.kubeconfigForCluster(ctx, cluster)
	if err != nil {
		return nil, err
	}

	addrList := []string{}

	var t *talosconfig.Config

	for _, machine := range machines {
		if machine.Status.NodeRef == nil {
			return nil, fmt.Errorf("%q machine does not have a nodeRef", machine.Name)
		}

		// grab all addresses as endpoints
		node, err := clientset.CoreV1().Nodes().Get(ctx, machine.Status.NodeRef.Name, metav1.GetOptions{})
		if err != nil {
			return nil, err
		}

		for _, addr := range node.Status.Addresses {
			if addr.Type == corev1.NodeExternalIP || addr.Type == corev1.NodeInternalIP {
				addrList = append(addrList, addr.Address)
			}
		}

		// NOTE(review): addrList accumulates across machines, so this only
		// fires while no node so far has contributed an address; a later node
		// without addresses is not reported. Kept as-is to preserve behavior.
		if len(addrList) == 0 {
			return nil, fmt.Errorf("no addresses were found for node %q", node.Name)
		}

		// the talosconfig only needs to be loaded once
		if t != nil {
			continue
		}

		var (
			cfgs  cabptv1.TalosConfigList
			found *cabptv1.TalosConfig
		)

		// find talosconfig in the machine's namespace
		err = r.Client.List(ctx, &cfgs, client.InNamespace(machine.Namespace))
		if err != nil {
			return nil, err
		}

		// Point at the slice element rather than the range variable, and leave
		// both loops on the first match: taking the address of the range
		// variable while the outer loop keeps iterating makes the pointer end
		// up referring to a different (the last visited) TalosConfig.
	findConfig:
		for i := range cfgs.Items {
			for _, ref := range cfgs.Items[i].OwnerReferences {
				if ref.Kind == "Machine" && ref.Name == machine.Name {
					found = &cfgs.Items[i]

					break findConfig
				}
			}
		}

		if found == nil {
			return nil, fmt.Errorf("failed to find TalosConfig for %q", machine.Name)
		}

		t, err = talosconfig.FromString(found.Status.TalosConfig)
		if err != nil {
			return nil, err
		}
	}

	return talosclient.New(ctx, talosclient.WithEndpoints(addrList...), talosclient.WithConfig(t))
}
9 changes: 5 additions & 4 deletions controllers/etcd.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,15 @@ import (
"fmt"
"strings"

controlplanev1 "github.com/talos-systems/cluster-api-control-plane-provider-talos/api/v1alpha3"
"github.com/talos-systems/talos/pkg/machinery/api/machine"
talosclient "github.com/talos-systems/talos/pkg/machinery/client"
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
"sigs.k8s.io/cluster-api/util"
"sigs.k8s.io/controller-runtime/pkg/client"
)

func (r *TalosControlPlaneReconciler) etcdHealthcheck(ctx context.Context, cluster *clusterv1.Cluster, ownedMachines []clusterv1.Machine) error {
func (r *TalosControlPlaneReconciler) etcdHealthcheck(ctx context.Context, tcp *controlplanev1.TalosControlPlane, cluster *clusterv1.Cluster, ownedMachines []clusterv1.Machine) error {
kubeclient, err := r.kubeconfigForCluster(ctx, util.ObjectKey(cluster))
if err != nil {
return err
Expand All @@ -32,7 +33,7 @@ func (r *TalosControlPlaneReconciler) etcdHealthcheck(ctx context.Context, clust
}
}

c, err := r.talosconfigForMachines(ctx, machines...)
c, err := r.talosconfigForMachines(ctx, tcp, machines...)
if err != nil {
return err
}
Expand Down Expand Up @@ -148,7 +149,7 @@ func (r *TalosControlPlaneReconciler) forceEtcdLeave(ctx context.Context, c *tal

// auditEtcd rolls through all etcd members to see if there's a matching controlplane machine
// It uses the first controlplane node returned as the etcd endpoint
func (r *TalosControlPlaneReconciler) auditEtcd(ctx context.Context, cluster client.ObjectKey, cpName string) error {
func (r *TalosControlPlaneReconciler) auditEtcd(ctx context.Context, tcp *controlplanev1.TalosControlPlane, cluster client.ObjectKey, cpName string) error {
machines, err := r.getControlPlaneMachinesForCluster(ctx, cluster, cpName)
if err != nil {
return err
Expand Down Expand Up @@ -182,7 +183,7 @@ func (r *TalosControlPlaneReconciler) auditEtcd(ctx context.Context, cluster cli
return fmt.Errorf("no CP machine which is not being deleted and has node ref")
}

c, err := r.talosconfigForMachines(ctx, designatedCPMachine)
c, err := r.talosconfigForMachines(ctx, tcp, designatedCPMachine)
if err != nil {
return err
}
Expand Down
9 changes: 5 additions & 4 deletions controllers/health.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"time"

"github.com/pkg/errors"
controlplanev1 "github.com/talos-systems/cluster-api-control-plane-provider-talos/api/v1alpha3"
machineapi "github.com/talos-systems/talos/pkg/machinery/api/machine"
talosclient "github.com/talos-systems/talos/pkg/machinery/client"
"google.golang.org/grpc/codes"
Expand All @@ -27,8 +28,8 @@ func (e *errServiceUnhealthy) Error() string {
return fmt.Sprintf("Service %s is unhealthy: %s", e.service, e.reason)
}

func (r *TalosControlPlaneReconciler) nodesHealthcheck(ctx context.Context, cluster *clusterv1.Cluster, machines []clusterv1.Machine) error {
client, err := r.talosconfigForMachines(ctx, machines...)
func (r *TalosControlPlaneReconciler) nodesHealthcheck(ctx context.Context, tcp *controlplanev1.TalosControlPlane, cluster *clusterv1.Cluster, machines []clusterv1.Machine) error {
client, err := r.talosconfigForMachines(ctx, tcp, machines...)
if err != nil {
return err
}
Expand All @@ -54,8 +55,8 @@ func (r *TalosControlPlaneReconciler) nodesHealthcheck(ctx context.Context, clus
return nil
}

func (r *TalosControlPlaneReconciler) ensureNodesBooted(ctx context.Context, cluster *clusterv1.Cluster, machines []clusterv1.Machine) error {
client, err := r.talosconfigForMachines(ctx, machines...)
func (r *TalosControlPlaneReconciler) ensureNodesBooted(ctx context.Context, tcp *controlplanev1.TalosControlPlane, cluster *clusterv1.Cluster, machines []clusterv1.Machine) error {
client, err := r.talosconfigForMachines(ctx, tcp, machines...)
if err != nil {
return err
}
Expand Down
27 changes: 16 additions & 11 deletions controllers/taloscontrolplane_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,7 @@ func newControlPlane(cluster *clusterv1.Cluster, tcp *controlplanev1.TalosContro
}
}

func (r *TalosControlPlaneReconciler) scaleDownControlPlane(ctx context.Context, cluster client.ObjectKey, cpName string, machines []clusterv1.Machine) (ctrl.Result, error) {
func (r *TalosControlPlaneReconciler) scaleDownControlPlane(ctx context.Context, tcp *controlplanev1.TalosControlPlane, cluster client.ObjectKey, cpName string, machines []clusterv1.Machine) (ctrl.Result, error) {
if len(machines) == 0 {
return ctrl.Result{}, fmt.Errorf("no machines found")
}
Expand Down Expand Up @@ -359,7 +359,7 @@ func (r *TalosControlPlaneReconciler) scaleDownControlPlane(ctx context.Context,

node := deleteMachine.Status.NodeRef

c, err := r.talosconfigForMachines(ctx, deleteMachine)
c, err := r.talosconfigForMachines(ctx, tcp, deleteMachine)
if err != nil {
return ctrl.Result{RequeueAfter: 20 * time.Second}, err
}
Expand Down Expand Up @@ -525,8 +525,8 @@ func (r *TalosControlPlaneReconciler) bootControlPlane(ctx context.Context, clus
return ctrl.Result{Requeue: true}, nil
}

func (r *TalosControlPlaneReconciler) bootstrapCluster(ctx context.Context, cluster *clusterv1.Cluster, machines []clusterv1.Machine) error {
c, err := r.talosconfigForMachines(ctx, machines...)
func (r *TalosControlPlaneReconciler) bootstrapCluster(ctx context.Context, tcp *controlplanev1.TalosControlPlane, cluster *clusterv1.Cluster, machines []clusterv1.Machine) error {
c, err := r.talosconfigForMachines(ctx, tcp, machines...)
if err != nil {
return err
}
Expand Down Expand Up @@ -761,11 +761,11 @@ func (r *TalosControlPlaneReconciler) reconcileKubeconfig(ctx context.Context, c
func (r *TalosControlPlaneReconciler) reconcileEtcdMembers(ctx context.Context, cluster *clusterv1.Cluster, tcp *controlplanev1.TalosControlPlane, machines []clusterv1.Machine) (result ctrl.Result, err error) {
var errs error
// Audit the etcd member list to remove any nodes that no longer exist
if err := r.auditEtcd(ctx, util.ObjectKey(cluster), tcp.Name); err != nil {
if err := r.auditEtcd(ctx, tcp, util.ObjectKey(cluster), tcp.Name); err != nil {
errs = kerrors.NewAggregate([]error{errs, err})
}

if err := r.etcdHealthcheck(ctx, cluster, machines); err != nil {
if err := r.etcdHealthcheck(ctx, tcp, cluster, machines); err != nil {
conditions.MarkFalse(tcp, controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason,
clusterv1.ConditionSeverityWarning, err.Error())
errs = kerrors.NewAggregate([]error{errs, err})
Expand All @@ -781,7 +781,7 @@ func (r *TalosControlPlaneReconciler) reconcileEtcdMembers(ctx context.Context,
}

func (r *TalosControlPlaneReconciler) reconcileNodeHealth(ctx context.Context, cluster *clusterv1.Cluster, tcp *controlplanev1.TalosControlPlane, machines []clusterv1.Machine) (result ctrl.Result, err error) {
if err := r.nodesHealthcheck(ctx, cluster, machines); err != nil {
if err := r.nodesHealthcheck(ctx, tcp, cluster, machines); err != nil {
reason := controlplanev1.ControlPlaneComponentsInspectionFailedReason

if errors.Is(err, &errServiceUnhealthy{}) {
Expand Down Expand Up @@ -851,7 +851,7 @@ func (r *TalosControlPlaneReconciler) reconcileMachines(ctx context.Context, clu
return res, nil
}

if err := r.ensureNodesBooted(ctx, cluster, machines); err != nil {
if err := r.ensureNodesBooted(ctx, controlPlane.TCP, cluster, machines); err != nil {
logger.Info("waiting for all nodes to finish boot sequence", "error", err)

return ctrl.Result{RequeueAfter: 10 * time.Second}, nil
Expand All @@ -865,7 +865,7 @@ func (r *TalosControlPlaneReconciler) reconcileMachines(ctx context.Context, clu

logger.Info("scaling down control plane", "Desired", desiredReplicas, "Existing", numMachines)

res, err = r.scaleDownControlPlane(ctx, util.ObjectKey(cluster), controlPlane.TCP.Name, machines)
res, err = r.scaleDownControlPlane(ctx, tcp, util.ObjectKey(cluster), controlPlane.TCP.Name, machines)
if err != nil {
if res.Requeue || res.RequeueAfter > 0 {
logger.Info("failed to scale down control plane", "error", err)
Expand All @@ -876,8 +876,13 @@ func (r *TalosControlPlaneReconciler) reconcileMachines(ctx context.Context, clu

return res, err
default:
if !tcp.Status.Bootstrapped && reflect.ValueOf(tcp.Spec.ControlPlaneConfig.InitConfig).IsZero() {
if err := r.bootstrapCluster(ctx, cluster, machines); err != nil {
if !reflect.ValueOf(tcp.Spec.ControlPlaneConfig.InitConfig).IsZero() {
tcp.Status.Bootstrapped = true
conditions.MarkTrue(tcp, controlplanev1.MachinesBootstrapped)
}

if !tcp.Status.Bootstrapped {
if err := r.bootstrapCluster(ctx, tcp, cluster, machines); err != nil {
conditions.MarkFalse(tcp, controlplanev1.MachinesBootstrapped, controlplanev1.WaitingForTalosBootReason, clusterv1.ConditionSeverityInfo, err.Error())

logger.Info("bootstrap failed, retrying in 20 seconds", "error", err)
Expand Down

0 comments on commit efa0345

Please sign in to comment.