From 1661f1024a68245bf768925a326ca558f2ea527e Mon Sep 17 00:00:00 2001
From: Brad Davidson
Date: Mon, 3 Jun 2024 20:40:09 +0000
Subject: [PATCH] Fix bug that caused agents to bypass local loadbalancer

If proxy.SetAPIServerPort was called multiple times, all calls after the
first one would cause the apiserver address to be set to the default
server address, bypassing the local load-balancer. This was most likely
to occur on RKE2, where the supervisor may be up for a period of time
before it is ready to manage node password secrets, causing the agent
to retry.

Signed-off-by: Brad Davidson
---
 pkg/agent/config/config.go             | 30 +++++++++++++++++++-------
 pkg/agent/loadbalancer/loadbalancer.go |  7 +++++-
 pkg/agent/proxy/apiproxy.go            | 17 ++++++++++-----
 pkg/agent/tunnel/tunnel.go             |  2 +-
 4 files changed, 41 insertions(+), 15 deletions(-)

diff --git a/pkg/agent/config/config.go b/pkg/agent/config/config.go
index dfab68d023ba..fc0e7241549b 100644
--- a/pkg/agent/config/config.go
+++ b/pkg/agent/config/config.go
@@ -375,9 +375,10 @@ func get(ctx context.Context, envInfo *cmds.Agent, proxy proxy.Proxy) (*config.N
 	if controlConfig.SupervisorPort != controlConfig.HTTPSPort {
 		isIPv6 := utilsnet.IsIPv6(net.ParseIP([]string{envInfo.NodeIP.String()}[0]))
 		if err := proxy.SetAPIServerPort(controlConfig.HTTPSPort, isIPv6); err != nil {
-			return nil, errors.Wrapf(err, "failed to setup access to API Server port %d on at %s", controlConfig.HTTPSPort, proxy.SupervisorURL())
+			return nil, errors.Wrapf(err, "failed to set apiserver port to %d", controlConfig.HTTPSPort)
 		}
 	}
+	apiServerURL := proxy.APIServerURL()
 
 	var flannelIface *net.Interface
 	if controlConfig.FlannelBackend != config.FlannelBackendNone && len(envInfo.FlannelIface) > 0 {
@@ -482,40 +483,53 @@ func get(ctx context.Context, envInfo *cmds.Agent, proxy proxy.Proxy) (*config.N
 
 	os.Setenv("NODE_NAME", nodeName)
 
+	// Ensure that the kubelet's server certificate is valid for all configured node IPs. Note
+	// that in the case of an external CCM, additional IPs may be added by the infra provider
+	// that the cert will not be valid for, as they are not present in the list collected here.
 	nodeExternalAndInternalIPs := append(nodeIPs, nodeExternalIPs...)
+
+	// Ask the server to generate a kubelet server cert+key. These files are unique to this node.
 	servingCert, err := getServingCert(nodeName, nodeExternalAndInternalIPs, servingKubeletCert, servingKubeletKey, newNodePasswordFile, info)
 	if err != nil {
-		return nil, err
+		return nil, errors.Wrap(err, servingKubeletCert)
 	}
 
+	// Ask the server to generate a kubelet client cert+key. These files are unique to this node.
 	if err := getNodeNamedHostFile(clientKubeletCert, clientKubeletKey, nodeName, nodeIPs, newNodePasswordFile, info); err != nil {
-		return nil, err
+		return nil, errors.Wrap(err, clientKubeletCert)
 	}
 
+	// Generate a kubeconfig for the kubelet.
 	kubeconfigKubelet := filepath.Join(envInfo.DataDir, "agent", "kubelet.kubeconfig")
-	if err := deps.KubeConfig(kubeconfigKubelet, proxy.APIServerURL(), serverCAFile, clientKubeletCert, clientKubeletKey); err != nil {
+	if err := deps.KubeConfig(kubeconfigKubelet, apiServerURL, serverCAFile, clientKubeletCert, clientKubeletKey); err != nil {
 		return nil, err
 	}
 
 	clientKubeProxyCert := filepath.Join(envInfo.DataDir, "agent", "client-kube-proxy.crt")
 	clientKubeProxyKey := filepath.Join(envInfo.DataDir, "agent", "client-kube-proxy.key")
+
+	// Ask the server to send us its kube-proxy client cert+key. These files are not unique to this node.
 	if err := getHostFile(clientKubeProxyCert, clientKubeProxyKey, info); err != nil {
-		return nil, err
+		return nil, errors.Wrap(err, clientKubeProxyCert)
 	}
 
+	// Generate a kubeconfig for kube-proxy.
 	kubeconfigKubeproxy := filepath.Join(envInfo.DataDir, "agent", "kubeproxy.kubeconfig")
-	if err := deps.KubeConfig(kubeconfigKubeproxy, proxy.APIServerURL(), serverCAFile, clientKubeProxyCert, clientKubeProxyKey); err != nil {
+	if err := deps.KubeConfig(kubeconfigKubeproxy, apiServerURL, serverCAFile, clientKubeProxyCert, clientKubeProxyKey); err != nil {
 		return nil, err
 	}
 
 	clientK3sControllerCert := filepath.Join(envInfo.DataDir, "agent", "client-"+version.Program+"-controller.crt")
 	clientK3sControllerKey := filepath.Join(envInfo.DataDir, "agent", "client-"+version.Program+"-controller.key")
+
+	// Ask the server to send us its agent controller client cert+key. These files are not unique to this node.
 	if err := getHostFile(clientK3sControllerCert, clientK3sControllerKey, info); err != nil {
-		return nil, err
+		return nil, errors.Wrap(err, clientK3sControllerCert)
 	}
 
+	// Generate a kubeconfig for the agent controller.
 	kubeconfigK3sController := filepath.Join(envInfo.DataDir, "agent", version.Program+"controller.kubeconfig")
-	if err := deps.KubeConfig(kubeconfigK3sController, proxy.APIServerURL(), serverCAFile, clientK3sControllerCert, clientK3sControllerKey); err != nil {
+	if err := deps.KubeConfig(kubeconfigK3sController, apiServerURL, serverCAFile, clientK3sControllerCert, clientK3sControllerKey); err != nil {
 		return nil, err
 	}
 
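Note on the config.go changes above: the apiserver URL is now read from the proxy once, right after SetAPIServerPort, and that single value is reused for every generated kubeconfig; the credential fetches are also wrapped with the path of the file being requested, so a failure names the file involved. The standalone sketch below only illustrates that error-wrapping style, assuming the errors package in use is github.com/pkg/errors; the fetchCert helper and the path are made up for the example and are not part of the patch.

package main

import (
	"fmt"
	"os"

	"github.com/pkg/errors"
)

// fetchCert stands in for helpers like getServingCert/getHostFile; it simply
// fails so the wrapped error can be shown.
func fetchCert(path string) error {
	return fmt.Errorf("server is not ready")
}

func main() {
	servingKubeletCert := "/var/lib/rancher/k3s/agent/serving-kubelet.crt" // illustrative path
	if err := fetchCert(servingKubeletCert); err != nil {
		// errors.Wrap prefixes the cause with the file path, so the agent
		// log reports which credential could not be retrieved.
		fmt.Fprintln(os.Stderr, errors.Wrap(err, servingKubeletCert))
		// prints: /var/lib/rancher/k3s/agent/serving-kubelet.crt: server is not ready
	}
}
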
diff --git a/pkg/agent/loadbalancer/loadbalancer.go b/pkg/agent/loadbalancer/loadbalancer.go
index feddb4d872fc..36019470c8d2 100644
--- a/pkg/agent/loadbalancer/loadbalancer.go
+++ b/pkg/agent/loadbalancer/loadbalancer.go
@@ -172,6 +172,11 @@ func (lb *LoadBalancer) dialContext(ctx context.Context, network, _ string) (net
 				return conn, nil
 			}
 			logrus.Debugf("Dial error from load balancer %s: %s", lb.serviceName, err)
+			// Don't close connections to the failed server if we're retrying with health checks ignored.
+			// We don't want to disrupt active connections if it is unlikely they will have anywhere to go.
+			if !allChecksFailed {
+				defer server.closeAll()
+			}
 		}
 
 		newServer, err := lb.nextServer(targetServer)
@@ -179,7 +184,7 @@ func (lb *LoadBalancer) dialContext(ctx context.Context, network, _ string) (net
 			return nil, err
 		}
 		if targetServer != newServer {
-			logrus.Debugf("Failed over to new server for load balancer %s: %s", lb.serviceName, newServer)
+			logrus.Debugf("Failed over to new server for load balancer %s: %s -> %s", lb.serviceName, targetServer, newServer)
 		}
 		if ctx.Err() != nil {
 			return nil, ctx.Err()
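For context on the dialContext change above: connections to a server whose dial just failed are closed via a deferred closeAll, unless the load balancer is already retrying with health checks ignored, in which case active connections are left alone because they would likely have nowhere better to go. A rough standalone sketch of that conditional-cleanup pattern, with simplified stand-in types rather than the real loadbalancer package:

package main

import "fmt"

// server is a simplified stand-in for the load balancer's per-server state;
// conns represents the connections that would normally be tracked net.Conns.
type server struct {
	addr  string
	conns []string
}

func (s *server) closeAll() {
	fmt.Printf("closing %d connection(s) to %s\n", len(s.conns), s.addr)
	s.conns = nil
}

// dialOnce simulates one failed dial attempt against srv.
func dialOnce(srv *server, allChecksFailed bool) error {
	if !allChecksFailed {
		// Mirrors the patch: tear down existing connections to the failed
		// server so traffic can move to the next one...
		defer srv.closeAll()
	}
	// ...but skip that when health checks are being ignored, since the
	// remaining connections probably have nowhere better to go.
	return fmt.Errorf("dial tcp %s: connection refused", srv.addr)
}

func main() {
	srv := &server{addr: "10.0.0.10:6443", conns: []string{"a", "b"}}
	fmt.Println(dialOnce(srv, false)) // closes a and b
	fmt.Println(dialOnce(srv, true))  // leaves remaining connections alone
}
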
diff --git a/pkg/agent/proxy/apiproxy.go b/pkg/agent/proxy/apiproxy.go
index becc2a0defd2..e711623e467e 100644
--- a/pkg/agent/proxy/apiproxy.go
+++ b/pkg/agent/proxy/apiproxy.go
@@ -63,6 +63,7 @@ func NewSupervisorProxy(ctx context.Context, lbEnabled bool, dataDir, supervisor
 	p.fallbackSupervisorAddress = u.Host
 	p.supervisorPort = u.Port()
 
+	logrus.Debugf("Supervisor proxy using supervisor=%s apiserver=%s lb=%v", p.supervisorURL, p.apiServerURL, p.lbEnabled)
 	return &p, nil
 }
 
@@ -132,6 +133,11 @@ func (p *proxy) setSupervisorPort(addresses []string) []string {
 // load-balancer, and the address of this load-balancer is returned instead of the actual apiserver
 // addresses.
 func (p *proxy) SetAPIServerPort(port int, isIPv6 bool) error {
+	if p.apiServerEnabled {
+		logrus.Debugf("Supervisor proxy apiserver port already set")
+		return nil
+	}
+
 	u, err := url.Parse(p.initialSupervisorURL)
 	if err != nil {
 		return errors.Wrapf(err, "failed to parse server URL %s", p.initialSupervisorURL)
@@ -139,22 +145,23 @@ func (p *proxy) SetAPIServerPort(port int, isIPv6 bool) error {
 	p.apiServerPort = strconv.Itoa(port)
 	u.Host = sysnet.JoinHostPort(u.Hostname(), p.apiServerPort)
 
-	p.apiServerURL = u.String()
-	p.apiServerEnabled = true
-
 	if p.lbEnabled && p.apiServerLB == nil {
 		lbServerPort := p.lbServerPort
 		if lbServerPort != 0 {
 			lbServerPort = lbServerPort - 1
 		}
-		lb, err := loadbalancer.New(p.context, p.dataDir, loadbalancer.APIServerServiceName, p.apiServerURL, lbServerPort, isIPv6)
+		lb, err := loadbalancer.New(p.context, p.dataDir, loadbalancer.APIServerServiceName, u.String(), lbServerPort, isIPv6)
 		if err != nil {
 			return err
 		}
-		p.apiServerURL = lb.LoadBalancerServerURL()
 		p.apiServerLB = lb
+		p.apiServerURL = lb.LoadBalancerServerURL()
+	} else {
+		p.apiServerURL = u.String()
 	}
 
+	logrus.Debugf("Supervisor proxy apiserver port changed; apiserver=%s lb=%v", p.apiServerURL, p.lbEnabled)
+	p.apiServerEnabled = true
 	return nil
 }
 
diff --git a/pkg/agent/tunnel/tunnel.go b/pkg/agent/tunnel/tunnel.go
index 6245d529101b..23c6dac404b8 100644
--- a/pkg/agent/tunnel/tunnel.go
+++ b/pkg/agent/tunnel/tunnel.go
@@ -451,7 +451,7 @@ func (a *agentTunnel) connect(rootCtx context.Context, waitGroup *sync.WaitGroup
 			err := remotedialer.ConnectToProxy(ctx, wsURL, nil, auth, ws, onConnect)
 			connected = false
 			if err != nil && !errors.Is(err, context.Canceled) {
-				logrus.WithField("url", wsURL).WithError(err).Error("Remotedialer proxy error; reconecting...")
+				logrus.WithField("url", wsURL).WithError(err).Error("Remotedialer proxy error; reconnecting...")
 				// wait between reconnection attempts to avoid hammering the server
 				time.Sleep(endpointDebounceDelay)
 			}
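To make the core SetAPIServerPort fix concrete, here is a minimal, self-contained sketch of the re-entry behavior; the proxy type, URLs, and method names are simplified stand-ins for the real proxy package, not the actual implementation:

package main

import "fmt"

type proxy struct {
	apiServerEnabled bool
	lbCreated        bool
	apiServerURL     string
}

const (
	directURL = "https://server.example.com:6443" // hypothetical direct apiserver address
	lbURL     = "https://127.0.0.1:6444"          // hypothetical local load-balancer address
)

// setPortOld mirrors the pre-patch flow: the URL is reset to the direct
// address on every call, but is only pointed back at the load-balancer on
// the first call, when the load-balancer is created.
func (p *proxy) setPortOld() {
	p.apiServerURL = directURL
	p.apiServerEnabled = true
	if !p.lbCreated {
		p.lbCreated = true
		p.apiServerURL = lbURL
	}
}

// setPortNew mirrors the patched flow: once the port has been set, later
// calls return early and the load-balancer URL chosen on the first call
// is preserved.
func (p *proxy) setPortNew() {
	if p.apiServerEnabled {
		return
	}
	if !p.lbCreated {
		p.lbCreated = true
		p.apiServerURL = lbURL
	} else {
		p.apiServerURL = directURL
	}
	p.apiServerEnabled = true
}

func main() {
	buggy, fixed := &proxy{}, &proxy{}
	// The agent can call SetAPIServerPort more than once, e.g. while it
	// retries configuration because the server is not yet ready.
	for i := 0; i < 2; i++ {
		buggy.setPortOld()
		fixed.setPortNew()
	}
	fmt.Println("old:  ", buggy.apiServerURL) // direct address: the load-balancer is bypassed
	fmt.Println("fixed:", fixed.apiServerURL) // still the local load-balancer address
}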