Skip to content

Commit

Permalink
Fix bug that caused agents to bypass local loadbalancer
Browse files Browse the repository at this point in the history
If proxy.SetAPIServerPort was called multiple times, all calls after the
first one would cause the apiserver address to be set to the default
server address, bypassing the local load-balancer. This was most likely
to occur on RKE2, where the supervisor may be up for a period of time
before it is ready to manage node password secrets, causing the agent
to retry.

Signed-off-by: Brad Davidson <brad.davidson@rancher.com>
  • Loading branch information
brandond committed Jun 4, 2024
1 parent 79ba10f commit 1661f10
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 15 deletions.
30 changes: 22 additions & 8 deletions pkg/agent/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -375,9 +375,10 @@ func get(ctx context.Context, envInfo *cmds.Agent, proxy proxy.Proxy) (*config.N
if controlConfig.SupervisorPort != controlConfig.HTTPSPort {
isIPv6 := utilsnet.IsIPv6(net.ParseIP([]string{envInfo.NodeIP.String()}[0]))
if err := proxy.SetAPIServerPort(controlConfig.HTTPSPort, isIPv6); err != nil {
return nil, errors.Wrapf(err, "failed to setup access to API Server port %d on at %s", controlConfig.HTTPSPort, proxy.SupervisorURL())
return nil, errors.Wrapf(err, "failed to set apiserver port to %d", controlConfig.HTTPSPort)
}
}
apiServerURL := proxy.APIServerURL()

var flannelIface *net.Interface
if controlConfig.FlannelBackend != config.FlannelBackendNone && len(envInfo.FlannelIface) > 0 {
Expand Down Expand Up @@ -482,40 +483,53 @@ func get(ctx context.Context, envInfo *cmds.Agent, proxy proxy.Proxy) (*config.N

os.Setenv("NODE_NAME", nodeName)

// Ensure that the kubelet's server certificate is valid for all configured node IPs. Note
// that in the case of an external CCM, additional IPs may be added by the infra provider
// that the cert will not be valid for, as they are not present in the list collected here.
nodeExternalAndInternalIPs := append(nodeIPs, nodeExternalIPs...)

// Ask the server to generate a kubelet server cert+key. These files are unique to this node.
servingCert, err := getServingCert(nodeName, nodeExternalAndInternalIPs, servingKubeletCert, servingKubeletKey, newNodePasswordFile, info)
if err != nil {
return nil, err
return nil, errors.Wrap(err, servingKubeletCert)
}

// Ask the server to genrate a kubelet client cert+key. These files are unique to this node.
if err := getNodeNamedHostFile(clientKubeletCert, clientKubeletKey, nodeName, nodeIPs, newNodePasswordFile, info); err != nil {
return nil, err
return nil, errors.Wrap(err, clientKubeletCert)
}

// Generate a kubeconfig for the kubelet.
kubeconfigKubelet := filepath.Join(envInfo.DataDir, "agent", "kubelet.kubeconfig")
if err := deps.KubeConfig(kubeconfigKubelet, proxy.APIServerURL(), serverCAFile, clientKubeletCert, clientKubeletKey); err != nil {
if err := deps.KubeConfig(kubeconfigKubelet, apiServerURL, serverCAFile, clientKubeletCert, clientKubeletKey); err != nil {
return nil, err
}

clientKubeProxyCert := filepath.Join(envInfo.DataDir, "agent", "client-kube-proxy.crt")
clientKubeProxyKey := filepath.Join(envInfo.DataDir, "agent", "client-kube-proxy.key")

// Ask the server to send us its kube-proxy client cert+key. These files are not unique to this node.
if err := getHostFile(clientKubeProxyCert, clientKubeProxyKey, info); err != nil {
return nil, err
return nil, errors.Wrap(err, clientKubeProxyCert)
}

// Generate a kubeconfig for kube-proxy.
kubeconfigKubeproxy := filepath.Join(envInfo.DataDir, "agent", "kubeproxy.kubeconfig")
if err := deps.KubeConfig(kubeconfigKubeproxy, proxy.APIServerURL(), serverCAFile, clientKubeProxyCert, clientKubeProxyKey); err != nil {
if err := deps.KubeConfig(kubeconfigKubeproxy, apiServerURL, serverCAFile, clientKubeProxyCert, clientKubeProxyKey); err != nil {
return nil, err
}

clientK3sControllerCert := filepath.Join(envInfo.DataDir, "agent", "client-"+version.Program+"-controller.crt")
clientK3sControllerKey := filepath.Join(envInfo.DataDir, "agent", "client-"+version.Program+"-controller.key")

// Ask the server to send us its agent controller client cert+key. These files are not unique to this node.
if err := getHostFile(clientK3sControllerCert, clientK3sControllerKey, info); err != nil {
return nil, err
return nil, errors.Wrap(err, clientK3sControllerCert)
}

// Generate a kubeconfig for the agent controller.
kubeconfigK3sController := filepath.Join(envInfo.DataDir, "agent", version.Program+"controller.kubeconfig")
if err := deps.KubeConfig(kubeconfigK3sController, proxy.APIServerURL(), serverCAFile, clientK3sControllerCert, clientK3sControllerKey); err != nil {
if err := deps.KubeConfig(kubeconfigK3sController, apiServerURL, serverCAFile, clientK3sControllerCert, clientK3sControllerKey); err != nil {
return nil, err
}

Expand Down
7 changes: 6 additions & 1 deletion pkg/agent/loadbalancer/loadbalancer.go
Original file line number Diff line number Diff line change
Expand Up @@ -172,14 +172,19 @@ func (lb *LoadBalancer) dialContext(ctx context.Context, network, _ string) (net
return conn, nil
}
logrus.Debugf("Dial error from load balancer %s: %s", lb.serviceName, err)
// Don't close connections to the failed server if we're retrying with health checks ignored.
// We don't want to disrupt active connections if it is unlikely they will have anywhere to go.
if !allChecksFailed {
defer server.closeAll()
}
}

newServer, err := lb.nextServer(targetServer)
if err != nil {
return nil, err
}
if targetServer != newServer {
logrus.Debugf("Failed over to new server for load balancer %s: %s", lb.serviceName, newServer)
logrus.Debugf("Failed over to new server for load balancer %s: %s -> %s", lb.serviceName, targetServer, newServer)
}
if ctx.Err() != nil {
return nil, ctx.Err()
Expand Down
17 changes: 12 additions & 5 deletions pkg/agent/proxy/apiproxy.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ func NewSupervisorProxy(ctx context.Context, lbEnabled bool, dataDir, supervisor
p.fallbackSupervisorAddress = u.Host
p.supervisorPort = u.Port()

logrus.Debugf("Supervisor proxy using supervisor=%s apiserver=%s lb=%v", p.supervisorURL, p.apiServerURL, p.lbEnabled)
return &p, nil
}

Expand Down Expand Up @@ -132,29 +133,35 @@ func (p *proxy) setSupervisorPort(addresses []string) []string {
// load-balancer, and the address of this load-balancer is returned instead of the actual apiserver
// addresses.
func (p *proxy) SetAPIServerPort(port int, isIPv6 bool) error {
if p.apiServerEnabled {
logrus.Debugf("Supervisor proxy apiserver port already set")
return nil
}

u, err := url.Parse(p.initialSupervisorURL)
if err != nil {
return errors.Wrapf(err, "failed to parse server URL %s", p.initialSupervisorURL)
}
p.apiServerPort = strconv.Itoa(port)
u.Host = sysnet.JoinHostPort(u.Hostname(), p.apiServerPort)

p.apiServerURL = u.String()
p.apiServerEnabled = true

if p.lbEnabled && p.apiServerLB == nil {
lbServerPort := p.lbServerPort
if lbServerPort != 0 {
lbServerPort = lbServerPort - 1
}
lb, err := loadbalancer.New(p.context, p.dataDir, loadbalancer.APIServerServiceName, p.apiServerURL, lbServerPort, isIPv6)
lb, err := loadbalancer.New(p.context, p.dataDir, loadbalancer.APIServerServiceName, u.String(), lbServerPort, isIPv6)
if err != nil {
return err
}
p.apiServerURL = lb.LoadBalancerServerURL()
p.apiServerLB = lb
p.apiServerURL = lb.LoadBalancerServerURL()
} else {
p.apiServerURL = u.String()
}

logrus.Debugf("Supervisor proxy apiserver port changed; apiserver=%s lb=%v", p.apiServerURL, p.lbEnabled)
p.apiServerEnabled = true
return nil
}

Expand Down
2 changes: 1 addition & 1 deletion pkg/agent/tunnel/tunnel.go
Original file line number Diff line number Diff line change
Expand Up @@ -451,7 +451,7 @@ func (a *agentTunnel) connect(rootCtx context.Context, waitGroup *sync.WaitGroup
err := remotedialer.ConnectToProxy(ctx, wsURL, nil, auth, ws, onConnect)
connected = false
if err != nil && !errors.Is(err, context.Canceled) {
logrus.WithField("url", wsURL).WithError(err).Error("Remotedialer proxy error; reconecting...")
logrus.WithField("url", wsURL).WithError(err).Error("Remotedialer proxy error; reconnecting...")
// wait between reconnection attempts to avoid hammering the server
time.Sleep(endpointDebounceDelay)
}
Expand Down

0 comments on commit 1661f10

Please sign in to comment.