Fix incorrect MTU configurations #5880

Merged 2 commits on Jan 26, 2024
4 changes: 3 additions & 1 deletion cmd/antrea-agent/agent.go
@@ -145,6 +145,7 @@ func run(o *Options) error {
nodeNetworkPolicyEnabled := features.DefaultFeatureGate.Enabled(features.NodeNetworkPolicy)
l7FlowExporterEnabled := features.DefaultFeatureGate.Enabled(features.L7FlowExporter)
enableMulticlusterGW := features.DefaultFeatureGate.Enabled(features.Multicluster) && o.config.Multicluster.EnableGateway
_, multiclusterEncryptionMode := config.GetTrafficEncryptionModeFromStr(o.config.Multicluster.TrafficEncryptionMode)
enableMulticlusterNP := features.DefaultFeatureGate.Enabled(features.Multicluster) && o.config.Multicluster.EnableStretchedNetworkPolicy
enableFlowExporter := features.DefaultFeatureGate.Enabled(features.FlowExporter) && o.config.FlowExporter.Enable
var nodeIPTracker *nodeip.Tracker
@@ -210,7 +211,8 @@ func run(o *Options) error {
IPsecConfig: config.IPsecConfig{
AuthenticationMode: ipsecAuthenticationMode,
},
EnableMulticlusterGW: enableMulticlusterGW,
EnableMulticlusterGW: enableMulticlusterGW,
MulticlusterEncryptionMode: multiclusterEncryptionMode,
}

wireguardConfig := &config.WireGuardConfig{
6 changes: 1 addition & 5 deletions pkg/agent/agent.go
@@ -1094,7 +1094,7 @@ func (i *Initializer) waitForIPsecMonitorDaemon() error {

// initializeWireGuard checks if preconditions are met for using WireGuard and initializes the WireGuard client or cleans up.
func (i *Initializer) initializeWireGuard() error {
i.wireGuardConfig.MTU = i.nodeConfig.NodeTransportInterfaceMTU - config.WireGuardOverhead
i.wireGuardConfig.MTU = i.nodeConfig.NodeTransportInterfaceMTU - i.networkConfig.WireGuardMTUDeduction
wgClient, err := wireguard.New(i.nodeConfig, i.wireGuardConfig)
if err != nil {
return err
@@ -1197,10 +1197,6 @@ func (i *Initializer) getInterfaceMTU(transportInterface *net.Interface) (int, e

isIPv6 := i.nodeConfig.NodeIPv6Addr != nil
mtu -= i.networkConfig.CalculateMTUDeduction(isIPv6)

if i.networkConfig.TrafficEncryptionMode == config.TrafficEncryptionModeIPSec {
mtu -= config.IPSecESPOverhead
}
return mtu, nil
}
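To make the intent of these two edits concrete: the Pod interface MTU is now derived entirely from CalculateMTUDeduction, while the in-cluster WireGuard device MTU uses the separately tracked WireGuardMTUDeduction. The following is a minimal hypothetical sketch of that split; deriveMTUs and its parameters are illustrative only, not code from this repository.

// Hypothetical helper, for illustration only: the two MTU values the agent
// configures after this change, given the transport interface MTU and the
// deductions computed by the NetworkConfig.
func deriveMTUs(transportMTU, mtuDeduction, wireGuardMTUDeduction int) (podMTU, wireGuardMTU int) {
	// Pod interface MTU: all encapsulation/encryption overhead is folded into
	// MTUDeduction now, so the IPsec special case removed above is no longer needed here.
	podMTU = transportMTU - mtuDeduction
	// WireGuard device MTU: only the encryption overhead applies.
	wireGuardMTU = transportMTU - wireGuardMTUDeduction
	return podMTU, wireGuardMTU
}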

76 changes: 55 additions & 21 deletions pkg/agent/config/node_config.go
@@ -34,12 +34,15 @@ const (
)

const (
vxlanOverhead = 50
geneveOverhead = 50
greOverhead = 38
vxlanOverhead = 50
geneveOverhead = 50
// GRE overhead: 14-byte outer MAC, 20-byte outer IPv4, 8-byte GRE header (4-byte standard header + 4-byte key field)
greOverhead = 42

ipv6ExtraOverhead = 20

WireGuardOverhead = 80
// WireGuard overhead: 20-byte outer IPv4, 8-byte UDP header, 4-byte type, 4-byte key index, 8-byte nonce, 16-byte authentication tag
WireGuardOverhead = 60
// IPsec ESP can add a maximum of 38 bytes to the packet including the ESP
// header and trailer.
IPSecESPOverhead = 38
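As a quick sanity check of the revised constants, here is a small, self-contained sketch showing the effective MTU each overhead implies. The 1500-byte transport MTU is an assumed example value, not something fixed by this change.

package main

import "fmt"

func main() {
	const transportMTU = 1500
	// The subtracted values mirror the constants above; the results are illustrative.
	fmt.Println(transportMTU - 50)      // Geneve or VXLAN encap: 1450
	fmt.Println(transportMTU - 42)      // GRE encap (14+20+8): 1458
	fmt.Println(transportMTU - 60)      // WireGuard (20+8+4+4+8+16): 1440
	fmt.Println(transportMTU - 50 - 38) // Geneve encap + IPsec ESP: 1412
}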
@@ -209,14 +212,19 @@ type NetworkConfig struct {
TransportIfaceCIDRs []string
IPv4Enabled bool
IPv6Enabled bool
// MTUDeduction only counts IPv4 tunnel overhead, no IPsec and WireGuard overhead.
// MTUDeduction is the MTU deduction for encapsulation and encryption within the cluster.
MTUDeduction int
// WireGuardMTUDeduction is the MTU deduction for WireGuard encryption.
// It is calculated based on whether IPv6 is used.
WireGuardMTUDeduction int
// Set by the defaultMTU config option or auto discovered.
// Auto discovery will use MTU value of the Node's transport interface.
// For Encap and Hybrid mode, InterfaceMTU will be adjusted to account for
// encap header.
InterfaceMTU int
EnableMulticlusterGW bool
InterfaceMTU int

EnableMulticlusterGW bool
MulticlusterEncryptionMode TrafficEncryptionModeType
}

// IsIPv4Enabled returns true if the cluster network supports IPv4. Legal cases are:
@@ -279,24 +287,50 @@ func (nc *NetworkConfig) NeedsDirectRoutingToPeer(peerIP net.IP, localIP *net.IP
return (nc.TrafficEncapMode == TrafficEncapModeNoEncap || nc.TrafficEncapMode == TrafficEncapModeHybrid) && localIP.Contains(peerIP)
}

func (nc *NetworkConfig) getEncapMTUDeduction(isIPv6 bool) int {
var deduction int
if nc.TunnelType == ovsconfig.VXLANTunnel {
deduction = vxlanOverhead
} else if nc.TunnelType == ovsconfig.GeneveTunnel {
deduction = geneveOverhead
} else if nc.TunnelType == ovsconfig.GRETunnel {
deduction = greOverhead
} else {
return 0
}
if isIPv6 {
deduction += ipv6ExtraOverhead
}
return deduction
}

func (nc *NetworkConfig) CalculateMTUDeduction(isIPv6 bool) int {
var mtuDeduction int
// When Multi-cluster Gateway is enabled, we need to reduce MTU for potential cross-cluster traffic.
if nc.TrafficEncapMode.SupportsEncap() || nc.EnableMulticlusterGW {
if nc.TunnelType == ovsconfig.VXLANTunnel {
mtuDeduction = vxlanOverhead
} else if nc.TunnelType == ovsconfig.GeneveTunnel {
mtuDeduction = geneveOverhead
} else if nc.TunnelType == ovsconfig.GRETunnel {
mtuDeduction = greOverhead
}
nc.WireGuardMTUDeduction = WireGuardOverhead
if isIPv6 {
nc.WireGuardMTUDeduction += ipv6ExtraOverhead
}

if nc.TrafficEncapMode.SupportsEncap() && isIPv6 {
mtuDeduction += ipv6ExtraOverhead
if nc.EnableMulticlusterGW {
nc.MTUDeduction = nc.getEncapMTUDeduction(isIPv6)
// When multi-cluster WireGuard is enabled, cross-cluster traffic will be encapsulated and encrypted, so we need to
// reduce MTU for both encapsulation and encryption.
if nc.MulticlusterEncryptionMode == TrafficEncryptionModeWireGuard {
nc.MTUDeduction += nc.WireGuardMTUDeduction
}
return nc.MTUDeduction
}
if nc.TrafficEncapMode.SupportsEncap() {
nc.MTUDeduction = nc.getEncapMTUDeduction(isIPv6)
}
if nc.TrafficEncryptionMode == TrafficEncryptionModeWireGuard {
// When WireGuard is enabled, cross-node traffic will only be encrypted, so we just reduce MTU for encryption.
nc.MTUDeduction = nc.WireGuardMTUDeduction
} else if nc.TrafficEncryptionMode == TrafficEncryptionModeIPSec {
// When IPsec is enabled, cross-node traffic will be encapsulated and encrypted, so we need to reduce MTU for both
// encapsulation and encryption.
nc.MTUDeduction += IPSecESPOverhead
}
nc.MTUDeduction = mtuDeduction
return mtuDeduction
return nc.MTUDeduction
}
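A few worked examples of the branch logic above may help when reading the new unit tests in the next file; the arithmetic below assumes a Geneve tunnel over IPv4 unless noted, and the figures line up with the expected values in those tests.

// Worked examples (illustrative, IPv4 unless noted):
//   Geneve encap only:                                        50
//   Geneve encap + IPsec:                       50 + 38  =    88
//   WireGuard encryption (replaces the encap deduction):      60
//   WireGuard encryption, IPv6:                 60 + 20  =    80
//   Multicluster gateway (Geneve) + multicluster WireGuard: 50 + 60 = 110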

// ServiceConfig includes K8s Service CIDR and available IP addresses for NodePort.
48 changes: 47 additions & 1 deletion pkg/agent/config/node_config_test.go
@@ -298,14 +298,60 @@ func TestCalculateMTUDeduction(t *testing.T) {
{
name: "GRE encap without IPv6",
nc: &NetworkConfig{TunnelType: ovsconfig.GRETunnel},
expectedMTUDeduction: 38,
expectedMTUDeduction: 42,
},
{
name: "Default encap with IPv6",
nc: &NetworkConfig{TunnelType: ovsconfig.GeneveTunnel},
isIPv6: true,
expectedMTUDeduction: 70,
},
{
name: "WireGuard enabled",
nc: &NetworkConfig{TrafficEncryptionMode: TrafficEncryptionModeWireGuard},
expectedMTUDeduction: 60,
},
{
name: "IPv6 with WireGuard enabled",
nc: &NetworkConfig{TrafficEncryptionMode: TrafficEncryptionModeWireGuard},
isIPv6: true,
expectedMTUDeduction: 80,
},
{
name: "Multicluster enabled with Geneve encap",
nc: &NetworkConfig{TunnelType: ovsconfig.GeneveTunnel, EnableMulticlusterGW: true},
expectedMTUDeduction: 50,
},
{
name: "Geneve encap with Multicluster WireGuard enabled",
nc: &NetworkConfig{
TunnelType: ovsconfig.GeneveTunnel,
EnableMulticlusterGW: true,
MulticlusterEncryptionMode: TrafficEncryptionModeWireGuard,
},
expectedMTUDeduction: 110,
},
{
name: "Geneve encap with IPSec enabled",
nc: &NetworkConfig{TunnelType: ovsconfig.GeneveTunnel, TrafficEncryptionMode: TrafficEncryptionModeIPSec},
expectedMTUDeduction: 88,
},
{
name: "Geneve encap with IPSec enabled and IPv6",
nc: &NetworkConfig{TunnelType: ovsconfig.GeneveTunnel, TrafficEncryptionMode: TrafficEncryptionModeIPSec},
isIPv6: true,
expectedMTUDeduction: 108,
},
{
name: "VXLan encap with IPSec enabled",
nc: &NetworkConfig{TunnelType: ovsconfig.VXLANTunnel, TrafficEncryptionMode: TrafficEncryptionModeIPSec},
expectedMTUDeduction: 88,
},
{
name: "GRE encap with IPSec enabled",
nc: &NetworkConfig{TunnelType: ovsconfig.GRETunnel, TrafficEncryptionMode: TrafficEncryptionModeIPSec},
expectedMTUDeduction: 80,
},
}

for _, tt := range tests {
4 changes: 3 additions & 1 deletion pkg/agent/multicluster/mc_route_controller.go
@@ -129,7 +129,9 @@ func NewMCDefaultRouteController(
controller.wireGuardConfig = &config.WireGuardConfig{
Port: multiclusterConfig.WireGuard.Port,
Name: multiclusterWireGuardInterface,
MTU: controller.nodeConfig.NodeTransportInterfaceMTU - controller.networkConfig.MTUDeduction - config.WireGuardOverhead,
// Regardless of the tunnel type, the WireGuard device must only reduce MTU for encryption because the
// packets it transmits have been encapsulated.
MTU: nodeConfig.NodeTransportInterfaceMTU - networkConfig.WireGuardMTUDeduction,
}
}
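To illustrate the comment above with assumed numbers (a hypothetical 1500-byte transport interface and a Geneve tunnel, values not taken from this diff):

// Illustrative arithmetic only:
//   Pod interface MTU:                 1500 - (50 + 60) = 1390  // encap + WireGuard (MTUDeduction)
//   Multicluster WireGuard device MTU: 1500 - 60        = 1440  // encryption only (WireGuardMTUDeduction)

A 1390-byte Pod packet grows to 1440 bytes after Geneve encapsulation, exactly filling the WireGuard device MTU, and to 1500 bytes after encryption, which is why the device only needs to account for the encryption overhead.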
controller.gwInformer.Informer().AddEventHandlerWithResyncPeriod(
2 changes: 1 addition & 1 deletion test/e2e-secondary-network/secondary_network_test.go
@@ -198,7 +198,7 @@ func (data *TestData) pingBetweenInterfaces(t *testing.T) error {
} else {
IPToPing = antreae2e.PodIPs{IPv6: &ip}
}
err := data.e2eTestData.RunPingCommandFromTestPod(antreae2e.PodInfo{Name: podData[sourcePod].nameOfPods, OS: osType, NodeName: clusterInfo.controlPlaneNodeName, Namespace: nameSpace}, nameSpace, &IPToPing, ctrName, count, size)
err := data.e2eTestData.RunPingCommandFromTestPod(antreae2e.PodInfo{Name: podData[sourcePod].nameOfPods, OS: osType, NodeName: clusterInfo.controlPlaneNodeName, Namespace: nameSpace}, nameSpace, &IPToPing, ctrName, count, size, false)
if err == nil {
logs.Infof("Ping '%s' -> '%s'( Interface: %s, IP Address: %s): OK", podData[sourcePod].nameOfPods, podData[targetPod].nameOfPods, podData[targetPod].nameOfInterfacePerPod[targetInterface], secondaryIpAddress)
} else {
10 changes: 5 additions & 5 deletions test/e2e/antreaipam_test.go
@@ -273,16 +273,16 @@ func testAntreaIPAMPodConnectivitySameNode(t *testing.T, data *TestData) {
})
workerNode := workerNodeName(1)

t.Logf("Creating %d agnhost Pods on '%s'", numPods+1, workerNode)
t.Logf("Creating %d toolbox Pods on '%s'", numPods+1, workerNode)
for i := range PodInfos {
PodInfos[i].OS = clusterInfo.nodesOS[workerNode]
if err := data.createAgnhostPodOnNodeWithAnnotations(PodInfos[i].Name, PodInfos[i].Namespace, workerNode, nil); err != nil {
t.Fatalf("Error when creating agnhost test Pod '%s': %v", PodInfos[i], err)
if err := data.createToolboxPodOnNode(PodInfos[i].Name, PodInfos[i].Namespace, workerNode, false); err != nil {
t.Fatalf("Error when creating toolbox test Pod '%s': %v", PodInfos[i], err)
}
defer deletePodWrapper(t, data, PodInfos[i].Namespace, PodInfos[i].Name)
}

data.runPingMesh(t, PodInfos, agnhostContainerName)
data.runPingMesh(t, PodInfos, toolboxContainerName, true)
}

func testAntreaIPAMPodConnectivityDifferentNodes(t *testing.T, data *TestData) {
@@ -296,7 +296,7 @@ func testAntreaIPAMPodConnectivityDifferentNodes(t *testing.T, data *TestData) {
}
PodInfos = append(PodInfos, createdPodInfos...)
}
data.runPingMesh(t, PodInfos, agnhostContainerName)
data.runPingMesh(t, PodInfos, toolboxContainerName, true)
}

func testAntreaIPAMStatefulSet(t *testing.T, data *TestData, dedicatedIPPoolKey *string) {
36 changes: 19 additions & 17 deletions test/e2e/connectivity_test.go
@@ -93,7 +93,9 @@ func waitForPodIPs(t *testing.T, data *TestData, podInfos []PodInfo) map[string]

// runPingMesh runs a ping mesh between all the provided Pods after first retrieving their IP
// addresses.
func (data *TestData) runPingMesh(t *testing.T, podInfos []PodInfo, ctrname string) {
// When dontFragment is true, the ping uses the maximum packet size that the MTU allows and sets the DF flag to
// validate that the MTU is correct.
func (data *TestData) runPingMesh(t *testing.T, podInfos []PodInfo, ctrname string, dontFragment bool) {
podIPs := waitForPodIPs(t, data, podInfos)

t.Logf("Ping mesh test between all Pods")
@@ -110,7 +112,7 @@ func (data *TestData) runPingMesh(t *testing.T, podInfos []PodInfo, ctrname stri
if pi2.Namespace != "" {
pod2Namespace = pi2.Namespace
}
if err := data.RunPingCommandFromTestPod(pi1, podNamespace, podIPs[pi2.Name], ctrname, pingCount, 0); err != nil {
if err := data.RunPingCommandFromTestPod(pi1, podNamespace, podIPs[pi2.Name], ctrname, pingCount, 0, dontFragment); err != nil {
t.Errorf("Ping '%s' -> '%s': ERROR (%v)", k8s.NamespacedName(podNamespace, pi1.Name), k8s.NamespacedName(pod2Namespace, pi2.Name), err)
} else {
t.Logf("Ping '%s' -> '%s': OK", k8s.NamespacedName(podNamespace, pi1.Name), k8s.NamespacedName(pod2Namespace, pi2.Name))
@@ -131,16 +133,16 @@ func (data *TestData) testPodConnectivitySameNode(t *testing.T) {
workerNode = workerNodeName(clusterInfo.windowsNodes[0])
}

t.Logf("Creating %d agnhost Pods on '%s'", numPods, workerNode)
t.Logf("Creating %d toolbox Pods on '%s'", numPods, workerNode)
for i := range podInfos {
podInfos[i].OS = clusterInfo.nodesOS[workerNode]
if err := data.createAgnhostPodOnNode(podInfos[i].Name, data.testNamespace, workerNode, false); err != nil {
t.Fatalf("Error when creating agnhost test Pod '%s': %v", podInfos[i], err)
if err := data.createToolboxPodOnNode(podInfos[i].Name, data.testNamespace, workerNode, false); err != nil {
t.Fatalf("Error when creating toolbox test Pod '%s': %v", podInfos[i], err)
}
defer deletePodWrapper(t, data, data.testNamespace, podInfos[i].Name)
}

data.runPingMesh(t, podInfos, agnhostContainerName)
data.runPingMesh(t, podInfos, toolboxContainerName, true)
}

// testPodConnectivityOnSameNode checks that Pods running on the same Node can reach each other, by
Expand Down Expand Up @@ -185,13 +187,13 @@ func testHostPortPodConnectivity(t *testing.T, data *TestData) {
data.testHostPortPodConnectivity(t, data.testNamespace, data.testNamespace)
}

// createPodsOnDifferentNodes creates agnhost Pods through a DaemonSet. This function returns information of the created
// createPodsOnDifferentNodes creates toolbox Pods through a DaemonSet. This function returns information of the created
// Pods as well as a function which will delete the Pods when called. Since Pods can be on Nodes of different OSes, a podInfo
// slice instead of a PodName slice is used to inform the caller of the correct commands and options. Linux and Windows Pods
// alternate in this podInfo slice so that the test can cover different connectivity cases between different OSes.
func createPodsOnDifferentNodes(t *testing.T, data *TestData, namespace, tag string) (podInfos []PodInfo, cleanup func() error) {
dsName := "connectivity-test" + tag
_, deleteDaemonSet, err := data.createDaemonSet(dsName, namespace, agnhostContainerName, agnhostImage, []string{"sleep", "3600"}, nil)
_, deleteDaemonSet, err := data.createDaemonSet(dsName, namespace, toolboxContainerName, toolboxImage, []string{"sleep", "3600"}, nil)
if err != nil {
t.Fatalf("Error when creating DaemonSet '%s': %v", dsName, err)
}
@@ -264,7 +266,7 @@ func (data *TestData) testPodConnectivityDifferentNodes(t *testing.T) {
if len(podInfos) > maxPods {
podInfos = podInfos[:maxPods]
}
data.runPingMesh(t, podInfos[:numPods], agnhostContainerName)
data.runPingMesh(t, podInfos[:numPods], toolboxContainerName, true)
}

// testPodConnectivityDifferentNodes checks that Pods running on different Nodes can reach each
@@ -315,11 +317,11 @@ func testPodConnectivityAfterAntreaRestart(t *testing.T, data *TestData, namespa
podInfos, deletePods := createPodsOnDifferentNodes(t, data, namespace, "antrearestart")
defer deletePods()

data.runPingMesh(t, podInfos[:numPods], agnhostContainerName)
data.runPingMesh(t, podInfos[:numPods], toolboxContainerName, true)

data.redeployAntrea(t, deployAntreaDefault)

data.runPingMesh(t, podInfos[:numPods], agnhostContainerName)
data.runPingMesh(t, podInfos[:numPods], toolboxContainerName, true)
}

// testOVSRestartSameNode verifies that datapath flows are not removed when the Antrea Agent Pod is
@@ -396,16 +398,16 @@ func testOVSFlowReplay(t *testing.T, data *TestData, namespace string) {
}
workerNode := workerNodeName(1)

t.Logf("Creating %d busybox test Pods on '%s'", numPods, workerNode)
t.Logf("Creating %d toolbox test Pods on '%s'", numPods, workerNode)
for i := range podInfos {
podInfos[i].OS = clusterInfo.nodesOS[workerNode]
if err := data.createBusyboxPodOnNode(podInfos[i].Name, namespace, workerNode, false); err != nil {
t.Fatalf("Error when creating busybox test Pod '%s': %v", podInfos[i].Name, err)
if err := data.createToolboxPodOnNode(podInfos[i].Name, namespace, workerNode, false); err != nil {
t.Fatalf("Error when creating toolbox test Pod '%s': %v", podInfos[i].Name, err)
}
defer deletePodWrapper(t, data, namespace, podInfos[i].Name)
}

data.runPingMesh(t, podInfos, busyboxContainerName)
data.runPingMesh(t, podInfos, toolboxContainerName, true)

var antreaPodName string
var err error
@@ -487,7 +489,7 @@ func testOVSFlowReplay(t *testing.T, data *TestData, namespace string) {
// This should give Antrea ~10s to restore flows, since we generate 10 "pings" with a 1s
// interval.
t.Logf("Running second ping mesh to check that flows have been restored")
data.runPingMesh(t, podInfos, busyboxContainerName)
data.runPingMesh(t, podInfos, toolboxContainerName, true)

flows2, groups2 := dumpFlows(), dumpGroups()
numFlows2, numGroups2 := len(flows2), len(groups2)
@@ -515,7 +517,7 @@ func testPingLargeMTU(t *testing.T, data *TestData) {

pingSize := 2000
t.Logf("Running ping with size %d between Pods %s and %s", pingSize, podInfos[0].Name, podInfos[1].Name)
if err := data.RunPingCommandFromTestPod(podInfos[0], data.testNamespace, podIPs[podInfos[1].Name], agnhostContainerName, pingCount, pingSize); err != nil {
if err := data.RunPingCommandFromTestPod(podInfos[0], data.testNamespace, podIPs[podInfos[1].Name], toolboxContainerName, pingCount, pingSize, false); err != nil {
t.Error(err)
}
}