Fix incorrect MTU configurations #5880

Merged 2 commits on Jan 26, 2024
4 changes: 3 additions & 1 deletion cmd/antrea-agent/agent.go
@@ -145,6 +145,7 @@ func run(o *Options) error {
nodeNetworkPolicyEnabled := features.DefaultFeatureGate.Enabled(features.NodeNetworkPolicy)
l7FlowExporterEnabled := features.DefaultFeatureGate.Enabled(features.L7FlowExporter)
enableMulticlusterGW := features.DefaultFeatureGate.Enabled(features.Multicluster) && o.config.Multicluster.EnableGateway
_, multiclusterEncryptionMode := config.GetTrafficEncryptionModeFromStr(o.config.Multicluster.TrafficEncryptionMode)
enableMulticlusterNP := features.DefaultFeatureGate.Enabled(features.Multicluster) && o.config.Multicluster.EnableStretchedNetworkPolicy
enableFlowExporter := features.DefaultFeatureGate.Enabled(features.FlowExporter) && o.config.FlowExporter.Enable
var nodeIPTracker *nodeip.Tracker
@@ -210,7 +211,8 @@ func run(o *Options) error {
IPsecConfig: config.IPsecConfig{
AuthenticationMode: ipsecAuthenticationMode,
},
EnableMulticlusterGW: enableMulticlusterGW,
EnableMulticlusterGW: enableMulticlusterGW,
MulticlusterEncryptionMode: multiclusterEncryptionMode,
}

wireguardConfig := &config.WireGuardConfig{
6 changes: 1 addition & 5 deletions pkg/agent/agent.go
@@ -1094,7 +1094,7 @@ func (i *Initializer) waitForIPsecMonitorDaemon() error {

// initializeWireGuard checks if preconditions are met for using WireGuard and initializes the WireGuard client or cleans up.
func (i *Initializer) initializeWireGuard() error {
i.wireGuardConfig.MTU = i.nodeConfig.NodeTransportInterfaceMTU - config.WireGuardOverhead
i.wireGuardConfig.MTU = i.nodeConfig.NodeTransportInterfaceMTU - i.networkConfig.WireGuardMTUDeduction
wgClient, err := wireguard.New(i.nodeConfig, i.wireGuardConfig)
if err != nil {
return err
@@ -1197,10 +1197,6 @@ func (i *Initializer) getInterfaceMTU(transportInterface *net.Interface) (int, e

isIPv6 := i.nodeConfig.NodeIPv6Addr != nil
mtu -= i.networkConfig.CalculateMTUDeduction(isIPv6)

if i.networkConfig.TrafficEncryptionMode == config.TrafficEncryptionModeIPSec {
mtu -= config.IPSecESPOverhead
}
return mtu, nil
}
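To make the intent of these two edits concrete: the Pod interface MTU is now derived entirely from CalculateMTUDeduction, while the in-cluster WireGuard device MTU uses the separately tracked WireGuardMTUDeduction. The following is a minimal hypothetical sketch of that split; deriveMTUs and its parameters are illustrative only, not code from this repository.

// Hypothetical helper, for illustration only: the two MTU values the agent
// configures after this change, given the transport interface MTU and the
// deductions computed by the NetworkConfig.
func deriveMTUs(transportMTU, mtuDeduction, wireGuardMTUDeduction int) (podMTU, wireGuardMTU int) {
	// Pod interface MTU: all encapsulation/encryption overhead is folded into
	// MTUDeduction now, so the IPsec special case removed above is no longer needed here.
	podMTU = transportMTU - mtuDeduction
	// WireGuard device MTU: only the encryption overhead applies.
	wireGuardMTU = transportMTU - wireGuardMTUDeduction
	return podMTU, wireGuardMTU
}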

76 changes: 55 additions & 21 deletions pkg/agent/config/node_config.go
@@ -34,12 +34,15 @@ const (
)

const (
vxlanOverhead = 50
geneveOverhead = 50
greOverhead = 38
vxlanOverhead = 50
geneveOverhead = 50
// GRE overhead: 14-byte outer MAC, 20-byte outer IPv4, 8-byte GRE header (4-byte standard header + 4-byte key field)
greOverhead = 42

ipv6ExtraOverhead = 20

WireGuardOverhead = 80
// WireGuard overhead: 20-byte outer IPv4, 8-byte UDP header, 4-byte type, 4-byte key index, 8-byte nonce, 16-byte authentication tag
WireGuardOverhead = 60
// IPsec ESP can add a maximum of 38 bytes to the packet including the ESP
// header and trailer.
IPSecESPOverhead = 38
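As a quick sanity check of the revised constants, here is a small, self-contained sketch showing the effective MTU each overhead implies. The 1500-byte transport MTU is an assumed example value, not something fixed by this change.

package main

import "fmt"

func main() {
	const transportMTU = 1500
	// The subtracted values mirror the constants above; the results are illustrative.
	fmt.Println(transportMTU - 50)      // Geneve or VXLAN encap: 1450
	fmt.Println(transportMTU - 42)      // GRE encap (14+20+8): 1458
	fmt.Println(transportMTU - 60)      // WireGuard (20+8+4+4+8+16): 1440
	fmt.Println(transportMTU - 50 - 38) // Geneve encap + IPsec ESP: 1412
}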
@@ -209,14 +212,19 @@ type NetworkConfig struct {
TransportIfaceCIDRs []string
IPv4Enabled bool
IPv6Enabled bool
// MTUDeduction only counts IPv4 tunnel overhead, no IPsec and WireGuard overhead.
// MTUDeduction is the MTU deduction for encapsulation and encryption within the cluster.
MTUDeduction int
// WireGuardMTUDeduction is the MTU deduction for WireGuard encryption.
// It is calculated based on whether IPv6 is used.
WireGuardMTUDeduction int
// Set by the defaultMTU config option or auto discovered.
// Auto discovery will use MTU value of the Node's transport interface.
// For Encap and Hybrid mode, InterfaceMTU will be adjusted to account for
// encap header.
InterfaceMTU int
EnableMulticlusterGW bool
InterfaceMTU int

EnableMulticlusterGW bool
MulticlusterEncryptionMode TrafficEncryptionModeType
}

// IsIPv4Enabled returns true if the cluster network supports IPv4. Legal cases are:
@@ -279,24 +287,50 @@ func (nc *NetworkConfig) NeedsDirectRoutingToPeer(peerIP net.IP, localIP *net.IP
return (nc.TrafficEncapMode == TrafficEncapModeNoEncap || nc.TrafficEncapMode == TrafficEncapModeHybrid) && localIP.Contains(peerIP)
}

func (nc *NetworkConfig) getEncapMTUDeduction(isIPv6 bool) int {
var deduction int
if nc.TunnelType == ovsconfig.VXLANTunnel {
deduction = vxlanOverhead
} else if nc.TunnelType == ovsconfig.GeneveTunnel {
deduction = geneveOverhead
} else if nc.TunnelType == ovsconfig.GRETunnel {
deduction = greOverhead
} else {
return 0
}
if isIPv6 {
deduction += ipv6ExtraOverhead
}
return deduction
}

func (nc *NetworkConfig) CalculateMTUDeduction(isIPv6 bool) int {
var mtuDeduction int
// When Multi-cluster Gateway is enabled, we need to reduce MTU for potential cross-cluster traffic.
if nc.TrafficEncapMode.SupportsEncap() || nc.EnableMulticlusterGW {
if nc.TunnelType == ovsconfig.VXLANTunnel {
mtuDeduction = vxlanOverhead
} else if nc.TunnelType == ovsconfig.GeneveTunnel {
mtuDeduction = geneveOverhead
} else if nc.TunnelType == ovsconfig.GRETunnel {
mtuDeduction = greOverhead
}
nc.WireGuardMTUDeduction = WireGuardOverhead
if isIPv6 {
nc.WireGuardMTUDeduction += ipv6ExtraOverhead
}

if nc.TrafficEncapMode.SupportsEncap() && isIPv6 {
mtuDeduction += ipv6ExtraOverhead
if nc.EnableMulticlusterGW {
nc.MTUDeduction = nc.getEncapMTUDeduction(isIPv6)
// When multi-cluster WireGuard is enabled, cross-cluster traffic will be encapsulated and encrypted, so we need to
// reduce MTU for both encapsulation and encryption.
if nc.MulticlusterEncryptionMode == TrafficEncryptionModeWireGuard {
nc.MTUDeduction += nc.WireGuardMTUDeduction
}
return nc.MTUDeduction
}
if nc.TrafficEncapMode.SupportsEncap() {
nc.MTUDeduction = nc.getEncapMTUDeduction(isIPv6)
}
if nc.TrafficEncryptionMode == TrafficEncryptionModeWireGuard {
// When WireGuard is enabled, cross-node traffic will only be encrypted, so we just reduce MTU for encryption.
nc.MTUDeduction = nc.WireGuardMTUDeduction
} else if nc.TrafficEncryptionMode == TrafficEncryptionModeIPSec {
// When IPsec is enabled, cross-node traffic will be encapsulated and encrypted, so we need to reduce MTU for both
// encapsulation and encryption.
nc.MTUDeduction += IPSecESPOverhead
}
nc.MTUDeduction = mtuDeduction
return mtuDeduction
return nc.MTUDeduction
}
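A few worked examples of the branch logic above may help when reading the new unit tests in the next file; the arithmetic below assumes a Geneve tunnel over IPv4 unless noted, and the figures line up with the expected values in those tests.

// Worked examples (illustrative, IPv4 unless noted):
//   Geneve encap only:                                        50
//   Geneve encap + IPsec:                       50 + 38  =    88
//   WireGuard encryption (replaces the encap deduction):      60
//   WireGuard encryption, IPv6:                 60 + 20  =    80
//   Multicluster gateway (Geneve) + multicluster WireGuard: 50 + 60 = 110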

// ServiceConfig includes K8s Service CIDR and available IP addresses for NodePort.
48 changes: 47 additions & 1 deletion pkg/agent/config/node_config_test.go
@@ -298,14 +298,60 @@ func TestCalculateMTUDeduction(t *testing.T) {
{
name: "GRE encap without IPv6",
nc: &NetworkConfig{TunnelType: ovsconfig.GRETunnel},
expectedMTUDeduction: 38,
expectedMTUDeduction: 42,
},
{
name: "Default encap with IPv6",
nc: &NetworkConfig{TunnelType: ovsconfig.GeneveTunnel},
isIPv6: true,
expectedMTUDeduction: 70,
},
{
name: "WireGuard enabled",
nc: &NetworkConfig{TrafficEncryptionMode: TrafficEncryptionModeWireGuard},
expectedMTUDeduction: 60,
},
{
name: "IPv6 with WireGuard enabled",
nc: &NetworkConfig{TrafficEncryptionMode: TrafficEncryptionModeWireGuard},
isIPv6: true,
expectedMTUDeduction: 80,
},
{
name: "Multicluster enabled with Geneve encap",
nc: &NetworkConfig{TunnelType: ovsconfig.GeneveTunnel, EnableMulticlusterGW: true},
expectedMTUDeduction: 50,
},
{
name: "Geneve encap with Multicluster WireGuard enabled",
nc: &NetworkConfig{
TunnelType: ovsconfig.GeneveTunnel,
EnableMulticlusterGW: true,
MulticlusterEncryptionMode: TrafficEncryptionModeWireGuard,
},
expectedMTUDeduction: 110,
},
{
name: "Geneve encap with IPSec enabled",
nc: &NetworkConfig{TunnelType: ovsconfig.GeneveTunnel, TrafficEncryptionMode: TrafficEncryptionModeIPSec},
expectedMTUDeduction: 88,
},
{
name: "Geneve encap with IPSec enabled and IPv6",
nc: &NetworkConfig{TunnelType: ovsconfig.GeneveTunnel, TrafficEncryptionMode: TrafficEncryptionModeIPSec},
isIPv6: true,
expectedMTUDeduction: 108,
},
{
name: "VXLan encap with IPSec enabled",
nc: &NetworkConfig{TunnelType: ovsconfig.VXLANTunnel, TrafficEncryptionMode: TrafficEncryptionModeIPSec},
expectedMTUDeduction: 88,
},
{
name: "GRE encap with IPSec enabled",
nc: &NetworkConfig{TunnelType: ovsconfig.GRETunnel, TrafficEncryptionMode: TrafficEncryptionModeIPSec},
expectedMTUDeduction: 80,
},
}

for _, tt := range tests {
4 changes: 3 additions & 1 deletion pkg/agent/multicluster/mc_route_controller.go
@@ -129,7 +129,9 @@ func NewMCDefaultRouteController(
controller.wireGuardConfig = &config.WireGuardConfig{
Port: multiclusterConfig.WireGuard.Port,
Name: multiclusterWireGuardInterface,
MTU: controller.nodeConfig.NodeTransportInterfaceMTU - controller.networkConfig.MTUDeduction - config.WireGuardOverhead,
// Regardless of the tunnel type, the WireGuard device must only reduce MTU for encryption because the
// packets it transmits have been encapsulated.
MTU: nodeConfig.NodeTransportInterfaceMTU - networkConfig.WireGuardMTUDeduction,
}
}
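To illustrate the comment above with assumed numbers (a hypothetical 1500-byte transport interface and a Geneve tunnel, values not taken from this diff):

// Illustrative arithmetic only:
//   Pod interface MTU:                 1500 - (50 + 60) = 1390  // encap + WireGuard (MTUDeduction)
//   Multicluster WireGuard device MTU: 1500 - 60        = 1440  // encryption only (WireGuardMTUDeduction)

A 1390-byte Pod packet grows to 1440 bytes after Geneve encapsulation, exactly filling the WireGuard device MTU, and to 1500 bytes after encryption, which is why the device only needs to account for the encryption overhead.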
controller.gwInformer.Informer().AddEventHandlerWithResyncPeriod(
2 changes: 1 addition & 1 deletion test/e2e-secondary-network/secondary_network_test.go
@@ -198,7 +198,7 @@ func (data *TestData) pingBetweenInterfaces(t *testing.T) error {
} else {
IPToPing = antreae2e.PodIPs{IPv6: &ip}
}
err := data.e2eTestData.RunPingCommandFromTestPod(antreae2e.PodInfo{Name: podData[sourcePod].nameOfPods, OS: osType, NodeName: clusterInfo.controlPlaneNodeName, Namespace: nameSpace}, nameSpace, &IPToPing, ctrName, count, size)
err := data.e2eTestData.RunPingCommandFromTestPod(antreae2e.PodInfo{Name: podData[sourcePod].nameOfPods, OS: osType, NodeName: clusterInfo.controlPlaneNodeName, Namespace: nameSpace}, nameSpace, &IPToPing, ctrName, count, size, false)
if err == nil {
logs.Infof("Ping '%s' -> '%s'( Interface: %s, IP Address: %s): OK", podData[sourcePod].nameOfPods, podData[targetPod].nameOfPods, podData[targetPod].nameOfInterfacePerPod[targetInterface], secondaryIpAddress)
} else {
10 changes: 5 additions & 5 deletions test/e2e/antreaipam_test.go
@@ -273,16 +273,16 @@ func testAntreaIPAMPodConnectivitySameNode(t *testing.T, data *TestData) {
})
workerNode := workerNodeName(1)

t.Logf("Creating %d agnhost Pods on '%s'", numPods+1, workerNode)
t.Logf("Creating %d toolbox Pods on '%s'", numPods+1, workerNode)
for i := range PodInfos {
PodInfos[i].OS = clusterInfo.nodesOS[workerNode]
if err := data.createAgnhostPodOnNodeWithAnnotations(PodInfos[i].Name, PodInfos[i].Namespace, workerNode, nil); err != nil {
t.Fatalf("Error when creating agnhost test Pod '%s': %v", PodInfos[i], err)
if err := data.createToolboxPodOnNode(PodInfos[i].Name, PodInfos[i].Namespace, workerNode, false); err != nil {
t.Fatalf("Error when creating toolbox test Pod '%s': %v", PodInfos[i], err)
}
defer deletePodWrapper(t, data, PodInfos[i].Namespace, PodInfos[i].Name)
}

data.runPingMesh(t, PodInfos, agnhostContainerName)
data.runPingMesh(t, PodInfos, toolboxContainerName, true)
}

func testAntreaIPAMPodConnectivityDifferentNodes(t *testing.T, data *TestData) {
@@ -296,7 +296,7 @@ func testAntreaIPAMPodConnectivityDifferentNodes(t *testing.T, data *TestData) {
}
PodInfos = append(PodInfos, createdPodInfos...)
}
data.runPingMesh(t, PodInfos, agnhostContainerName)
data.runPingMesh(t, PodInfos, toolboxContainerName, true)
}

func testAntreaIPAMStatefulSet(t *testing.T, data *TestData, dedicatedIPPoolKey *string) {
36 changes: 19 additions & 17 deletions test/e2e/connectivity_test.go
@@ -93,7 +93,9 @@ func waitForPodIPs(t *testing.T, data *TestData, podInfos []PodInfo) map[string]

// runPingMesh runs a ping mesh between all the provided Pods after first retrieving their IP
// addresses.
func (data *TestData) runPingMesh(t *testing.T, podInfos []PodInfo, ctrname string) {
// When dontFragment is true, the ping uses the maximum packet size that the MTU allows and sets the DF flag to
// validate that the MTU is correct.
func (data *TestData) runPingMesh(t *testing.T, podInfos []PodInfo, ctrname string, dontFragment bool) {
podIPs := waitForPodIPs(t, data, podInfos)

t.Logf("Ping mesh test between all Pods")
@@ -110,7 +112,7 @@ func (data *TestData) runPingMesh(t *testing.T, podInfos []PodInfo, ctrname stri
if pi2.Namespace != "" {
pod2Namespace = pi2.Namespace
}
if err := data.RunPingCommandFromTestPod(pi1, podNamespace, podIPs[pi2.Name], ctrname, pingCount, 0); err != nil {
if err := data.RunPingCommandFromTestPod(pi1, podNamespace, podIPs[pi2.Name], ctrname, pingCount, 0, dontFragment); err != nil {
t.Errorf("Ping '%s' -> '%s': ERROR (%v)", k8s.NamespacedName(podNamespace, pi1.Name), k8s.NamespacedName(pod2Namespace, pi2.Name), err)
} else {
t.Logf("Ping '%s' -> '%s': OK", k8s.NamespacedName(podNamespace, pi1.Name), k8s.NamespacedName(pod2Namespace, pi2.Name))
@@ -131,16 +133,16 @@ func (data *TestData) testPodConnectivitySameNode(t *testing.T) {
workerNode = workerNodeName(clusterInfo.windowsNodes[0])
}

t.Logf("Creating %d agnhost Pods on '%s'", numPods, workerNode)
t.Logf("Creating %d toolbox Pods on '%s'", numPods, workerNode)
for i := range podInfos {
podInfos[i].OS = clusterInfo.nodesOS[workerNode]
if err := data.createAgnhostPodOnNode(podInfos[i].Name, data.testNamespace, workerNode, false); err != nil {
t.Fatalf("Error when creating agnhost test Pod '%s': %v", podInfos[i], err)
if err := data.createToolboxPodOnNode(podInfos[i].Name, data.testNamespace, workerNode, false); err != nil {
t.Fatalf("Error when creating toolbox test Pod '%s': %v", podInfos[i], err)
}
defer deletePodWrapper(t, data, data.testNamespace, podInfos[i].Name)
}

data.runPingMesh(t, podInfos, agnhostContainerName)
data.runPingMesh(t, podInfos, toolboxContainerName, true)
}

// testPodConnectivityOnSameNode checks that Pods running on the same Node can reach each other, by
Expand Down Expand Up @@ -185,13 +187,13 @@ func testHostPortPodConnectivity(t *testing.T, data *TestData) {
data.testHostPortPodConnectivity(t, data.testNamespace, data.testNamespace)
}

// createPodsOnDifferentNodes creates agnhost Pods through a DaemonSet. This function returns information of the created
// createPodsOnDifferentNodes creates toolbox Pods through a DaemonSet. This function returns information of the created
// Pods as well as a function which will delete the Pods when called. Since Pods can be on Nodes of different OSes, a podInfo
// slice instead of a PodName slice is used to inform the caller of the correct commands and options. Linux and Windows Pods
// alternate in this podInfo slice so that the test can cover different connectivity cases between different OSes.
func createPodsOnDifferentNodes(t *testing.T, data *TestData, namespace, tag string) (podInfos []PodInfo, cleanup func() error) {
dsName := "connectivity-test" + tag
_, deleteDaemonSet, err := data.createDaemonSet(dsName, namespace, agnhostContainerName, agnhostImage, []string{"sleep", "3600"}, nil)
_, deleteDaemonSet, err := data.createDaemonSet(dsName, namespace, toolboxContainerName, toolboxImage, []string{"sleep", "3600"}, nil)
if err != nil {
t.Fatalf("Error when creating DaemonSet '%s': %v", dsName, err)
}
@@ -264,7 +266,7 @@ func (data *TestData) testPodConnectivityDifferentNodes(t *testing.T) {
if len(podInfos) > maxPods {
podInfos = podInfos[:maxPods]
}
data.runPingMesh(t, podInfos[:numPods], agnhostContainerName)
data.runPingMesh(t, podInfos[:numPods], toolboxContainerName, true)
}

// testPodConnectivityDifferentNodes checks that Pods running on different Nodes can reach each
@@ -315,11 +317,11 @@ func testPodConnectivityAfterAntreaRestart(t *testing.T, data *TestData, namespa
podInfos, deletePods := createPodsOnDifferentNodes(t, data, namespace, "antrearestart")
defer deletePods()

data.runPingMesh(t, podInfos[:numPods], agnhostContainerName)
data.runPingMesh(t, podInfos[:numPods], toolboxContainerName, true)

data.redeployAntrea(t, deployAntreaDefault)

data.runPingMesh(t, podInfos[:numPods], agnhostContainerName)
data.runPingMesh(t, podInfos[:numPods], toolboxContainerName, true)
}

// testOVSRestartSameNode verifies that datapath flows are not removed when the Antrea Agent Pod is
@@ -396,16 +398,16 @@ func testOVSFlowReplay(t *testing.T, data *TestData, namespace string) {
}
workerNode := workerNodeName(1)

t.Logf("Creating %d busybox test Pods on '%s'", numPods, workerNode)
t.Logf("Creating %d toolbox test Pods on '%s'", numPods, workerNode)
for i := range podInfos {
podInfos[i].OS = clusterInfo.nodesOS[workerNode]
if err := data.createBusyboxPodOnNode(podInfos[i].Name, namespace, workerNode, false); err != nil {
t.Fatalf("Error when creating busybox test Pod '%s': %v", podInfos[i].Name, err)
if err := data.createToolboxPodOnNode(podInfos[i].Name, namespace, workerNode, false); err != nil {
t.Fatalf("Error when creating toolbox test Pod '%s': %v", podInfos[i].Name, err)
}
defer deletePodWrapper(t, data, namespace, podInfos[i].Name)
}

data.runPingMesh(t, podInfos, busyboxContainerName)
data.runPingMesh(t, podInfos, toolboxContainerName, true)

var antreaPodName string
var err error
@@ -487,7 +489,7 @@ func testOVSFlowReplay(t *testing.T, data *TestData, namespace string) {
// This should give Antrea ~10s to restore flows, since we generate 10 "pings" with a 1s
// interval.
t.Logf("Running second ping mesh to check that flows have been restored")
data.runPingMesh(t, podInfos, busyboxContainerName)
data.runPingMesh(t, podInfos, toolboxContainerName, true)

flows2, groups2 := dumpFlows(), dumpGroups()
numFlows2, numGroups2 := len(flows2), len(groups2)
@@ -515,7 +517,7 @@ func testPingLargeMTU(t *testing.T, data *TestData) {

pingSize := 2000
t.Logf("Running ping with size %d between Pods %s and %s", pingSize, podInfos[0].Name, podInfos[1].Name)
if err := data.RunPingCommandFromTestPod(podInfos[0], data.testNamespace, podIPs[podInfos[1].Name], agnhostContainerName, pingCount, pingSize); err != nil {
if err := data.RunPingCommandFromTestPod(podInfos[0], data.testNamespace, podIPs[podInfos[1].Name], toolboxContainerName, pingCount, pingSize, false); err != nil {
t.Error(err)
}
}