From 18a492ab252a9ebad3b042f9e0b7cf6e4bb98e3d Mon Sep 17 00:00:00 2001 From: Quan Tian Date: Thu, 11 Jun 2020 22:56:03 +0800 Subject: [PATCH] Fix IP not released after restarting antrea-agent There was no sync-up between the IPAM cache of antrea-agent and the IPAM plugin, the IPAM DEL could be skipped incorrectly when deleting a Pod created before antrea-agent was restarted, leading to IP address leak. The handling of multiple CNI DELs is idempotent. There is no need to check in-memory cache before calling IPAM DEL. This patches fixes the issue by removing the cache check and adds a verification in e2e test. --- pkg/agent/cniserver/ipam/ipam_service.go | 4 --- pkg/agent/cniserver/server_test.go | 5 ++-- test/e2e/basic_test.go | 34 +++++++++++++++++++++--- 3 files changed, 33 insertions(+), 10 deletions(-) diff --git a/pkg/agent/cniserver/ipam/ipam_service.go b/pkg/agent/cniserver/ipam/ipam_service.go index db3e21bb13d..1198a49002a 100644 --- a/pkg/agent/cniserver/ipam/ipam_service.go +++ b/pkg/agent/cniserver/ipam/ipam_service.go @@ -82,10 +82,6 @@ func ExecIPAMAdd(cniArgs *cnipb.CniCmdArgs, ipamType string, resultKey string) ( } func ExecIPAMDelete(cniArgs *cnipb.CniCmdArgs, ipamType string, resultKey string) error { - _, ok := GetIPFromCache(resultKey) - if !ok { - return nil - } args := argsFromEnv(cniArgs) driver := ipamDrivers[ipamType] err := driver.Del(args, cniArgs.NetworkConfiguration) diff --git a/pkg/agent/cniserver/server_test.go b/pkg/agent/cniserver/server_test.go index 7c8a0cd8951..ffaab64a8cf 100644 --- a/pkg/agent/cniserver/server_test.go +++ b/pkg/agent/cniserver/server_test.go @@ -132,6 +132,7 @@ func TestIPAMService(t *testing.T) { t.Run("Error on ADD", func(t *testing.T) { ipamMock.EXPECT().Add(gomock.Any(), gomock.Any()).Return(nil, fmt.Errorf("IPAM add error")) + ipamMock.EXPECT().Del(gomock.Any(), gomock.Any()).Return(nil) response, err := cniServer.CmdAdd(cxt, &requestMsg) require.Nil(t, err, "expected no rpc error") checkErrorResponse(t, response, cnipb.ErrorCode_IPAM_FAILURE, "IPAM add error") @@ -165,7 +166,7 @@ func TestIPAMService(t *testing.T) { t.Run("Idempotent Call of IPAM ADD/DEL for the same Pod", func(t *testing.T) { ipamMock.EXPECT().Add(gomock.Any(), gomock.Any()).Times(1) - ipamMock.EXPECT().Del(gomock.Any(), gomock.Any()).Times(1) + ipamMock.EXPECT().Del(gomock.Any(), gomock.Any()).Times(2) cniConfig, response := cniServer.checkRequestMessage(&requestMsg) require.Nil(t, response, "expected no rpc error") podKey := util.GenerateContainerInterfaceName(string(cniConfig.K8S_POD_NAME), string(cniConfig.K8S_POD_NAMESPACE)) @@ -182,7 +183,7 @@ func TestIPAMService(t *testing.T) { t.Run("Idempotent Call of IPAM ADD/DEL for the same Pod with different containers", func(t *testing.T) { ipamMock.EXPECT().Add(gomock.Any(), gomock.Any()).Times(1) - ipamMock.EXPECT().Del(gomock.Any(), gomock.Any()).Times(1) + ipamMock.EXPECT().Del(gomock.Any(), gomock.Any()).Times(2) cniConfig, response := cniServer.checkRequestMessage(&requestMsg) require.Nil(t, response, "expected no rpc error") podKey := util.GenerateContainerInterfaceName(string(cniConfig.K8S_POD_NAME), string(cniConfig.K8S_POD_NAMESPACE)) diff --git a/test/e2e/basic_test.go b/test/e2e/basic_test.go index 78f1bb5250b..b1d6ce5d7b9 100755 --- a/test/e2e/basic_test.go +++ b/test/e2e/basic_test.go @@ -26,9 +26,9 @@ import ( "k8s.io/apimachinery/pkg/util/wait" + "github.com/vmware-tanzu/antrea/pkg/agent/apiserver/handlers/podinterface" "github.com/vmware-tanzu/antrea/pkg/agent/config" "github.com/vmware-tanzu/antrea/pkg/agent/openflow/cookie" - "github.com/vmware-tanzu/antrea/pkg/agent/util" ) // TestDeploy is a "no-op" test that simply performs setup and teardown. @@ -75,9 +75,6 @@ func TestPodAssignIP(t *testing.T) { } func (data *TestData) testDeletePod(t *testing.T, podName string, nodeName string) { - ifName := util.GenerateContainerInterfaceName(podName, testNamespace) - t.Logf("Host interface name for Pod is '%s'", ifName) - var antreaPodName string var err error if antreaPodName, err = data.getAntreaPodOnNode(nodeName); err != nil { @@ -85,6 +82,19 @@ func (data *TestData) testDeletePod(t *testing.T, podName string, nodeName strin } t.Logf("The Antrea Pod for Node '%s' is '%s'", nodeName, antreaPodName) + cmds := []string{"antctl", "get", "podinterface", podName, "-n", testNamespace, "-o", "json"} + stdout, _, err := runAntctl(antreaPodName, cmds, data, t) + var podInterfaces []podinterface.Response + if err := json.Unmarshal([]byte(stdout), &podInterfaces); err != nil { + t.Fatalf("Error when querying the pod interface: %v", err) + } + if len(podInterfaces) != 1 { + t.Fatalf("Expected 1 pod interface, got %d", len(podInterfaces)) + } + ifName := podInterfaces[0].InterfaceName + podIP := podInterfaces[0].IP + t.Logf("Host interface name for Pod is '%s'", ifName) + doesInterfaceExist := func() bool { cmd := fmt.Sprintf("ip link show %s", ifName) if rc, _, _, err := RunCommandOnNode(nodeName, cmd); err != nil { @@ -103,6 +113,16 @@ func (data *TestData) testDeletePod(t *testing.T, podName string, nodeName strin return exists } + doesIPAllocationExist := func() bool { + cmd := fmt.Sprintf("test -f /var/run/antrea/cni/networks/antrea/%s", podIP) + if rc, _, _, err := RunCommandOnNode(nodeName, cmd); err != nil { + t.Fatalf("Error when running ip command on Node '%s': %v", nodeName, err) + } else { + return rc == 0 + } + return false + } + t.Logf("Checking that the veth interface and the OVS port exist") if !doesInterfaceExist() { t.Errorf("Interface '%s' does not exist on Node '%s'", ifName, nodeName) @@ -110,6 +130,9 @@ func (data *TestData) testDeletePod(t *testing.T, podName string, nodeName strin if !doesOVSPortExist() { t.Errorf("OVS port '%s' does not exist on Node '%s'", ifName, nodeName) } + if !doesIPAllocationExist() { + t.Errorf("IP allocation '%s' does not exist on Node '%s'", podIP, nodeName) + } t.Logf("Deleting Pod '%s'", podName) if err := data.deletePodAndWait(defaultTimeout, podName); err != nil { @@ -123,6 +146,9 @@ func (data *TestData) testDeletePod(t *testing.T, podName string, nodeName strin if doesOVSPortExist() { t.Errorf("OVS port '%s' still exists on Node '%s' after Pod deletion", ifName, nodeName) } + if doesIPAllocationExist() { + t.Errorf("IP allocation '%s' still exists on Node '%s'", podIP, nodeName) + } } // TestDeletePod creates a Pod, then deletes it, and checks that the veth interface (in the Node