Skip to content

Commit

Permalink
Add host-local IPAM GC on startup
Browse files Browse the repository at this point in the history
During CNIServer reconciliation, we perform host-local IPAM garbage
collection (GC) by comparing the set of IPs allocated to local Pods and
the set of IPs currently reserved by the plugin. We release any IP
reserved by the plugin that is not in-use by a local Pod. The purpose is
to avoid leaking IP addresses when there is a bug in the container
runtime, which has happened in the past.

Two key design choices that were made:
* We do not invoke CNI DEL to release IPs, instead we access the
  host-local data which is persisted on the Node, and modify it as
  needed.
* We do not rely on the interface store (as persisted to OVSDB) to
  determine the set of IPs that may have been leaked. In case of an
  Antrea bug, it could be possible (although unlikely) for an IP to
  still be allocated by host-local but be missing from the interface
  store. Intead, we list all allocated IPs from the host-local data (an
  allocated IP corresponds to one disk file).

This approach is essentially the same as our existing script:
https://github.com/antrea-io/antrea/blob/main/hack/gc-host-local.sh

Fixes #4326

Signed-off-by: Antonin Bas <abas@vmware.com>
  • Loading branch information
antoninbas committed Nov 13, 2023
1 parent 4a52bca commit 4bb14d3
Show file tree
Hide file tree
Showing 9 changed files with 506 additions and 18 deletions.
2 changes: 2 additions & 0 deletions .github/dependabot.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ updates:
- dependency-name: "antrea.io/ofnet"
- dependency-name: "antrea.io/libOpenflow"
- dependency-name: "github.com/ClickHouse/clickhouse-go/v2" # auto-upgrade involves dependency conflicts
- dependency-name: "github.com/alexflint/go-filemutex"
update-types: ["version-update:semver-major"] # ignore major updates only
- package-ecosystem: "github-actions"
# Workflow files stored in the default location of `.github/workflows`
directory: "/"
Expand Down
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ require (
github.com/Microsoft/go-winio v0.6.1
github.com/Microsoft/hcsshim v0.9.10
github.com/TomCodeLV/OVSDB-golang-lib v0.0.0-20200116135253-9bbdfadcd881
github.com/alexflint/go-filemutex v1.2.0
github.com/aws/aws-sdk-go-v2 v1.16.10
github.com/aws/aws-sdk-go-v2/config v1.16.0
github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.11.23
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,8 @@ github.com/alecthomas/units v0.0.0-20190717042225-c3de453c63f4/go.mod h1:ybxpYRF
github.com/alecthomas/units v0.0.0-20190924025748-f65c72e2690d/go.mod h1:rBZYJk541a8SKzHPHnH3zbiI+7dagKZ0cgpgrD7Fyho=
github.com/alessio/shellescape v1.2.2/go.mod h1:PZAiSCk0LJaZkiCSkPv8qIobYglO3FPpyFjDCtHLS30=
github.com/alexflint/go-filemutex v0.0.0-20171022225611-72bdc8eae2ae/go.mod h1:CgnQgUtFrFz9mxFNtED3jI5tLDjKlOM+oUF/sTk6ps0=
github.com/alexflint/go-filemutex v1.2.0 h1:1v0TJPDtlhgpW4nJ+GvxCLSlUDC3+gW0CQQvlmfDR/s=
github.com/alexflint/go-filemutex v1.2.0/go.mod h1:mYyQSWvw9Tx2/H2n9qXPb52tTYfE0pZAWcBq5mK025c=
github.com/andreyvit/diff v0.0.0-20170406064948-c7f18ee00883/go.mod h1:rCTlJbsFo29Kk6CurOXKm700vrz8f0KW0JNfpkRJY/8=
github.com/andybalholm/brotli v1.0.4 h1:V7DdXeJtZscaqfNuAdSRuRFzuiKlHSC/Zh3zl9qY3JY=
github.com/andybalholm/brotli v1.0.4/go.mod h1:fO7iG3H7G2nSZ7m0zPUDn85XEX2GTukHGRSepvi9Eig=
Expand Down
117 changes: 117 additions & 0 deletions pkg/agent/cniserver/ipam/hostlocal/gc.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
// Copyright 2023 Antrea Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package hostlocal

import (
"fmt"
"net"
"os"
"path/filepath"
"runtime"
"strings"

"github.com/spf13/afero"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/klog/v2"
)

const dataDir = "/var/lib/cni/networks"

func networkDir(network string) string {
return filepath.Join(dataDir, network)
}

// This is a hacky approach as we access the internals of the host-local plugin,
// instead of using the CNI interface. However, crafting a CNI DEL request from
// scratch would also be hacky.
func GarbageCollectContainerIPs(network string, desiredIPs sets.Set[string]) error {
dir := networkDir(network)

info, err := os.Stat(dir)
if os.IsNotExist(err) {
klog.V(2).InfoS("Host-local IPAM data directory does not exist, nothing to do", "dir", dir)
return nil
}
if !info.IsDir() {
return fmt.Errorf("path '%s' is not a directory: %w", dir, err)
}

lk, err := NewFileLock(dataDir)
if err != nil {
return err
}
defer lk.Close()
lk.Lock()
defer lk.Unlock()

fs := afero.NewOsFs()
return gcContainerIPs(fs, dir, desiredIPs)
}

// Internal version of GarbageCollectContainerIPs which does not acquire the
// file lock and can work with an arbitrary afero filesystem.
func gcContainerIPs(fs afero.Fs, dir string, desiredIPs sets.Set[string]) error {
paths := make([]string, 0)

if err := afero.Walk(fs, dir, func(path string, info os.FileInfo, err error) error {
if err != nil || info.IsDir() {
return nil
}
paths = append(paths, path)
return nil
}); err != nil {
return fmt.Errorf("error when gathering IP filenames in the host-local data directory: %w", err)
}

allocatedIPs := sets.New[string]()
for _, p := range paths {
ip := getIPFromPath(p)
if net.ParseIP(ip) == nil {
// not a valid IP, nothing to do
continue
}
allocatedIPs.Insert(ip)
if desiredIPs.Has(ip) {
// IP is in-use
continue
}
if err := fs.Remove(p); err != nil {
klog.ErrorS(err, "Failed to release unused IP from host-local IPAM plugin", "IP", ip)
continue
}
allocatedIPs.Delete(ip)
klog.InfoS("Unused IP was successfully released from host-local IPAM plugin", "IP", ip)
}

if allocatedIPs.Difference(desiredIPs).Len() > 0 {
return fmt.Errorf("not all unused IPs could be released from host-local IPAM plugin, some IPs may be leaked")
}

// Note that it is perfectly possible for some IPs to be in desiredIPs but not in
// allocatedIPs. This can be the case when another IPAM plugin (e.g., AntreaIPAM) is also
// used.

return nil
}

func getIPFromPath(path string) string {
fname := filepath.Base(path)
// need to unespace IPv6 addresses on Windows
// see https://github.com/containernetworking/plugins/blob/38f18d26ecfef550b8bac02656cc11103fd7cff1/plugins/ipam/host-local/backend/disk/backend.go#L197
if runtime.GOOS == "windows" {
fname = strings.ReplaceAll(fname, "_", ":")
}
return fname
}
178 changes: 178 additions & 0 deletions pkg/agent/cniserver/ipam/hostlocal/gc_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
// Copyright 2023 Antrea Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package hostlocal

import (
"fmt"
"os"
"path/filepath"
"runtime"
"strings"
"testing"

"github.com/spf13/afero"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"k8s.io/apimachinery/pkg/util/sets"
)

type testFs struct {
afero.Fs
removeError error
removedFiles []string
}

func (fs *testFs) Remove(name string) error {
if fs.removeError != nil {
err := &os.PathError{Op: "remove", Path: name, Err: fs.removeError}
// reset error
fs.removeError = nil
return err
}
if err := fs.Fs.Remove(name); err != nil {
return err
}
fs.removedFiles = append(fs.removedFiles, name)
return nil
}

// forceRemoveError forces the next Remove call to fail. Error will be cleared after the first call.
func (fs *testFs) forceRemoveError() {
fs.removeError = fmt.Errorf("permission denied")
}

func (fs *testFs) removedIPs() sets.Set[string] {
s := sets.New[string]()
for _, p := range fs.removedFiles {
s.Insert(getIPFromPath(p))
}
return s
}

// from https://github.com/containernetworking/plugins/blob/38f18d26ecfef550b8bac02656cc11103fd7cff1/plugins/ipam/host-local/backend/disk/backend.go#L197
func getEscapedPath(dir string, fname string) string {
if runtime.GOOS == "windows" {
fname = strings.ReplaceAll(fname, ":", "_")
}
return filepath.Join(dir, fname)
}

func allocateIPs(t *testing.T, fs afero.Fs, dir string, ips ...string) {
for _, ip := range ips {
path := getEscapedPath(dir, ip)
// The real host-local IPAM plugin writes the container ID + interface name to the
// file, but it is irrelevant in our case.
require.NoError(t, afero.WriteFile(fs, path, []byte("foo"), 0o600))
}
}

func TestGcContainerIPs(t *testing.T) {
dir := networkDir("antrea")

newTestFs := func() *testFs {
return &testFs{
Fs: afero.NewMemMapFs(),
}
}

t.Run("missing directory", func(t *testing.T) {
fs := newTestFs()
// create the plugin data directory, but not the "network" sub-directory
require.NoError(t, fs.MkdirAll(dataDir, 0o755))
assert.NoError(t, gcContainerIPs(fs, dir, sets.New[string]()))
removedIPs := fs.removedIPs()
assert.Empty(t, removedIPs)
})

t.Run("remove error", func(t *testing.T) {
ips := []string{"10.0.0.1", "10.0.0.2"}
fs := newTestFs()
require.NoError(t, fs.MkdirAll(dir, 0o755))
allocateIPs(t, fs, dir, ips...)
fs.forceRemoveError()
require.Error(t, gcContainerIPs(fs, dir, sets.New[string]()))
// one of the IPs will fail to be released, the other one will succeed
removedIPs := fs.removedIPs()
assert.Len(t, removedIPs, 1)
})

resolveIP := func(id int, ipv6 bool) string {
if ipv6 {
return fmt.Sprintf("2001:db8:a::%d", id)
} else {
return fmt.Sprintf("10.0.0.%d", id)
}
}

// some success test cases, will be run for both IPv4 and IPv6
testCases := []struct {
name string
desiredIPs []int
allocatedIPs []int
expectedRemovedIPs []int
}{
{
name: "same sets",
desiredIPs: []int{1, 2},
allocatedIPs: []int{1, 2},
expectedRemovedIPs: []int{},
},
{
name: "multiple removals",
desiredIPs: []int{1, 3},
allocatedIPs: []int{1, 2, 3, 4},
expectedRemovedIPs: []int{2, 4},
},
{
name: "extra ip",
desiredIPs: []int{1, 2, 3},
allocatedIPs: []int{1, 2},
expectedRemovedIPs: []int{},
},
}

runTests := func(t *testing.T, ipv6 bool) {
name := "ipv4"
if ipv6 {
name = "ipv6"
}

toIPSet := func(ids []int) sets.Set[string] {
ips := sets.New[string]()
for _, id := range ids {
ip := resolveIP(id, ipv6)
require.NotEmpty(t, ip)
ips.Insert(ip)
}
return ips
}

t.Run(name, func(t *testing.T) {
for _, tc := range testCases {
fs := newTestFs()
require.NoError(t, fs.MkdirAll(dir, 0o755))
desiredIPs := toIPSet(tc.desiredIPs)
allocatedIPs := toIPSet(tc.allocatedIPs)
expectedRemovedIPs := toIPSet(tc.expectedRemovedIPs)
allocateIPs(t, fs, dir, allocatedIPs.UnsortedList()...)
require.NoError(t, gcContainerIPs(fs, dir, desiredIPs))
assert.Equal(t, expectedRemovedIPs, fs.removedIPs())
}
})
}

runTests(t, false)
runTests(t, true)
}
62 changes: 62 additions & 0 deletions pkg/agent/cniserver/ipam/hostlocal/lock.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
// Copyright 2023 Antrea Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package hostlocal

import (
"os"
"path"

"github.com/alexflint/go-filemutex"
)

// This code was copied from https://github.com/containernetworking/plugins/blob/v1.3.0/plugins/ipam/host-local/backend/disk/lock.go

// FileLock wraps os.File to be used as a lock using flock
type FileLock struct {
f *filemutex.FileMutex
}

// NewFileLock opens file/dir at path and returns unlocked FileLock object
func NewFileLock(lockPath string) (*FileLock, error) {
fi, err := os.Stat(lockPath)
if err != nil {
return nil, err
}

if fi.IsDir() {
lockPath = path.Join(lockPath, "lock")
}

f, err := filemutex.New(lockPath)
if err != nil {
return nil, err
}

return &FileLock{f}, nil
}

func (l *FileLock) Close() error {
return l.f.Close()
}

// Lock acquires an exclusive lock
func (l *FileLock) Lock() error {
return l.f.Lock()
}

// Unlock releases the lock
func (l *FileLock) Unlock() error {
return l.f.Unlock()
}
Loading

0 comments on commit 4bb14d3

Please sign in to comment.