From 485e6875eff56443e486fdfc73dca4b6606862ec Mon Sep 17 00:00:00 2001 From: Dyanngg Date: Tue, 10 Oct 2023 15:55:07 -0700 Subject: [PATCH 1/2] Fix deadlock when accessing dirtyRules in fqdn controller Signed-off-by: Dyanngg --- pkg/agent/controller/networkpolicy/fqdn.go | 13 +- .../controller/networkpolicy/fqdn_test.go | 134 ++++++++++++++++++ 2 files changed, 143 insertions(+), 4 deletions(-) diff --git a/pkg/agent/controller/networkpolicy/fqdn.go b/pkg/agent/controller/networkpolicy/fqdn.go index 885a9637e2c..7a534edc52c 100644 --- a/pkg/agent/controller/networkpolicy/fqdn.go +++ b/pkg/agent/controller/networkpolicy/fqdn.go @@ -102,7 +102,7 @@ type ruleRealizationUpdate struct { // ruleSyncTracker tracks the realization status of FQDN rules that are // applied to workloads on this Node. type ruleSyncTracker struct { - mutex sync.Mutex + mutex sync.RWMutex // updateCh is the channel used by the rule reconciler to report rule realization status. updateCh chan ruleRealizationUpdate // ruleToSubscribers keeps track of the subscribers that are currently subscribed @@ -497,14 +497,12 @@ func (f *fqdnController) syncDirtyRules(fqdn string, waitCh chan error, addressU utilsets.MergeString(dirtyRules, f.selectorItemToRuleIDs[selectorItem]) } if !addressUpdate { - f.ruleSyncTracker.mutex.Lock() - defer f.ruleSyncTracker.mutex.Unlock() // If there is no address update for this FQDN, and rules selecting this FQDN // were all previously realized successfully, then there will be no dirty rules // left to be synced. On the contrary, if some rules that select this FQDN are // still in the dirtyRules set of the ruleSyncTracker, then only those rules // should be retried for reconciliation, and packetOut shall be blocked. - dirtyRules = f.ruleSyncTracker.dirtyRules.Intersection(dirtyRules) + dirtyRules = f.ruleSyncTracker.getDirtyRules().Intersection(dirtyRules) } if len(dirtyRules) > 0 { klog.V(4).InfoS("Dirty rules blocking packetOut", "dirtyRules", dirtyRules) @@ -533,6 +531,13 @@ func (rst *ruleSyncTracker) subscribe(waitCh chan error, dirtyRules sets.String) } } +// getDirtyRules retrieves the current dirty rule set of ruleSyncTracker. +func (rst *ruleSyncTracker) getDirtyRules() sets.String { + rst.mutex.RLock() + defer rst.mutex.RUnlock() + return rst.dirtyRules +} + func (rst *ruleSyncTracker) Run(stopCh <-chan struct{}) { for { select { diff --git a/pkg/agent/controller/networkpolicy/fqdn_test.go b/pkg/agent/controller/networkpolicy/fqdn_test.go index 715ed3f7180..82a1461355a 100644 --- a/pkg/agent/controller/networkpolicy/fqdn_test.go +++ b/pkg/agent/controller/networkpolicy/fqdn_test.go @@ -16,6 +16,7 @@ package networkpolicy import ( "context" + "fmt" "net" "testing" "time" @@ -387,3 +388,136 @@ func TestGetIPsForFQDNSelectors(t *testing.T) { }) } } + +func TestSyncDirtyRules(t *testing.T) { + testFQDN := "test.antrea.io" + selectorItem := fqdnSelectorItem{ + matchName: testFQDN, + } + testFQDN2 := "dev.antrea.io" + selectorItem2 := fqdnSelectorItem{ + matchName: testFQDN2, + } + tests := []struct { + name string + fqdnsToSync []string + waitChs []chan error + addressUpdates []bool + prevDirtyRules sets.String + notifications []ruleRealizationUpdate + expectedDirtyRuleSyncCalls []string + expectedDirtyRulesRemaining sets.String + expectErr bool + }{ + { + name: "test non-blocking dirty rule sync with address update", + fqdnsToSync: []string{testFQDN}, + prevDirtyRules: sets.NewString(), + addressUpdates: []bool{true}, + waitChs: []chan error{nil}, + notifications: []ruleRealizationUpdate{{"1", nil}, {"2", nil}}, + expectedDirtyRuleSyncCalls: []string{"1", "2"}, + expectedDirtyRulesRemaining: sets.NewString(), + expectErr: false, + }, + { + name: "test blocking dirty rule sync with address update", + fqdnsToSync: []string{testFQDN}, + prevDirtyRules: sets.NewString(), + waitChs: []chan error{make(chan error, 1)}, + addressUpdates: []bool{true}, + notifications: []ruleRealizationUpdate{{"1", nil}, {"2", nil}}, + expectedDirtyRuleSyncCalls: []string{"1", "2"}, + expectedDirtyRulesRemaining: sets.NewString(), + expectErr: false, + }, + { + name: "test blocking dirty rule sync with failed rule realization", + fqdnsToSync: []string{testFQDN}, + prevDirtyRules: sets.NewString(), + waitChs: []chan error{make(chan error, 1)}, + addressUpdates: []bool{true}, + notifications: []ruleRealizationUpdate{{"1", nil}, {"2", fmt.Errorf("ovs err")}}, + expectedDirtyRuleSyncCalls: []string{"1", "2"}, + expectedDirtyRulesRemaining: sets.NewString("2"), + expectErr: true, + }, + { + name: "test blocking dirty rule sync without address update but previously failed rule realization", + fqdnsToSync: []string{testFQDN}, + prevDirtyRules: sets.NewString("2"), + waitChs: []chan error{make(chan error, 1)}, + addressUpdates: []bool{false}, + notifications: []ruleRealizationUpdate{{"2", nil}}, + expectedDirtyRuleSyncCalls: []string{"2"}, + expectedDirtyRulesRemaining: sets.NewString(), + expectErr: false, + }, + { + name: "test blocking dirty rule sync without address update", + fqdnsToSync: []string{testFQDN}, + prevDirtyRules: sets.NewString(), + waitChs: []chan error{make(chan error, 1)}, + addressUpdates: []bool{false}, + notifications: []ruleRealizationUpdate{}, + expectedDirtyRuleSyncCalls: []string{}, + expectedDirtyRulesRemaining: sets.NewString(), + expectErr: false, + }, + { + name: "test blocking single dirty rule multiple FQDN concurrent updates", + fqdnsToSync: []string{testFQDN, testFQDN2}, + prevDirtyRules: sets.NewString(), + waitChs: []chan error{make(chan error, 1), make(chan error, 1)}, + addressUpdates: []bool{true, false}, + notifications: []ruleRealizationUpdate{{"1", nil}, {"2", nil}}, + expectedDirtyRuleSyncCalls: []string{"1", "2", "2"}, + expectedDirtyRulesRemaining: sets.NewString(), + expectErr: false, + }, + } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + controller := gomock.NewController(t) + f, _ := newMockFQDNController(t, controller, nil) + var dirtyRuleSyncCalls []string + f.dirtyRuleHandler = func(s string) { + dirtyRuleSyncCalls = append(dirtyRuleSyncCalls, s) + } + f.addFQDNSelector("1", []string{testFQDN}) + f.addFQDNSelector("2", []string{testFQDN}) + f.addFQDNSelector("2", []string{testFQDN2}) + f.setFQDNMatchSelector(testFQDN, selectorItem) + f.setFQDNMatchSelector(testFQDN2, selectorItem2) + // This simulates failed rule syncs in previous syncDirtyRules() calls + if len(tc.prevDirtyRules) > 0 { + f.ruleSyncTracker.dirtyRules = tc.prevDirtyRules + } + stopCh := make(chan struct{}) + defer close(stopCh) + go f.runRuleSyncTracker(stopCh) + + for i, fqdn := range tc.fqdnsToSync { + f.syncDirtyRules(fqdn, tc.waitChs[i], tc.addressUpdates[i]) + } + for _, update := range tc.notifications { + f.ruleSyncTracker.updateCh <- update + } + assert.ElementsMatch(t, tc.expectedDirtyRuleSyncCalls, dirtyRuleSyncCalls) + for _, waitCh := range tc.waitChs { + if waitCh != nil { + assert.Eventually(t, func() bool { + select { + case err := <-waitCh: + if err != nil && !tc.expectErr { + return false + } + } + return true + }, ruleRealizationTimeout, time.Millisecond*10, "Failed to successfully wait for rule syncs") + } + } + assert.Equal(t, tc.expectedDirtyRulesRemaining, f.ruleSyncTracker.getDirtyRules()) + }) + } +} From 6852facaca920a8cdb4b45d0b51224155b6d4dd7 Mon Sep 17 00:00:00 2001 From: Quan Tian Date: Tue, 17 Oct 2023 10:04:30 +0800 Subject: [PATCH 2/2] Fix data race in FQDN ruleSyncTracker (#5583) ruleSyncTracker.Run() can update ruleSyncTracker.dirtyRules in-place, while ruleSyncTracker.getDirtyRules() returns the pointer of the set which could be read by other goroutines and leads to a data race like below: WARNING: DATA RACE Write at 0x00c000dd8180 by goroutine 276: runtime.mapdelete_faststr() /usr/local/go/src/runtime/map_faststr.go:301 +0x0 k8s.io/apimachinery/pkg/util/sets.Set[go.shape.string].Delete() /root/go/pkg/mod/k8s.io/apimachinery@v0.26.4/pkg/util/sets/set.go:62 +0x2ae antrea.io/antrea/pkg/agent/controller/networkpolicy.(*ruleSyncTracker).Run() /root/antrea/pkg/agent/controller/networkpolicy/fqdn.go:570 +0x1d2 antrea.io/antrea/pkg/agent/controller/networkpolicy.(*fqdnController).runRuleSyncTracker() /root/antrea/pkg/agent/controller/networkpolicy/fqdn.go:584 +0x4a antrea.io/antrea/pkg/agent/controller/networkpolicy.TestSyncDirtyRules.func1.4() /root/antrea/pkg/agent/controller/networkpolicy/fqdn_test.go:495 +0x44 Previous read at 0x00c000dd8180 by goroutine 271: reflect.maplen() /usr/local/go/src/runtime/map.go:1411 +0x0 reflect.Value.lenNonSlice() /usr/local/go/src/reflect/value.go:1720 +0x324 reflect.Value.Len() /usr/local/go/src/reflect/value.go:1709 +0x158f reflect.deepValueEqual() /usr/local/go/src/reflect/deepequal.go:139 +0x1571 reflect.DeepEqual() /usr/local/go/src/reflect/deepequal.go:237 +0x38b github.com/stretchr/testify/assert.ObjectsAreEqual() /root/go/pkg/mod/github.com/stretchr/testify@v1.8.4/assert/assertions.go:65 +0x172 github.com/stretchr/testify/assert.Equal() /root/go/pkg/mod/github.com/stretchr/testify@v1.8.4/assert/assertions.go:414 +0x1f7 antrea.io/antrea/pkg/agent/controller/networkpolicy.TestSyncDirtyRules.func1() /root/antrea/pkg/agent/controller/networkpolicy/fqdn_test.go:517 +0xb6f testing.tRunner() /usr/local/go/src/testing/testing.go:1595 +0x238 testing.(*T).Run.func1() /usr/local/go/src/testing/testing.go:1648 +0x44 Signed-off-by: Quan Tian --- pkg/agent/controller/networkpolicy/fqdn.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pkg/agent/controller/networkpolicy/fqdn.go b/pkg/agent/controller/networkpolicy/fqdn.go index 7a534edc52c..b315d19e964 100644 --- a/pkg/agent/controller/networkpolicy/fqdn.go +++ b/pkg/agent/controller/networkpolicy/fqdn.go @@ -535,7 +535,8 @@ func (rst *ruleSyncTracker) subscribe(waitCh chan error, dirtyRules sets.String) func (rst *ruleSyncTracker) getDirtyRules() sets.String { rst.mutex.RLock() defer rst.mutex.RUnlock() - return rst.dirtyRules + // Must return a copy as the set can be updated in-place by Run func. + return rst.dirtyRules.Union(nil) } func (rst *ruleSyncTracker) Run(stopCh <-chan struct{}) {