Skip to content

Commit

Permalink
fix: Fix data race in cluster state (#930)
Browse files Browse the repository at this point in the history
  • Loading branch information
jonathan-innis committed Jan 9, 2024
1 parent 13b5559 commit eb86007
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 22 deletions.
17 changes: 9 additions & 8 deletions pkg/controllers/state/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,17 +98,18 @@ func (c *Cluster) Synced(ctx context.Context) bool {
return false
}
c.mu.RLock()
stateNodeClaimNames := sets.New(lo.Keys(c.nodeClaimNameToProviderID)...)
stateNodeNames := sets.New(lo.Keys(c.nodeNameToProviderID)...)
c.mu.RUnlock()

// Check to see if any node claim doesn't have a provider ID. If it doesn't, then the nodeclaim hasn't been
// launched, and we need to wait to see what the resolved values are before continuing.
for nc := range stateNodeClaimNames {
if c.nodeClaimNameToProviderID[nc] == "" {
stateNodeClaimNames := sets.New[string]()
for name, providerID := range c.nodeClaimNameToProviderID {
// Check to see if any node claim doesn't have a provider ID. If it doesn't, then the nodeclaim hasn't been
// launched, and we need to wait to see what the resolved values are before continuing.
if providerID == "" {
c.mu.RUnlock()
return false
}
stateNodeClaimNames.Insert(name)
}
stateNodeNames := sets.New(lo.Keys(c.nodeNameToProviderID)...)
c.mu.RUnlock()

nodeClaimNames := sets.New[string]()
for _, nodeClaim := range nodeClaimList.Items {
Expand Down
69 changes: 55 additions & 14 deletions pkg/controllers/state/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1444,6 +1444,61 @@ var _ = Describe("Consolidated State", func() {
})
})

var _ = Describe("Data Races", func() {
It("should ensure that calling Synced() is valid while making updates to Nodes", func() {
cancelCtx, cancel := context.WithCancel(ctx)
DeferCleanup(func() {
cancel()
})

// Keep calling Synced for the entirety of this test
go func() {
for {
_ = cluster.Synced(ctx)
if cancelCtx.Err() != nil {
return
}
}
}()

// Call UpdateNode on 100 nodes (enough to trigger a DATA RACE)
for i := 0; i < 100; i++ {
node := test.Node(test.NodeOptions{
ProviderID: test.RandomProviderID(),
})
ExpectApplied(ctx, env.Client, node)
ExpectReconcileSucceeded(ctx, nodeController, client.ObjectKeyFromObject(node))
}
})
It("should ensure that calling Synced() is valid while making updates to NodeClaims", func() {
cancelCtx, cancel := context.WithCancel(ctx)
DeferCleanup(func() {
cancel()
})

// Keep calling Synced for the entirety of this test
go func() {
for {
_ = cluster.Synced(ctx)
if cancelCtx.Err() != nil {
return
}
}
}()

// Call UpdateNodeClaim on 100 NodeClaims (enough to trigger a DATA RACE)
for i := 0; i < 100; i++ {
nodeClaim := test.NodeClaim(v1beta1.NodeClaim{
Status: v1beta1.NodeClaimStatus{
ProviderID: test.RandomProviderID(),
},
})
ExpectApplied(ctx, env.Client, nodeClaim)
ExpectReconcileSucceeded(ctx, nodeClaimController, client.ObjectKeyFromObject(nodeClaim))
}
})
})

func ExpectStateNodeCount(comparator string, count int) int {
GinkgoHelper()
c := 0
Expand All @@ -1454,17 +1509,3 @@ func ExpectStateNodeCount(comparator string, count int) int {
Expect(c).To(BeNumerically(comparator, count))
return c
}

func ExpectStateNodeNotFoundForNodeClaim(nodeClaim *v1beta1.NodeClaim) *state.StateNode {
GinkgoHelper()
var ret *state.StateNode
cluster.ForEachNode(func(n *state.StateNode) bool {
if n.NodeClaim.Status.ProviderID != nodeClaim.Status.ProviderID {
return true
}
ret = n.DeepCopy()
return false
})
Expect(ret).To(BeNil())
return ret
}

0 comments on commit eb86007

Please sign in to comment.