From 5f7c2df76ae8a93f49761dfc91429f823a791caf Mon Sep 17 00:00:00 2001 From: Menghan Li Date: Tue, 10 Sep 2019 10:29:21 -0700 Subject: [PATCH] grpclb: fix deadlock in grpclb connection cache Before the fix, if the timer to remove a SubConn fires at the same time NewSubConn cancels the timer, it caused a mutex leak and deadlock. --- balancer/grpclb/grpclb_util.go | 2 +- balancer/grpclb/grpclb_util_test.go | 43 +++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/balancer/grpclb/grpclb_util.go b/balancer/grpclb/grpclb_util.go index 2663c37e3f89..6d44ae5c096c 100644 --- a/balancer/grpclb/grpclb_util.go +++ b/balancer/grpclb/grpclb_util.go @@ -173,13 +173,13 @@ func (ccc *lbCacheClientConn) RemoveSubConn(sc balancer.SubConn) { timer := time.AfterFunc(ccc.timeout, func() { ccc.mu.Lock() + defer ccc.mu.Unlock() if entry.abortDeleting { return } ccc.cc.RemoveSubConn(sc) delete(ccc.subConnToAddr, sc) delete(ccc.subConnCache, addr) - ccc.mu.Unlock() }) entry.cancel = func() { if !timer.Stop() { diff --git a/balancer/grpclb/grpclb_util_test.go b/balancer/grpclb/grpclb_util_test.go index 39d422a70500..0037b3d9d7e1 100644 --- a/balancer/grpclb/grpclb_util_test.go +++ b/balancer/grpclb/grpclb_util_test.go @@ -217,3 +217,46 @@ func TestLBCacheClientConnReuse(t *testing.T) { t.Fatal(err) } } + +// Test that if the timer to remove a SubConn fires at the same time NewSubConn +// cancels the timer, it doesn't cause deadlock. +func TestLBCache_RemoveTimer_New_Race(t *testing.T) { + mcc := newMockClientConn() + if err := checkMockCC(mcc, 0); err != nil { + t.Fatal(err) + } + + ccc := newLBCacheClientConn(mcc) + ccc.timeout = time.Nanosecond + if err := checkCacheCC(ccc, 0, 0); err != nil { + t.Fatal(err) + } + + sc, _ := ccc.NewSubConn([]resolver.Address{{Addr: "address1"}}, balancer.NewSubConnOptions{}) + // One subconn in MockCC. + if err := checkMockCC(mcc, 1); err != nil { + t.Fatal(err) + } + // No subconn being deleted, and one in CacheCC. + if err := checkCacheCC(ccc, 0, 1); err != nil { + t.Fatal(err) + } + + done := make(chan struct{}) + + go func() { + for i := 0; i < 1000; i++ { + // Remove starts a timer with 1 ns timeout, the NewSubConn will race + // with with the timer. + ccc.RemoveSubConn(sc) + sc, _ = ccc.NewSubConn([]resolver.Address{{Addr: "address1"}}, balancer.NewSubConnOptions{}) + } + close(done) + }() + + select { + case <-time.After(time.Second): + t.Fatalf("Test didn't finish within 1 second. Deadlock") + case <-done: + } +}