From 11b9ff1a28c57e5f36e98a5da826c0df64d6a37b Mon Sep 17 00:00:00 2001 From: "larry.lx" Date: Wed, 19 Apr 2023 11:30:13 +0800 Subject: [PATCH 1/3] fix: a deadlock caused by bsc protocol handeshake timeout --- eth/handler_bsc.go | 2 +- eth/peerset.go | 12 +++++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/eth/handler_bsc.go b/eth/handler_bsc.go index 59943aadab..fcb39ed1ab 100644 --- a/eth/handler_bsc.go +++ b/eth/handler_bsc.go @@ -27,7 +27,7 @@ func (h *bscHandler) RunPeer(peer *bsc.Peer, hand bsc.Handler) error { ps.lock.Lock() if wait, ok := ps.bscWait[id]; ok { delete(ps.bscWait, id) - wait <- peer + wait <- nil } ps.lock.Unlock() return err diff --git a/eth/peerset.go b/eth/peerset.go index 6a84449fe1..d0ec5b9bbd 100644 --- a/eth/peerset.go +++ b/eth/peerset.go @@ -30,6 +30,7 @@ import ( "github.com/ethereum/go-ethereum/eth/protocols/eth" "github.com/ethereum/go-ethereum/eth/protocols/snap" "github.com/ethereum/go-ethereum/eth/protocols/trust" + "github.com/ethereum/go-ethereum/log" "github.com/ethereum/go-ethereum/p2p" ) @@ -70,6 +71,7 @@ const ( // extensionWaitTimeout is the maximum allowed time for the extension wait to // complete before dropping the connection as malicious. extensionWaitTimeout = 10 * time.Second + tryWaitTimeout = 200 * time.Millisecond ) // peerSet represents the collection of active peers currently participating in @@ -402,7 +404,15 @@ func (ps *peerSet) waitBscExtension(peer *eth.Peer) (*bsc.Peer, error) { return peer, nil case <-time.After(extensionWaitTimeout): - ps.lock.Lock() + // once timeout is reached, we won't wait for the peer any more, bsc protocol won't be enabled on it. + log.Warn("waitBscExtension", "peer", id, "bsc protocol timeout, it won't be enabled.") + for { + // it is potential deadlock, so we use TryLock to avoid it. + if ps.lock.TryLock() { + break + } + time.Sleep(tryWaitTimeout) + } delete(ps.bscWait, id) ps.lock.Unlock() return nil, errPeerWaitTimeout From 60428ef9edc9c166d5085fee9d4312ede7aa7a2d Mon Sep 17 00:00:00 2001 From: "larry.lx" Date: Wed, 19 Apr 2023 13:59:51 +0800 Subject: [PATCH 2/3] fix: clean the channel --- eth/peerset.go | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/eth/peerset.go b/eth/peerset.go index d0ec5b9bbd..91f1e5ed04 100644 --- a/eth/peerset.go +++ b/eth/peerset.go @@ -71,7 +71,7 @@ const ( // extensionWaitTimeout is the maximum allowed time for the extension wait to // complete before dropping the connection as malicious. extensionWaitTimeout = 10 * time.Second - tryWaitTimeout = 200 * time.Millisecond + tryWaitTimeout = 100 * time.Millisecond ) // peerSet represents the collection of active peers currently participating in @@ -404,18 +404,27 @@ func (ps *peerSet) waitBscExtension(peer *eth.Peer) (*bsc.Peer, error) { return peer, nil case <-time.After(extensionWaitTimeout): - // once timeout is reached, we won't wait for the peer any more, bsc protocol won't be enabled on it. + // once timeout is reached, bsc protocol won't be enabled on this peer. log.Warn("waitBscExtension", "peer", id, "bsc protocol timeout, it won't be enabled.") + // could be deadlock, so we use TryLock to avoid it. + if ps.lock.TryLock() { + delete(ps.bscWait, id) + ps.lock.Unlock() + return nil, errPeerWaitTimeout + } + // if TryLock failed, we wait for a while and try again. for { - // it is potential deadlock, so we use TryLock to avoid it. - if ps.lock.TryLock() { - break + select { + case <-wait: + return nil, errPeerWaitTimeout + case <-time.After(tryWaitTimeout): + if ps.lock.TryLock() { + delete(ps.bscWait, id) + ps.lock.Unlock() + return nil, errPeerWaitTimeout + } } - time.Sleep(tryWaitTimeout) } - delete(ps.bscWait, id) - ps.lock.Unlock() - return nil, errPeerWaitTimeout } } From e3138b6026a553d5ca4298cedb3de975af93f4e3 Mon Sep 17 00:00:00 2001 From: "larry.lx" Date: Wed, 19 Apr 2023 14:32:39 +0800 Subject: [PATCH 3/3] log: change the log level --- eth/peerset.go | 4 +--- p2p/peer.go | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/eth/peerset.go b/eth/peerset.go index 91f1e5ed04..5dfb73934a 100644 --- a/eth/peerset.go +++ b/eth/peerset.go @@ -30,7 +30,6 @@ import ( "github.com/ethereum/go-ethereum/eth/protocols/eth" "github.com/ethereum/go-ethereum/eth/protocols/snap" "github.com/ethereum/go-ethereum/eth/protocols/trust" - "github.com/ethereum/go-ethereum/log" "github.com/ethereum/go-ethereum/p2p" ) @@ -404,8 +403,6 @@ func (ps *peerSet) waitBscExtension(peer *eth.Peer) (*bsc.Peer, error) { return peer, nil case <-time.After(extensionWaitTimeout): - // once timeout is reached, bsc protocol won't be enabled on this peer. - log.Warn("waitBscExtension", "peer", id, "bsc protocol timeout, it won't be enabled.") // could be deadlock, so we use TryLock to avoid it. if ps.lock.TryLock() { delete(ps.bscWait, id) @@ -416,6 +413,7 @@ func (ps *peerSet) waitBscExtension(peer *eth.Peer) (*bsc.Peer, error) { for { select { case <-wait: + // discard the peer, even though the peer arrived. return nil, errPeerWaitTimeout case <-time.After(tryWaitTimeout): if ps.lock.TryLock() { diff --git a/p2p/peer.go b/p2p/peer.go index 84efde6a4f..1a110dffa6 100644 --- a/p2p/peer.go +++ b/p2p/peer.go @@ -435,7 +435,7 @@ func (p *Peer) startProtocols(writeStart <-chan struct{}, writeErr chan<- error) p.log.Trace(fmt.Sprintf("Protocol %s/%d returned", proto.Name, proto.Version)) err = errProtocolReturned } else if !errors.Is(err, io.EOF) { - p.log.Trace(fmt.Sprintf("Protocol %s/%d failed", proto.Name, proto.Version), "err", err) + p.log.Warn(fmt.Sprintf("Protocol %s/%d failed", proto.Name, proto.Version), "err", err) } p.protoErr <- err }()