-
Notifications
You must be signed in to change notification settings - Fork 189
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Adds exponential backoff to re-spawing new streams for supposedly dead peers #483
Changes from 3 commits
c3a6760
5e8ec29
7c58f7a
42e310b
90cdd55
b3f58bc
a9f4edf
a77d435
6e4b2f8
2761b98
6401d8b
4c94e5f
e260291
c74ae78
7f815f0
6ebc292
8b64966
c00510e
e9d42fa
eede9ba
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,7 +3,6 @@ package pubsub | |
import ( | ||
"bufio" | ||
"context" | ||
"fmt" | ||
"io" | ||
"time" | ||
|
||
|
@@ -123,20 +122,15 @@ func (p *PubSub) handleNewPeer(ctx context.Context, pid peer.ID, outgoing <-chan | |
} | ||
} | ||
|
||
func (p *PubSub) handleNewPeerWithBackoff(ctx context.Context, pid peer.ID, outgoing <-chan *RPC) error { | ||
delay, valid := p.deadPeerBackoff.updateAndGet(pid) | ||
if !valid { | ||
return fmt.Errorf("backoff attempts to %s expired after reaching maximum allowed", pid) | ||
} | ||
func (p *PubSub) handleNewPeerWithBackoff(ctx context.Context, pid peer.ID, outgoing <-chan *RPC) { | ||
delay := p.deadPeerBackoff.updateAndGet(pid) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we need to add a failure more if we have backed off too much and simply give up; say we try up to 10 times and then How does that sound? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. maybe 10 is even too much, 3-4 attempts should be enough. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||
|
||
select { | ||
case <-time.After(delay): | ||
p.handleNewPeer(ctx, pid, outgoing) | ||
case <-ctx.Done(): | ||
return fmt.Errorf("context cancelled") | ||
return | ||
} | ||
|
||
return nil | ||
} | ||
|
||
func (p *PubSub) handlePeerEOF(ctx context.Context, s network.Stream) { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -683,19 +683,15 @@ func (p *PubSub) handleDeadPeers() { | |
|
||
close(ch) | ||
|
||
if p.host.Network().Connectedness(pid) == network.Connected { | ||
if p.host.Network().Connectedness(pid) == network.Connected && | ||
!p.deadPeerBackoff.peerExceededBackoffThreshold(pid) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we might want to (debug) log this. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
|
||
// still connected, must be a duplicate connection being closed. | ||
// we respawn the writer as we need to ensure there is a stream active | ||
log.Debugf("peer declared dead but still connected; respawning writer: %s", pid) | ||
messages := make(chan *RPC, p.peerOutboundQueueSize) | ||
messages <- p.getHelloPacket() | ||
go func() { | ||
err := p.handleNewPeerWithBackoff(p.ctx, pid, messages) | ||
if err != nil { | ||
log.Warnf("could not handle backoff to new peer %s", err) | ||
close(messages) | ||
} | ||
}() | ||
go p.handleNewPeerWithBackoff(p.ctx, pid, messages) | ||
p.peers[pid] = messages | ||
continue | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
let's get the time after checking the max attempts, will avoid the gettimeofday call in that case.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please read my reply to the below comment as this part has got changed.