From aae09ae50df0934a8d839332993e49a5398dab32 Mon Sep 17 00:00:00 2001
From: Juan Batiz-Benet
Date: Tue, 20 Jan 2015 07:38:20 -0800
Subject: [PATCH] core/bootstrap: cleaned up bootstrapping

Moved it to its own package to isolate scope.
---
 core/bootstrap.go            | 203 +++++++++++++++++++++++++----------
 core/core.go                 |  22 +---
 routing/dht/dht_bootstrap.go |  66 +++++-------
 3 files changed, 179 insertions(+), 112 deletions(-)

diff --git a/core/bootstrap.go b/core/bootstrap.go
index f8da807bf71d..74b2253ffe27 100644
--- a/core/bootstrap.go
+++ b/core/bootstrap.go
@@ -2,6 +2,7 @@ package core
 
 import (
 	"errors"
+	"fmt"
 	"math/rand"
 	"sync"
 	"time"
@@ -16,109 +17,187 @@ import (
 	context "github.com/jbenet/go-ipfs/Godeps/_workspace/src/code.google.com/p/go.net/context"
 	ma "github.com/jbenet/go-ipfs/Godeps/_workspace/src/github.com/jbenet/go-multiaddr"
+	goprocess "github.com/jbenet/go-ipfs/Godeps/_workspace/src/github.com/jbenet/goprocess"
+	periodicproc "github.com/jbenet/go-ipfs/Godeps/_workspace/src/github.com/jbenet/goprocess/periodic"
 )
 
+// ErrNotEnoughBootstrapPeers signals that we do not have enough bootstrap
+// peers to bootstrap correctly.
+var ErrNotEnoughBootstrapPeers = errors.New("not enough bootstrap peers to bootstrap")
+
 const (
-	period = 30 * time.Second // how often to check connection status
-	connectiontimeout time.Duration = period / 3 // duration to wait when attempting to connect
-	recoveryThreshold = 4 // attempt to bootstrap if connection count falls below this value
-	numDHTBootstrapQueries = 15 // number of DHT queries to execute
+	// BootstrapPeriod governs the periodic interval at which the node will
+	// attempt to bootstrap. The bootstrap process is not very expensive, so
+	// this period can afford to be small (<=30s).
+	BootstrapPeriod = 30 * time.Second
+
+	// BootstrapPeerThreshold governs the node Bootstrap process. If the node
+	// has fewer open connections than this number, it will open connections
+	// to the bootstrap nodes. From there, the routing system should be able
+	// to use the connections to the bootstrap nodes to connect to even more
+	// peers. Routing systems like the IpfsDHT do so in their own Bootstrap
+	// process, which issues random queries to find more peers.
+	BootstrapPeerThreshold = 4
+
+	// BootstrapConnectionTimeout determines how long to wait for a bootstrap
+	// connection attempt before cancelling it.
+	BootstrapConnectionTimeout time.Duration = BootstrapPeriod / 3
 )
 
-func superviseConnections(parent context.Context,
-	h host.Host,
-	route *dht.IpfsDHT, // TODO depend on abstract interface for testing purposes
-	store peer.Peerstore,
-	peers []peer.PeerInfo) error {
+// nodeBootstrapper is a small object used to bootstrap an IpfsNode.
+type nodeBootstrapper struct {
+	node *IpfsNode
+}
 
-	var dhtAlreadyBootstrapping bool
+// TryToBootstrap starts IpfsNode bootstrapping. This function will run an
+// initial bootstrapping phase before exiting: connect to several bootstrap
+// nodes. This allows callers to call this function synchronously to:
+// - check if an error occurs (bootstrapping unsuccessful)
+// - wait before starting services which require the node to be bootstrapped
+//
+// If bootstrapping initially fails, Bootstrap() will try again for a total of
+// three times, before giving up completely. Note that in environments where a
+// node may be initialized offline, as part of normal operation, BootstrapForever()
+// should be used instead.
+//
+// Note: this function could be much cleaner if we were to relax the constraint
+// that we want to exit **after** we have performed initial bootstrapping (and are
+// thus connected to nodes). The constraint may not be that useful in practice.
+// Consider cases when we initialize the node while disconnected from the internet.
+// We don't want this launch to fail... want to continue launching the node, hoping
+// that bootstrapping will work in the future if we get connected.
+func (nb *nodeBootstrapper) TryToBootstrap(ctx context.Context, peers []peer.PeerInfo) error {
+	n := nb.node
+
+	// TODO what bootstrapping should happen if there is no DHT? i.e. we could
+	// continue connecting to our bootstrap peers, but for what purpose? for now
+	// simply exit without connecting to any of them. When we introduce another
+	// routing system that uses bootstrap peers we can change this.
+	dht, ok := n.Routing.(*dht.IpfsDHT)
+	if !ok {
+		return nil
+	}
 
-	for {
-		ctx, _ := context.WithTimeout(parent, connectiontimeout)
-		// TODO get config from disk so |peers| always reflects the latest
-		// information
-		if err := bootstrap(ctx, h, route, store, peers); err != nil {
-			log.Error(err)
+	for i := 0; i < 3; i++ {
+		if err := bootstrapRound(ctx, n.PeerHost, dht, n.Peerstore, peers); err != nil {
+			return err
 		}
+	}
 
-		if !dhtAlreadyBootstrapping {
-			dhtAlreadyBootstrapping = true // only call dht.Bootstrap once.
-			if _, err := route.Bootstrap(); err != nil {
-				log.Error(err)
-			}
+	// at this point we have done at least one round of initial bootstrap.
+	// we're ready to kick off dht bootstrapping.
+	dbproc, err := dht.Bootstrap(ctx)
+	if err != nil {
+		return err
+	}
+
+	// kick off the node's periodic bootstrapping
+	proc := periodicproc.Tick(BootstrapPeriod, func(worker goprocess.Process) {
+		if err := bootstrapRound(ctx, n.PeerHost, dht, n.Peerstore, peers); err != nil {
+			log.Error(err)
 		}
+	})
+
+	// add dht bootstrap proc as a child, so it is closed automatically when we are.
+	proc.AddChild(dbproc)
+
+	// we were given a context. instead of returning proc for the caller
+	// to manage, for now we just close the proc when context is done.
+	go func() {
+		<-ctx.Done()
+		proc.Close()
+	}()
+	return nil
+}
 
-		select {
-		case <-parent.Done():
-			return parent.Err()
-		case <-time.Tick(period):
+// BootstrapForever starts IpfsNode bootstrapping. Unlike TryToBootstrap(),
+// BootstrapForever() will run indefinitely (until its context is cancelled).
+// This is particularly useful for the daemon and other services, which may
+// be started offline and will come online at a future date.
+//
+// TODO: check offline --to--> online case works well and doesn't hurt perf.
+// We may still be dialing. We should check network config.
+func (nb *nodeBootstrapper) BootstrapForever(ctx context.Context, peers []peer.PeerInfo) error {
+	for {
+		if err := nb.TryToBootstrap(ctx, peers); err == nil {
+			return nil
 		}
 	}
-	return nil
 }
 
-func bootstrap(ctx context.Context,
-	h host.Host,
-	r *dht.IpfsDHT,
-	ps peer.Peerstore,
+func bootstrapRound(ctx context.Context,
+	host host.Host,
+	route *dht.IpfsDHT,
+	peerstore peer.Peerstore,
 	bootstrapPeers []peer.PeerInfo) error {
 
-	connectedPeers := h.Network().Peers()
-	if len(connectedPeers) >= recoveryThreshold {
-		log.Event(ctx, "bootstrapSkip", h.ID())
-		log.Debugf("%s core bootstrap skipped -- connected to %d (> %d) nodes",
-			h.ID(), len(connectedPeers), recoveryThreshold)
+	ctx, _ = context.WithTimeout(ctx, BootstrapConnectionTimeout)
+	// determine how many bootstrap connections to open
+	connectedPeers := host.Network().Peers()
+	if len(connectedPeers) >= BootstrapPeerThreshold {
+		log.Event(ctx, "bootstrapSkip", host.ID())
+		log.Debugf("%s core bootstrap skipped -- connected to %d (> %d) nodes",
+			host.ID(), len(connectedPeers), BootstrapPeerThreshold)
 		return nil
 	}
 
-	numCxnsToCreate := recoveryThreshold - len(connectedPeers)
-
-	log.Event(ctx, "bootstrapStart", h.ID())
-	log.Debugf("%s core bootstrapping to %d more nodes", h.ID(), numCxnsToCreate)
+	numCxnsToCreate := BootstrapPeerThreshold - len(connectedPeers)
 
+	// filter out bootstrap nodes we are already connected to
 	var notConnected []peer.PeerInfo
 	for _, p := range bootstrapPeers {
-		if h.Network().Connectedness(p.ID) != inet.Connected {
+		if host.Network().Connectedness(p.ID) != inet.Connected {
 			notConnected = append(notConnected, p)
 		}
 	}
 
-	// if not connected to all bootstrap peer candidates
-	if len(notConnected) > 0 {
-		var randomSubset = randomSubsetOfPeers(notConnected, numCxnsToCreate)
-		log.Debugf("%s bootstrapping to %d nodes: %s", h.ID(), numCxnsToCreate, randomSubset)
-		if err := connect(ctx, ps, r, randomSubset); err != nil {
-			log.Event(ctx, "bootstrapError", h.ID(), lgbl.Error(err))
-			log.Errorf("%s bootstrap error: %s", h.ID(), err)
-			return err
-		}
+	// if connected to all bootstrap peer candidates, exit
+	if len(notConnected) < 1 {
+		log.Debugf("%s no more bootstrap peers to create %d connections", host.ID(), numCxnsToCreate)
+		return ErrNotEnoughBootstrapPeers
+	}
+
+	// connect to a random subset of bootstrap candidates
+	var randomSubset = randomSubsetOfPeers(notConnected, numCxnsToCreate)
+	log.Event(ctx, "bootstrapStart", host.ID())
+	log.Debugf("%s bootstrapping to %d nodes: %s", host.ID(), numCxnsToCreate, randomSubset)
+	if err := bootstrapConnect(ctx, peerstore, route, randomSubset); err != nil {
+		log.Event(ctx, "bootstrapError", host.ID(), lgbl.Error(err))
+		log.Errorf("%s bootstrap error: %s", host.ID(), err)
+		return err
 	}
 	return nil
 }
 
-func connect(ctx context.Context, ps peer.Peerstore, r *dht.IpfsDHT, peers []peer.PeerInfo) error {
+func bootstrapConnect(ctx context.Context,
+	ps peer.Peerstore,
+	route *dht.IpfsDHT,
+	peers []peer.PeerInfo) error {
 	if len(peers) < 1 {
-		return errors.New("bootstrap set empty")
+		return ErrNotEnoughBootstrapPeers
 	}
 
+	errs := make(chan error, len(peers))
 	var wg sync.WaitGroup
 	for _, p := range peers {
 		// performed asynchronously because when performed synchronously, if
 		// one `Connect` call hangs, subsequent calls are more likely to
 		// fail/abort due to an expiring context.
+		// Also, performed asynchronously for dial speed.
 		wg.Add(1)
 		go func(p peer.PeerInfo) {
 			defer wg.Done()
-			log.Event(ctx, "bootstrapDial", r.LocalPeer(), p.ID)
-			log.Debugf("%s bootstrapping to %s", r.LocalPeer(), p.ID)
+			log.Event(ctx, "bootstrapDial", route.LocalPeer(), p.ID)
+			log.Debugf("%s bootstrapping to %s", route.LocalPeer(), p.ID)
 
 			ps.AddAddresses(p.ID, p.Addrs)
-			err := r.Connect(ctx, p.ID)
+			err := route.Connect(ctx, p.ID)
 			if err != nil {
 				log.Event(ctx, "bootstrapFailed", p.ID)
-				log.Criticalf("failed to bootstrap with %v: %s", p.ID, err)
+				log.Errorf("failed to bootstrap with %v: %s", p.ID, err)
+				errs <- err
 				return
 			}
 			log.Event(ctx, "bootstrapSuccess", p.ID)
@@ -126,6 +205,20 @@ func connect(ctx context.Context, ps peer.Peerstore, r *dht.IpfsDHT, peers []pee
 		}(p)
 	}
 	wg.Wait()
+
+	// our failure condition is when no connection attempt succeeded.
+	// So drain the errs channel, counting the results.
+	close(errs)
+	count := 0
+	var err error
+	for err = range errs {
+		if err != nil {
+			count++
+		}
+	}
+	if count == len(peers) {
+		return fmt.Errorf("failed to bootstrap. %s", err)
+	}
 	return nil
 }
 
diff --git a/core/core.go b/core/core.go
index a2e03f2155a4..d5ece35e9bf6 100644
--- a/core/core.go
+++ b/core/core.go
@@ -297,30 +297,12 @@ func (n *IpfsNode) Resolve(k util.Key) (*merkledag.Node, error) {
 func (n *IpfsNode) Bootstrap(ctx context.Context, peers []peer.PeerInfo) error {
 
 	// TODO what should return value be when in offlineMode?
-
 	if n.Routing == nil {
 		return nil
 	}
 
-	// TODO what bootstrapping should happen if there is no DHT? i.e. we could
-	// continue connecting to our bootstrap peers, but for what purpose?
-	dhtRouting, ok := n.Routing.(*dht.IpfsDHT)
-	if !ok {
-		return nil
-	}
-
-	// TODO consider moving connection supervision into the Network. We've
-	// discussed improvements to this Node constructor. One improvement
-	// would be to make the node configurable, allowing clients to inject
-	// an Exchange, Network, or Routing component and have the constructor
-	// manage the wiring. In that scenario, this dangling function is a bit
-	// awkward.
-
-	// spin off the node's connection supervisor.
-	// TODO, clean up how this thing works. Make the superviseConnections thing
-	// work like the DHT.Bootstrap.
-	go superviseConnections(ctx, n.PeerHost, dhtRouting, n.Peerstore, peers)
-	return nil
+	nb := nodeBootstrapper{n}
+	return nb.TryToBootstrap(ctx, peers)
 }
 
 func (n *IpfsNode) loadID() error {
diff --git a/routing/dht/dht_bootstrap.go b/routing/dht/dht_bootstrap.go
index 095c194d676e..c3991972ce4f 100644
--- a/routing/dht/dht_bootstrap.go
+++ b/routing/dht/dht_bootstrap.go
@@ -14,6 +14,7 @@ import (
 
 	context "github.com/jbenet/go-ipfs/Godeps/_workspace/src/code.google.com/p/go.net/context"
 	goprocess "github.com/jbenet/go-ipfs/Godeps/_workspace/src/github.com/jbenet/goprocess"
+	periodicproc "github.com/jbenet/go-ipfs/Godeps/_workspace/src/github.com/jbenet/goprocess/periodic"
 )
 
 // DefaultBootstrapQueries specifies how many queries to run,
@@ -54,9 +55,9 @@ const DefaultBootstrapTimeout = time.Duration(10 * time.Second)
 // and connected to at least a few nodes.
 //
 // Like PeriodicBootstrap, Bootstrap returns a process, so the user can stop it.
-func (dht *IpfsDHT) Bootstrap() (goprocess.Process, error) {
+func (dht *IpfsDHT) Bootstrap(ctx context.Context) (goprocess.Process, error) {
 
-	if err := dht.runBootstrap(dht.Context(), DefaultBootstrapQueries); err != nil {
+	if err := dht.runBootstrap(ctx, DefaultBootstrapQueries); err != nil {
 		return nil, err
 	}
 
@@ -79,41 +80,32 @@ func (dht *IpfsDHT) BootstrapOnSignal(queries int, signal <-chan time.Time) (gop
 		return nil, fmt.Errorf("invalid signal: %v", signal)
 	}
 
-	proc := goprocess.Go(func(worker goprocess.Process) {
-		defer log.Debug("dht bootstrapper shutting down")
-		for {
-			select {
-			case <-worker.Closing():
-				return
-
-			case <-signal:
-				// it would be useful to be able to send out signals of when we bootstrap, too...
-				// maybe this is a good case for whole module event pub/sub?
-
-				ctx := dht.Context()
-				if err := dht.runBootstrap(ctx, queries); err != nil {
-					log.Error(err)
-					// A bootstrapping error is important to notice but not fatal.
-					// maybe the client should be able to consume these errors,
-					// though I dont have a clear use case in mind-- what **could**
-					// the client do if one of the bootstrap calls fails?
-					//
-					// This is also related to the core's bootstrap failures.
-					// superviseConnections should perhaps allow clients to detect
-					// bootstrapping problems.
-					//
-					// Anyway, passing errors could be done with a bootstrapper object.
-					// this would imply the client should be able to consume a lot of
-					// other non-fatal dht errors too. providing this functionality
-					// should be done correctly DHT-wide.
-					// NB: whatever the design, clients must ensure they drain errors!
-					// This pattern is common to many things, perhaps long-running services
-					// should have something like an ErrStream that allows clients to consume
-					// periodic errors and take action. It should allow the user to also
-					// ignore all errors with something like an ErrStreamDiscard. We should
-					// study what other systems do for ideas.
-				}
-			}
+	proc := periodicproc.Ticker(signal, func(worker goprocess.Process) {
+		// it would be useful to be able to send out signals of when we bootstrap, too...
+		// maybe this is a good case for whole module event pub/sub?
+
+		ctx := dht.Context()
+		if err := dht.runBootstrap(ctx, queries); err != nil {
+			log.Error(err)
+			// A bootstrapping error is important to notice but not fatal.
+			// maybe the client should be able to consume these errors,
+			// though I dont have a clear use case in mind-- what **could**
+			// the client do if one of the bootstrap calls fails?
+			//
+			// This is also related to the core's bootstrap failures.
+			// superviseConnections should perhaps allow clients to detect
+			// bootstrapping problems.
+			//
+			// Anyway, passing errors could be done with a bootstrapper object.
+			// this would imply the client should be able to consume a lot of
+			// other non-fatal dht errors too. providing this functionality
+			// should be done correctly DHT-wide.
+			// NB: whatever the design, clients must ensure they drain errors!
+			// This pattern is common to many things, perhaps long-running services
+			// should have something like an ErrStream that allows clients to consume
+			// periodic errors and take action. It should allow the user to also
+			// ignore all errors with something like an ErrStreamDiscard. We should
+			// study what other systems do for ideas.
 		}
 	})
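
For reference, below is a standalone Go sketch (not part of the patch) of the concurrency pattern bootstrapConnect uses above: dial every peer in its own goroutine, collect errors in a channel buffered to len(peers), and treat the round as failed only when every attempt failed. The names connectAll, dial, and the peer strings are hypothetical stand-ins for bootstrapConnect, route.Connect, and peer.PeerInfo; only the standard library is assumed.

package main

import (
	"errors"
	"fmt"
	"sync"
)

// connectAll dials every peer concurrently and returns nil if at least one
// dial succeeded. It returns the last observed error only when all dials
// fail, mirroring the "failure condition is when no connection attempt
// succeeded" logic in bootstrapConnect.
func connectAll(peers []string, dial func(string) error) error {
	if len(peers) < 1 {
		return errors.New("not enough peers to connect")
	}

	// buffered so each goroutine can report its error without blocking
	errs := make(chan error, len(peers))
	var wg sync.WaitGroup
	for _, p := range peers {
		wg.Add(1)
		go func(p string) {
			defer wg.Done()
			if err := dial(p); err != nil {
				errs <- err
			}
		}(p)
	}
	wg.Wait()

	// drain the channel, counting failures; only total failure is an error
	close(errs)
	count := 0
	var err error
	for err = range errs {
		count++
	}
	if count == len(peers) {
		return fmt.Errorf("failed to connect to any peer: %s", err)
	}
	return nil
}

func main() {
	peers := []string{"peerA", "peerB", "peerC"}
	err := connectAll(peers, func(p string) error {
		if p == "peerB" {
			return nil // pretend only peerB is reachable
		}
		return fmt.Errorf("dial %s: unreachable", p)
	})
	fmt.Println("bootstrap result:", err) // prints: bootstrap result: <nil>
}

Buffering the channel to len(peers) means a goroutine can always deliver its error without blocking, so wg.Wait() cannot deadlock even though the drain loop only runs after all dials return.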