routing+routerrpc: improve prob. estimation for untried connections

This commit changes mission control to partially base the estimated probability for untried connections on historical results obtained in previous payment attempts. This incentivizes routing nodes to keep all of their channels in good shape.
lightningnetwork · Oct 22, 2019 · 1fac41d · 1fac41d
1 parent fab1390
commit 1fac41d
Show file tree

Hide file tree

Showing 9 changed files with 410 additions and 97 deletions.
diff --git a/lnrpc/routerrpc/config.go b/lnrpc/routerrpc/config.go
@@ -16,6 +16,15 @@ type RoutingConfig struct {
 	// a route when no other information is available.
 	AprioriHopProbability float64 `long:"apriorihopprob" description:"Assumed success probability of a hop in a route when no other information is available."`
 
+	// AprioriWeight is a value in the range [0, 1] that defines to what
+	// extent historical results should be extrapolated to untried
+	// connections. Setting it to one will completely ignore historical
+	// results and always assume the configured a priori probability for
+	// untried connections. A value of zero will ignore the a priori
+	// probability completely and only base the probability on historical
+	// results, unless there are none available.
+	AprioriWeight float64 `long:"aprioriweight" description:"Weight of the a priori probability in success probability estimation. Valid values are in [0, 1]."`
+
 	// PenaltyHalfLife defines after how much time a penalized node or
 	// channel is back at 50% probability.
 	PenaltyHalfLife time.Duration `long:"penaltyhalflife" description:"Defines the duration after which a penalized node or channel is back at 50% probability"`

diff --git a/lnrpc/routerrpc/config_active.go b/lnrpc/routerrpc/config_active.go
@@ -45,6 +45,7 @@ type Config struct {
 func DefaultConfig() *Config {
 	defaultRoutingConfig := RoutingConfig{
 		AprioriHopProbability: routing.DefaultAprioriHopProbability,
+		AprioriWeight:         routing.DefaultAprioriWeight,
 		MinRouteProbability:   routing.DefaultMinRouteProbability,
 		PenaltyHalfLife:       routing.DefaultPenaltyHalfLife,
 		AttemptCost: routing.DefaultPaymentAttemptPenalty.
@@ -61,6 +62,7 @@ func DefaultConfig() *Config {
 func GetRoutingConfig(cfg *Config) *RoutingConfig {
 	return &RoutingConfig{
 		AprioriHopProbability: cfg.AprioriHopProbability,
+		AprioriWeight:         cfg.AprioriWeight,
 		MinRouteProbability:   cfg.MinRouteProbability,
 		AttemptCost:           cfg.AttemptCost,
 		PenaltyHalfLife:       cfg.PenaltyHalfLife,

diff --git a/lnrpc/routerrpc/config_default.go b/lnrpc/routerrpc/config_default.go
@@ -18,6 +18,7 @@ func DefaultConfig() *Config {
 func GetRoutingConfig(cfg *Config) *RoutingConfig {
 	return &RoutingConfig{
 		AprioriHopProbability: routing.DefaultAprioriHopProbability,
+		AprioriWeight:         routing.DefaultAprioriWeight,
 		MinRouteProbability:   routing.DefaultMinRouteProbability,
 		AttemptCost: routing.DefaultPaymentAttemptPenalty.
 			ToSatoshis(),

diff --git a/routing/missioncontrol.go b/routing/missioncontrol.go
@@ -1,7 +1,6 @@
 package routing
 
 import (
-	"math"
 	"sync"
 	"time"
 
@@ -47,6 +46,10 @@ const (
 	// prevSuccessProbability is the assumed probability for node pairs that
 	// successfully relayed the previous attempt.
 	prevSuccessProbability = 0.95
+
+	// DefaultAprioriWeight is the default a priori weight. See
+	// MissionControlConfig for further explanation.
+	DefaultAprioriWeight = 0.5
 )
 
 // NodeResults contains previous results from a node to its peers.
@@ -68,9 +71,6 @@ type MissionControl struct {
 	// particular node.
 	lastPairResult map[route.Vertex]NodeResults
 
-	// lastNodeFailure tracks the last node level failure per node.
-	lastNodeFailure map[route.Vertex]time.Time
-
 	// lastSecondChance tracks the last time a second chance was granted for
 	// a directed node pair.
 	lastSecondChance map[DirectedNodePair]time.Time
@@ -83,6 +83,10 @@ type MissionControl struct {
 
 	store *missionControlStore
 
+	// estimator is the probability estimator that is used with the payment
+	// results that mission control collects.
+	estimator *probabilityEstimator
+
 	sync.Mutex
 
 	// TODO(roasbeef): further counters, if vertex continually unavailable,
@@ -105,6 +109,15 @@ type MissionControlConfig struct {
 	// MaxMcHistory defines the maximum number of payment results that are
 	// held on disk.
 	MaxMcHistory int
+
+	// AprioriWeight is a value in the range [0, 1] that defines to what
+	// extent historical results should be extrapolated to untried
+	// connections. Setting it to one will completely ignore historical
+	// results and always assume the configured a priori probability for
+	// untried connections. A value of zero will ignore the a priori
+	// probability completely and only base the probability on historical
+	// results, unless there are none available.
+	AprioriWeight float64
 }
 
 // timedPairResult describes a timestamped pair result.
@@ -157,21 +170,29 @@ func NewMissionControl(db *bbolt.DB, cfg *MissionControlConfig) (
 	*MissionControl, error) {
 
 	log.Debugf("Instantiating mission control with config: "+
-		"PenaltyHalfLife=%v, AprioriHopProbability=%v",
-		cfg.PenaltyHalfLife, cfg.AprioriHopProbability)
+		"PenaltyHalfLife=%v, AprioriHopProbability=%v, "+
+		"AprioriWeight=%v", cfg.PenaltyHalfLife,
+		cfg.AprioriHopProbability, cfg.AprioriWeight)
 
 	store, err := newMissionControlStore(db, cfg.MaxMcHistory)
 	if err != nil {
 		return nil, err
 	}
 
+	estimator := &probabilityEstimator{
+		aprioriHopProbability:  cfg.AprioriHopProbability,
+		aprioriWeight:          cfg.AprioriWeight,
+		penaltyHalfLife:        cfg.PenaltyHalfLife,
+		prevSuccessProbability: prevSuccessProbability,
+	}
+
 	mc := &MissionControl{
 		lastPairResult:   make(map[route.Vertex]NodeResults),
-		lastNodeFailure:  make(map[route.Vertex]time.Time),
 		lastSecondChance: make(map[DirectedNodePair]time.Time),
 		now:              time.Now,
 		cfg:              cfg,
 		store:            store,
+		estimator:        estimator,
 	}
 
 	if err := mc.init(); err != nil {
@@ -213,7 +234,6 @@ func (m *MissionControl) ResetHistory() error {
 	}
 
 	m.lastPairResult = make(map[route.Vertex]NodeResults)
-	m.lastNodeFailure = make(map[route.Vertex]time.Time)
 	m.lastSecondChance = make(map[DirectedNodePair]time.Time)
 
 	log.Debugf("Mission control history cleared")
@@ -229,100 +249,40 @@ func (m *MissionControl) GetProbability(fromNode, toNode route.Vertex,
 	m.Lock()
 	defer m.Unlock()
 
-	return m.getPairProbability(fromNode, toNode, amt)
-}
-
-// getProbAfterFail returns a probability estimate based on a last failure time.
-func (m *MissionControl) getProbAfterFail(lastFailure time.Time) float64 {
-	if lastFailure.IsZero() {
-		return m.cfg.AprioriHopProbability
-	}
-
-	timeSinceLastFailure := m.now().Sub(lastFailure)
-
-	// Calculate success probability based on the weight of the last
-	// failure. When the failure is fresh, its weight is 1 and we'll return
-	// probability 0. Over time the probability recovers to the a priori
-	// probability.
-	weight := m.getWeight(timeSinceLastFailure)
-	probability := m.cfg.AprioriHopProbability * (1 - weight)
-
-	return probability
-}
+	now := m.now()
+	results := m.lastPairResult[fromNode]
 
-// getWeight calculates a weight in the range [0, 1] that should be assigned to
-// a payment result. Weight follows an exponential curve that starts at 1 when
-// the result is fresh and asymptotically approaches zero over time. The rate at
-// which this happens is controlled by the penaltyHalfLife parameter.
-func (m *MissionControl) getWeight(age time.Duration) float64 {
-	exp := -age.Hours() / m.cfg.PenaltyHalfLife.Hours()
-	return math.Pow(2, exp)
-}
-
-// getLastPairResult gets the last recorded result for a node pair.
-func (m *MissionControl) getLastPairResult(fromNode,
-	toNode route.Vertex) *timedPairResult {
-
-	nodePairs, ok := m.lastPairResult[fromNode]
-	if !ok {
-		return nil
-	}
-
-	lastResult, ok := nodePairs[toNode]
-	if !ok {
-		return nil
-	}
-
-	return &lastResult
+	return m.estimator.getPairProbability(now, results, toNode, amt)
 }
 
 // setLastPairResult stores a result for a node pair.
 func (m *MissionControl) setLastPairResult(fromNode,
-	toNode route.Vertex, result *timedPairResult) {
+	toNode route.Vertex, result timedPairResult) {
 
 	nodePairs, ok := m.lastPairResult[fromNode]
 	if !ok {
 		nodePairs = make(NodeResults)
 		m.lastPairResult[fromNode] = nodePairs
 	}
 
-	nodePairs[toNode] = *result
+	nodePairs[toNode] = result
 }
 
-// getPairProbability estimates the probability of successfully
-// traversing from fromNode to toNode based on historical payment outcomes.
-func (m *MissionControl) getPairProbability(fromNode,
-	toNode route.Vertex, amt lnwire.MilliSatoshi) float64 {
-
-	// Start by getting the last node level failure. A node failure is
-	// considered a failure that would have affected every edge. Therefore
-	// we insert a node level failure into the history of every channel. If
-	// there is none, lastFail will be zero.
-	lastFail := m.lastNodeFailure[fromNode]
-
-	// Retrieve the last pair outcome.
-	lastPairResult := m.getLastPairResult(fromNode, toNode)
-
-	// Only look at the last pair outcome if it happened after the last node
-	// level failure. Otherwise the node level failure is the most recent
-	// and used as the basis for calculation of the probability.
-	if lastPairResult != nil && lastPairResult.timestamp.After(lastFail) {
-		if lastPairResult.success {
-			return prevSuccessProbability
-		}
+// setAllFail stores a fail result for all known connections of the given node.
+func (m *MissionControl) setAllFail(fromNode route.Vertex,
+	timestamp time.Time) {
 
-		// Take into account a minimum penalize amount. For balance
-		// errors, a failure may be reported with such a minimum to
-		// prevent too aggresive penalization. We only take into account
-		// a previous failure if the amount that we currently get the
-		// probability for is greater or equal than the minPenalizeAmt
-		// of the previous failure.
-		if amt >= lastPairResult.minPenalizeAmt {
-			lastFail = lastPairResult.timestamp
-		}
+	nodePairs, ok := m.lastPairResult[fromNode]
+	if !ok {
+		return
 	}
 
-	return m.getProbAfterFail(lastFail)
+	for connection := range nodePairs {
+		nodePairs[connection] = timedPairResult{
+			timestamp:  timestamp,
+			pairResult: failPairResult(0),
+		}
+	}
 }
 
 // requestSecondChance checks whether the node fromNode can have a second chance
@@ -363,8 +323,7 @@ func (m *MissionControl) GetHistorySnapshot() *MissionControlSnapshot {
 	defer m.Unlock()
 
 	log.Debugf("Requesting history snapshot from mission control: "+
-		"node_failure_count=%v, pair_result_count=%v",
-		len(m.lastNodeFailure), len(m.lastPairResult))
+		"pair_result_count=%v", len(m.lastPairResult))
 
 	pairs := make([]MissionControlPairSnapshot, 0, len(m.lastPairResult))
 
@@ -475,11 +434,28 @@ func (m *MissionControl) applyPaymentResult(
 		}
 	}
 
+	// If there is a node-level failure, record a failure for every tried
+	// connection of that node. A node-level failure can be considered as a
+	// failure that would have occurred with any of the node's channels.
+	//
+	// Ideally we'd also record the failure for the untried connections of
+	// the node. Unfortunately this would require access to the graph and
+	// adding this dependency and db calls does not outweigh the benefits.
+	//
+	// Untried connections will fall back to the node probability. After the
+	// call to setAllFail below, the node probability will be equal to
+	// the probability of the tried channels except that the a priori
+	// probability is mixed in too. This effect is controlled by the
+	// aprioriWeight parameter. If that parameter isn't set to an extreme
+	// and there are a few known connections, there shouldn't be much of a
+	// difference. The largest difference occurs when aprioriWeight is 1. In
+	// that case, a node-level failure would not be applied to untried
+	// channels.
 	if i.nodeFailure != nil {
 		log.Debugf("Reporting node failure to Mission Control: "+
 			"node=%v", *i.nodeFailure)
 
-		m.lastNodeFailure[*i.nodeFailure] = result.timeReply
+		m.setAllFail(*i.nodeFailure, result.timeReply)
 	}
 
 	for pair, pairResult := range i.pairResults {
@@ -492,7 +468,7 @@ func (m *MissionControl) applyPaymentResult(
 				pair, pairResult.minPenalizeAmt)
 		}
 
-		m.setLastPairResult(pair.From, pair.To, &timedPairResult{
+		m.setLastPairResult(pair.From, pair.To, timedPairResult{
 			timestamp:  result.timeReply,
 			pairResult: pairResult,
 		})

diff --git a/routing/missioncontrol_test.go b/routing/missioncontrol_test.go
@@ -34,7 +34,8 @@ var (
 	mcTestNode2 = mcTestRoute.Hops[1].PubKeyBytes
 
 	testPenaltyHalfLife       = 30 * time.Minute
-	testAprioriHopProbability = 0.8
+	testAprioriHopProbability = 0.9
+	testAprioriWeight         = 0.5
 )
 
 type mcTestContext struct {
@@ -78,6 +79,7 @@ func (ctx *mcTestContext) restartMc() {
 		&MissionControlConfig{
 			PenaltyHalfLife:       testPenaltyHalfLife,
 			AprioriHopProbability: testAprioriHopProbability,
+			AprioriWeight:         testAprioriWeight,
 		},
 	)
 	if err != nil {
@@ -136,20 +138,23 @@ func TestMissionControl(t *testing.T) {
 
 	testTime := time.Date(2018, time.January, 9, 14, 00, 00, 0, time.UTC)
 
-	// Initial probability is expected to be 1.
-	ctx.expectP(1000, 0.8)
+	// Initial probability is expected to be the a priori.
+	ctx.expectP(1000, testAprioriHopProbability)
 
 	// Expect probability to be zero after reporting the edge as failed.
 	ctx.reportFailure(1000, lnwire.NewTemporaryChannelFailure(nil))
 	ctx.expectP(1000, 0)
 
 	// As we reported with a min penalization amt, a lower amt than reported
-	// should be unaffected.
+	// should return the node probability, which is the a priori
+	// probability.
 	ctx.expectP(500, testAprioriHopProbability)
 
-	// Edge decay started.
+	// Edge decay started. The node probability weighted average should now
+	// have shifted from 1:1 to 1:0.5 -> 60%. The connection probability is
+	// half way through the recovery, so we expect 30% here.
 	ctx.now = testTime.Add(30 * time.Minute)
-	ctx.expectP(1000, 0.4)
+	ctx.expectP(1000, 0.3)
 
 	// Edge fails again, this time without a min penalization amt. The edge
 	// should be penalized regardless of amount.
@@ -159,11 +164,11 @@ func TestMissionControl(t *testing.T) {
 
 	// Edge decay started.
 	ctx.now = testTime.Add(60 * time.Minute)
-	ctx.expectP(1000, 0.4)
+	ctx.expectP(1000, 0.3)
 
 	// Restart mission control to test persistence.
 	ctx.restartMc()
-	ctx.expectP(1000, 0.4)
+	ctx.expectP(1000, 0.3)
 
 	// A node level failure should bring probability of all known channels
 	// back to zero.