Skip to content

Commit

Permalink
routing+routerrpc: improve prob. estimation for untried connections
Browse files Browse the repository at this point in the history
This commit changes mission control to partially base the estimated
probability for untried connections on historical results obtained in
previous payment attempts. This incentivizes routing nodes to keep all
of their channels in good shape.
  • Loading branch information
joostjager committed Oct 22, 2019
1 parent fab1390 commit 1fac41d
Show file tree
Hide file tree
Showing 9 changed files with 410 additions and 97 deletions.
9 changes: 9 additions & 0 deletions lnrpc/routerrpc/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,15 @@ type RoutingConfig struct {
// a route when no other information is available.
AprioriHopProbability float64 `long:"apriorihopprob" description:"Assumed success probability of a hop in a route when no other information is available."`

// AprioriWeight is a value in the range [0, 1] that defines to what
// extent historical results should be extrapolated to untried
// connections. Setting it to one will completely ignore historical
// results and always assume the configured a priori probability for
// untried connections. A value of zero will ignore the a priori
// probability completely and only base the probability on historical
// results, unless there are none available.
AprioriWeight float64 `long:"aprioriweight" description:"Weight of the a priori probability in success probability estimation. Valid values are in [0, 1]."`

// PenaltyHalfLife defines after how much time a penalized node or
// channel is back at 50% probability.
PenaltyHalfLife time.Duration `long:"penaltyhalflife" description:"Defines the duration after which a penalized node or channel is back at 50% probability"`
Expand Down
2 changes: 2 additions & 0 deletions lnrpc/routerrpc/config_active.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ type Config struct {
func DefaultConfig() *Config {
defaultRoutingConfig := RoutingConfig{
AprioriHopProbability: routing.DefaultAprioriHopProbability,
AprioriWeight: routing.DefaultAprioriWeight,
MinRouteProbability: routing.DefaultMinRouteProbability,
PenaltyHalfLife: routing.DefaultPenaltyHalfLife,
AttemptCost: routing.DefaultPaymentAttemptPenalty.
Expand All @@ -61,6 +62,7 @@ func DefaultConfig() *Config {
func GetRoutingConfig(cfg *Config) *RoutingConfig {
return &RoutingConfig{
AprioriHopProbability: cfg.AprioriHopProbability,
AprioriWeight: cfg.AprioriWeight,
MinRouteProbability: cfg.MinRouteProbability,
AttemptCost: cfg.AttemptCost,
PenaltyHalfLife: cfg.PenaltyHalfLife,
Expand Down
1 change: 1 addition & 0 deletions lnrpc/routerrpc/config_default.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ func DefaultConfig() *Config {
func GetRoutingConfig(cfg *Config) *RoutingConfig {
return &RoutingConfig{
AprioriHopProbability: routing.DefaultAprioriHopProbability,
AprioriWeight: routing.DefaultAprioriWeight,
MinRouteProbability: routing.DefaultMinRouteProbability,
AttemptCost: routing.DefaultPaymentAttemptPenalty.
ToSatoshis(),
Expand Down
154 changes: 65 additions & 89 deletions routing/missioncontrol.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package routing

import (
"math"
"sync"
"time"

Expand Down Expand Up @@ -47,6 +46,10 @@ const (
// prevSuccessProbability is the assumed probability for node pairs that
// successfully relayed the previous attempt.
prevSuccessProbability = 0.95

// DefaultAprioriWeight is the default a priori weight. See
// MissionControlConfig for further explanation.
DefaultAprioriWeight = 0.5
)

// NodeResults contains previous results from a node to its peers.
Expand All @@ -68,9 +71,6 @@ type MissionControl struct {
// particular node.
lastPairResult map[route.Vertex]NodeResults

// lastNodeFailure tracks the last node level failure per node.
lastNodeFailure map[route.Vertex]time.Time

// lastSecondChance tracks the last time a second chance was granted for
// a directed node pair.
lastSecondChance map[DirectedNodePair]time.Time
Expand All @@ -83,6 +83,10 @@ type MissionControl struct {

store *missionControlStore

// estimator is the probability estimator that is used with the payment
// results that mission control collects.
estimator *probabilityEstimator

sync.Mutex

// TODO(roasbeef): further counters, if vertex continually unavailable,
Expand All @@ -105,6 +109,15 @@ type MissionControlConfig struct {
// MaxMcHistory defines the maximum number of payment results that are
// held on disk.
MaxMcHistory int

// AprioriWeight is a value in the range [0, 1] that defines to what
// extent historical results should be extrapolated to untried
// connections. Setting it to one will completely ignore historical
// results and always assume the configured a priori probability for
// untried connections. A value of zero will ignore the a priori
// probability completely and only base the probability on historical
// results, unless there are none available.
AprioriWeight float64
}

// timedPairResult describes a timestamped pair result.
Expand Down Expand Up @@ -157,21 +170,29 @@ func NewMissionControl(db *bbolt.DB, cfg *MissionControlConfig) (
*MissionControl, error) {

log.Debugf("Instantiating mission control with config: "+
"PenaltyHalfLife=%v, AprioriHopProbability=%v",
cfg.PenaltyHalfLife, cfg.AprioriHopProbability)
"PenaltyHalfLife=%v, AprioriHopProbability=%v, "+
"AprioriWeight=%v", cfg.PenaltyHalfLife,
cfg.AprioriHopProbability, cfg.AprioriWeight)

store, err := newMissionControlStore(db, cfg.MaxMcHistory)
if err != nil {
return nil, err
}

estimator := &probabilityEstimator{
aprioriHopProbability: cfg.AprioriHopProbability,
aprioriWeight: cfg.AprioriWeight,
penaltyHalfLife: cfg.PenaltyHalfLife,
prevSuccessProbability: prevSuccessProbability,
}

mc := &MissionControl{
lastPairResult: make(map[route.Vertex]NodeResults),
lastNodeFailure: make(map[route.Vertex]time.Time),
lastSecondChance: make(map[DirectedNodePair]time.Time),
now: time.Now,
cfg: cfg,
store: store,
estimator: estimator,
}

if err := mc.init(); err != nil {
Expand Down Expand Up @@ -213,7 +234,6 @@ func (m *MissionControl) ResetHistory() error {
}

m.lastPairResult = make(map[route.Vertex]NodeResults)
m.lastNodeFailure = make(map[route.Vertex]time.Time)
m.lastSecondChance = make(map[DirectedNodePair]time.Time)

log.Debugf("Mission control history cleared")
Expand All @@ -229,100 +249,40 @@ func (m *MissionControl) GetProbability(fromNode, toNode route.Vertex,
m.Lock()
defer m.Unlock()

return m.getPairProbability(fromNode, toNode, amt)
}

// getProbAfterFail returns a probability estimate based on a last failure time.
func (m *MissionControl) getProbAfterFail(lastFailure time.Time) float64 {
if lastFailure.IsZero() {
return m.cfg.AprioriHopProbability
}

timeSinceLastFailure := m.now().Sub(lastFailure)

// Calculate success probability based on the weight of the last
// failure. When the failure is fresh, its weight is 1 and we'll return
// probability 0. Over time the probability recovers to the a priori
// probability.
weight := m.getWeight(timeSinceLastFailure)
probability := m.cfg.AprioriHopProbability * (1 - weight)

return probability
}
now := m.now()
results := m.lastPairResult[fromNode]

// getWeight calculates a weight in the range [0, 1] that should be assigned to
// a payment result. Weight follows an exponential curve that starts at 1 when
// the result is fresh and asymptotically approaches zero over time. The rate at
// which this happens is controlled by the penaltyHalfLife parameter.
func (m *MissionControl) getWeight(age time.Duration) float64 {
exp := -age.Hours() / m.cfg.PenaltyHalfLife.Hours()
return math.Pow(2, exp)
}

// getLastPairResult gets the last recorded result for a node pair.
func (m *MissionControl) getLastPairResult(fromNode,
toNode route.Vertex) *timedPairResult {

nodePairs, ok := m.lastPairResult[fromNode]
if !ok {
return nil
}

lastResult, ok := nodePairs[toNode]
if !ok {
return nil
}

return &lastResult
return m.estimator.getPairProbability(now, results, toNode, amt)
}

// setLastPairResult stores a result for a node pair.
func (m *MissionControl) setLastPairResult(fromNode,
toNode route.Vertex, result *timedPairResult) {
toNode route.Vertex, result timedPairResult) {

nodePairs, ok := m.lastPairResult[fromNode]
if !ok {
nodePairs = make(NodeResults)
m.lastPairResult[fromNode] = nodePairs
}

nodePairs[toNode] = *result
nodePairs[toNode] = result
}

// getPairProbability estimates the probability of successfully
// traversing from fromNode to toNode based on historical payment outcomes.
func (m *MissionControl) getPairProbability(fromNode,
toNode route.Vertex, amt lnwire.MilliSatoshi) float64 {

// Start by getting the last node level failure. A node failure is
// considered a failure that would have affected every edge. Therefore
// we insert a node level failure into the history of every channel. If
// there is none, lastFail will be zero.
lastFail := m.lastNodeFailure[fromNode]

// Retrieve the last pair outcome.
lastPairResult := m.getLastPairResult(fromNode, toNode)

// Only look at the last pair outcome if it happened after the last node
// level failure. Otherwise the node level failure is the most recent
// and used as the basis for calculation of the probability.
if lastPairResult != nil && lastPairResult.timestamp.After(lastFail) {
if lastPairResult.success {
return prevSuccessProbability
}
// setAllFail stores a fail result for all known connections of the given node.
func (m *MissionControl) setAllFail(fromNode route.Vertex,
timestamp time.Time) {

// Take into account a minimum penalize amount. For balance
// errors, a failure may be reported with such a minimum to
// prevent too aggressive penalization. We only take into account
// a previous failure if the amount that we currently get the
// probability for is greater than or equal to the minPenalizeAmt
// of the previous failure.
if amt >= lastPairResult.minPenalizeAmt {
lastFail = lastPairResult.timestamp
}
nodePairs, ok := m.lastPairResult[fromNode]
if !ok {
return
}

return m.getProbAfterFail(lastFail)
for connection := range nodePairs {
nodePairs[connection] = timedPairResult{
timestamp: timestamp,
pairResult: failPairResult(0),
}
}
}

// requestSecondChance checks whether the node fromNode can have a second chance
Expand Down Expand Up @@ -363,8 +323,7 @@ func (m *MissionControl) GetHistorySnapshot() *MissionControlSnapshot {
defer m.Unlock()

log.Debugf("Requesting history snapshot from mission control: "+
"node_failure_count=%v, pair_result_count=%v",
len(m.lastNodeFailure), len(m.lastPairResult))
"pair_result_count=%v", len(m.lastPairResult))

pairs := make([]MissionControlPairSnapshot, 0, len(m.lastPairResult))

Expand Down Expand Up @@ -475,11 +434,28 @@ func (m *MissionControl) applyPaymentResult(
}
}

// If there is a node-level failure, record a failure for every tried
// connection of that node. A node-level failure can be considered as a
// failure that would have occurred with any of the node's channels.
//
// Ideally we'd also record the failure for the untried connections of
// the node. Unfortunately this would require access to the graph and
// adding this dependency and db calls does not outweigh the benefits.
//
// Untried connections will fall back to the node probability. After the
// call to setAllFail below, the node probability will be equal to
// the probability of the tried channels except that the a priori
// probability is mixed in too. This effect is controlled by the
// aprioriWeight parameter. If that parameter isn't set to an extreme
// and there are a few known connections, there shouldn't be much of a
// difference. The largest difference occurs when aprioriWeight is 1. In
// that case, a node-level failure would not be applied to untried
// channels.
if i.nodeFailure != nil {
log.Debugf("Reporting node failure to Mission Control: "+
"node=%v", *i.nodeFailure)

m.lastNodeFailure[*i.nodeFailure] = result.timeReply
m.setAllFail(*i.nodeFailure, result.timeReply)
}

for pair, pairResult := range i.pairResults {
Expand All @@ -492,7 +468,7 @@ func (m *MissionControl) applyPaymentResult(
pair, pairResult.minPenalizeAmt)
}

m.setLastPairResult(pair.From, pair.To, &timedPairResult{
m.setLastPairResult(pair.From, pair.To, timedPairResult{
timestamp: result.timeReply,
pairResult: pairResult,
})
Expand Down
21 changes: 13 additions & 8 deletions routing/missioncontrol_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@ var (
mcTestNode2 = mcTestRoute.Hops[1].PubKeyBytes

testPenaltyHalfLife = 30 * time.Minute
testAprioriHopProbability = 0.8
testAprioriHopProbability = 0.9
testAprioriWeight = 0.5
)

type mcTestContext struct {
Expand Down Expand Up @@ -78,6 +79,7 @@ func (ctx *mcTestContext) restartMc() {
&MissionControlConfig{
PenaltyHalfLife: testPenaltyHalfLife,
AprioriHopProbability: testAprioriHopProbability,
AprioriWeight: testAprioriWeight,
},
)
if err != nil {
Expand Down Expand Up @@ -136,20 +138,23 @@ func TestMissionControl(t *testing.T) {

testTime := time.Date(2018, time.January, 9, 14, 00, 00, 0, time.UTC)

// Initial probability is expected to be 1.
ctx.expectP(1000, 0.8)
// Initial probability is expected to be the a priori.
ctx.expectP(1000, testAprioriHopProbability)

// Expect probability to be zero after reporting the edge as failed.
ctx.reportFailure(1000, lnwire.NewTemporaryChannelFailure(nil))
ctx.expectP(1000, 0)

// As we reported with a min penalization amt, a lower amt than reported
// should be unaffected.
// should return the node probability, which is the a priori
// probability.
ctx.expectP(500, testAprioriHopProbability)

// Edge decay started.
// Edge decay started. The node probability weighted average should now
// have shifted from 1:1 to 1:0.5 -> 60%. The connection probability is
// half way through the recovery, so we expect 30% here.
ctx.now = testTime.Add(30 * time.Minute)
ctx.expectP(1000, 0.4)
ctx.expectP(1000, 0.3)

// Edge fails again, this time without a min penalization amt. The edge
// should be penalized regardless of amount.
Expand All @@ -159,11 +164,11 @@ func TestMissionControl(t *testing.T) {

// Edge decay started.
ctx.now = testTime.Add(60 * time.Minute)
ctx.expectP(1000, 0.4)
ctx.expectP(1000, 0.3)

// Restart mission control to test persistence.
ctx.restartMc()
ctx.expectP(1000, 0.4)
ctx.expectP(1000, 0.3)

// A node level failure should bring probability of all known channels
// back to zero.
Expand Down
Loading

0 comments on commit 1fac41d

Please sign in to comment.