Skip to content

Commit

Permalink
collectors: add failure reasons vector to htlcmonitor
Browse files Browse the repository at this point in the history
  • Loading branch information
carlaKC committed Nov 17, 2020
1 parent 7bda61d commit 939973e
Show file tree
Hide file tree
Showing 2 changed files with 131 additions and 10 deletions.
50 changes: 40 additions & 10 deletions collectors/htlcs_collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"errors"
"fmt"
"strconv"
"strings"
"sync"
"time"

Expand All @@ -28,6 +29,15 @@ const (

chanInLabel = "chan_in"
chanOutLabel = "chan_out"

// failureReasonLabel is the variable label we use for failure reasons
// for forwards.
failureReasonLabel = "failure_reason"

// failureReasonExternal is a special value for the failureReason
// that we use when a forward is failed back to us and we do not know
// the exact reason for failure.
failureReasonExternal = "failed_back"
)

// htlcLabels is the set of labels we use to label htlc events.
Expand Down Expand Up @@ -74,7 +84,7 @@ func newHtlcMonitor(router lndclient.RouterClient,
Subsystem: "htlcs",
Name: "resolved_htlcs",
Help: "count of resolved htlcs",
}, htlcLabels),
}, append(htlcLabels, failureReasonLabel)),
resolutionTimeHistogram: prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Namespace: "lnd",
Expand Down Expand Up @@ -192,7 +202,7 @@ func (h *htlcMonitor) processHtlcEvent(event *routerrpc.HtlcEvent) error {

ts := time.Unix(0, int64(event.TimestampNs))

switch event.Event.(type) {
switch e := event.Event.(type) {
// If we have received a forwarding event, we add it to our map if it
// is not already present. We are ok with duplicate events, because
// htlcs are sometimes replayed by the switch, but we want to keep our
Expand All @@ -207,19 +217,23 @@ func (h *htlcMonitor) processHtlcEvent(event *routerrpc.HtlcEvent) error {
h.activeHtlcs[key] = ts

case *routerrpc.HtlcEvent_SettleEvent:
err := h.recordResolution(key, event.EventType, ts, true)
err := h.recordResolution(key, event.EventType, ts, "")
if err != nil {
return err
}

case *routerrpc.HtlcEvent_ForwardFailEvent:
err := h.recordResolution(key, event.EventType, ts, false)
err := h.recordResolution(
key, event.EventType, ts, failureReasonExternal,
)
if err != nil {
return err
}

case *routerrpc.HtlcEvent_LinkFailEvent:
err := h.recordResolution(key, event.EventType, ts, false)
err := h.recordResolution(
key, event.EventType, ts, e.LinkFailEvent.FailureString,
)
if err != nil {
return err
}
Expand All @@ -232,12 +246,15 @@ func (h *htlcMonitor) processHtlcEvent(event *routerrpc.HtlcEvent) error {
}

// recordResolution records the outcome of a htlc resolution (settle/fail) in
// our metrics.
// our metrics. The failure reason string should be empty for all successful
// forwards, and populated for all failures.
func (h *htlcMonitor) recordResolution(key htlcswitch.HtlcKey,
eventType routerrpc.HtlcEvent_EventType, ts time.Time,
success bool) error {
failureReason string) error {

// Create the set of labels we want to track this resolution.
// Create the set of labels we want to track this resolution. Replace
// spaces in our failure reason with underscores and lowercase it so
// that it can be used as a prometheus label.
labels := map[string]string{
outcomeLabel: outcomeFailedValue,
chanInLabel: strconv.FormatUint(
Expand All @@ -246,8 +263,11 @@ func (h *htlcMonitor) recordResolution(key htlcswitch.HtlcKey,
chanOutLabel: strconv.FormatUint(
key.OutgoingCircuit.ChanID.ToUint64(), 10,
),
failureReasonLabel: strings.ToLower(strings.ReplaceAll(
failureReason, " ", "_",
)),
}
if success {
if failureReason == "" {
labels[outcomeLabel] = outcomeSettledValue
}

Expand Down Expand Up @@ -285,8 +305,18 @@ func (h *htlcMonitor) recordResolution(key htlcswitch.HtlcKey,
return nil
}

// We make a copy of our labels rather than delete the unneeded failure
// reason label so that we don't run into any unexpected behaviour from
// map references.
histogramLabels := make(map[string]string, len(htlcLabels))
for _, label := range htlcLabels {
histogramLabels[label] = labels[label]
}

// Add the amount of time the htlc took to resolve to our histogram.
h.resolutionTimeHistogram.With(labels).Observe(ts.Sub(fwdTs).Seconds())
h.resolutionTimeHistogram.With(
histogramLabels,
).Observe(ts.Sub(fwdTs).Seconds())

// Delete the htlc from our set of active htlcs.
delete(h.activeHtlcs, key)
Expand Down
91 changes: 91 additions & 0 deletions grafana/provisioning/dashboards/routing.json
Original file line number Diff line number Diff line change
Expand Up @@ -808,6 +808,97 @@
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": true,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"description": "The reasons provided for htlcs involved in ${HtlcType}s on lnd's channels failing. Note that \"failed_back\" indicates that the payment failed further down the route, and all other failure reasons indicate that your node failed the htlc.",
"fill": 1,
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 32
},
"id": 15,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": false,
"total": false,
"values": false
},
"lines": false,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"options": {},
"percentage": false,
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum by (failure_reason)(lnd_htlcs_resolved_htlcs{outcome=\"failed\",type=\"$HtlcType\"})",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{failure_reason}}",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Failure Reasons",
"tooltip": {
"shared": false,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "series",
"name": null,
"show": true,
"values": [
"max"
]
},
"yaxes": [
{
"decimals": 0,
"format": "short",
"label": "HTLC Count",
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"decimals": 0,
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": "0",
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
}
],
"refresh": false,
Expand Down

0 comments on commit 939973e

Please sign in to comment.