collectors: don't shut down on timeout on GetInfo RPC call #110

Merged · 2 commits · Aug 7, 2024
16 changes: 14 additions & 2 deletions collectors/chain_collector.go
@@ -70,8 +70,20 @@ func (c *ChainCollector) Describe(ch chan<- *prometheus.Desc) {
 func (c *ChainCollector) Collect(ch chan<- prometheus.Metric) {
 	resp, err := c.lnd.GetInfo(context.Background())
 	if err != nil {
-		c.errChan <- fmt.Errorf("ChainCollector GetInfo failed with: "+
-			"%v", err)
+		errWithContext := fmt.Errorf("ChainCollector GetInfo "+
+			"failed with: %w", err)
+		Logger.Error(errWithContext)
+
+		// If this isn't just a timeout, we'll want to exit to give the
+		// runtime (Docker/k8s/systemd) a chance to restart us, in case
+		// something with the lnd connection and/or credentials has
+		// changed. We just do this check for the GetInfo call, since
+		// that's known to sometimes randomly take way longer than on
+		// average (database interactions?).
+		if !IsDeadlineExceeded(err) {
+			c.errChan <- errWithContext
+		}
+
 		return
 	}
 
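The identical log-and-filter block appears in both ChainCollector and ChannelsCollector. Purely as an illustration of the pattern (this helper is hypothetical and not part of the PR), it could be factored into a small function in the collectors package, assuming the package-level Logger and the new IsDeadlineExceeded helper:

// reportGetInfoError is a hypothetical helper (not in this PR) that mirrors
// the inline logic above: log the wrapped error, and only forward it for
// shutdown if it isn't a deadline-exceeded timeout.
func reportGetInfoError(errChan chan<- error, collector string, err error) {
	errWithContext := fmt.Errorf("%s GetInfo failed with: %w",
		collector, err)
	Logger.Error(errWithContext)

	// Timeouts on GetInfo are tolerated; any other error is forwarded so
	// the exporter shuts down and its runtime can restart it.
	if !IsDeadlineExceeded(err) {
		errChan <- errWithContext
	}
}

With such a helper, the error branch in each Collect method would reduce to a single call like reportGetInfoError(c.errChan, "ChainCollector", err).
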
16 changes: 14 additions & 2 deletions collectors/channels_collector.go
@@ -311,8 +311,20 @@ func (c *ChannelsCollector) Collect(ch chan<- prometheus.Metric) {
 	// have open.
 	getInfoResp, err := c.lnd.GetInfo(context.Background())
 	if err != nil {
-		c.errChan <- fmt.Errorf("ChannelsCollector GetInfo failed "+
-			"with: %v", err)
+		errWithContext := fmt.Errorf("ChannelsCollector GetInfo "+
+			"failed with: %w", err)
+		Logger.Error(errWithContext)
+
+		// If this isn't just a timeout, we'll want to exit to give the
+		// runtime (Docker/k8s/systemd) a chance to restart us, in case
+		// something with the lnd connection and/or credentials has
+		// changed. We just do this check for the GetInfo call, since
+		// that's known to sometimes randomly take way longer than on
+		// average (database interactions?).
+		if !IsDeadlineExceeded(err) {
+			c.errChan <- errWithContext
+		}
+
 		return
 	}
 
39 changes: 39 additions & 0 deletions collectors/errors.go
@@ -0,0 +1,39 @@
+package collectors
+
+import (
+	"context"
+	"strings"
+
+	"google.golang.org/grpc/codes"
+	"google.golang.org/grpc/status"
+)
+
+var (
+	// errRPCDeadlineExceeded is the error that is sent over the gRPC
+	// interface when it's coming from the server side. The
+	// status.FromContextError() function won't recognize it correctly
+	// since the error sent over the wire is a string and not a structured
+	// error anymore.
+	errRPCDeadlineExceeded = status.Error(
+		codes.DeadlineExceeded, context.DeadlineExceeded.Error(),
+	)
+)
+
+// IsDeadlineExceeded returns true if the passed error is a gRPC error with the
+// context.DeadlineExceeded error as the cause.
+func IsDeadlineExceeded(err error) bool {
+	if err == nil {
+		return false
+	}
+
+	st := status.FromContextError(err)
+	if st.Code() == codes.DeadlineExceeded {
+		return true
+	}
+
+	if strings.Contains(err.Error(), errRPCDeadlineExceeded.Error()) {
+		return true
+	}
+
+	return false
+}
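For illustration only (not part of the PR), a table-driven test along the following lines would exercise both detection paths of IsDeadlineExceeded. It assumes a hypothetical errors_test.go placed next to errors.go in the collectors package:

package collectors

import (
	"context"
	"errors"
	"testing"

	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
)

// TestIsDeadlineExceeded checks the structured client-side context error as
// well as the string-only form that comes back from the server over the wire.
func TestIsDeadlineExceeded(t *testing.T) {
	cases := []struct {
		name string
		err  error
		want bool
	}{
		{"nil error", nil, false},
		{"unrelated error", errors.New("connection refused"), false},
		{"client-side context timeout", context.DeadlineExceeded, true},
		{
			"server-side gRPC status string",
			status.Error(
				codes.DeadlineExceeded,
				context.DeadlineExceeded.Error(),
			),
			true,
		},
	}

	for _, tc := range cases {
		if got := IsDeadlineExceeded(tc.err); got != tc.want {
			t.Errorf("%s: got %v, want %v", tc.name, got, tc.want)
		}
	}
}
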
28 changes: 18 additions & 10 deletions collectors/prometheus.go
@@ -72,7 +72,8 @@ type MonitoringConfig struct {
 	// DisableHtlc disables collection of HTLCs metrics
 	DisableHtlc bool
 
-	// ProgramStartTime stores a best-effort estimate of when lnd/lndmon was started.
+	// ProgramStartTime stores a best-effort estimate of when lnd/lndmon was
+	// started.
 	ProgramStartTime time.Time
 }
 
@@ -88,13 +89,14 @@ func DefaultConfig() *PrometheusConfig {
 // NewPrometheusExporter makes a new instance of the PrometheusExporter given
 // the address to listen for Prometheus on and an lnd gRPC client.
 func NewPrometheusExporter(cfg *PrometheusConfig, lnd *lndclient.LndServices,
-	monitoringCfg *MonitoringConfig, quitChan chan struct{}) *PrometheusExporter {
+	monitoringCfg *MonitoringConfig,
+	quitChan chan struct{}) *PrometheusExporter {
 
 	// We have six collectors and a htlc monitor running, so we buffer our
-	// error channel by 7 so that we do not need to consume all errors from
+	// error channel by 8 so that we do not need to consume all errors from
 	// this channel (on the first one, we'll start shutting down, but a few
 	// could arrive quickly in the case where lnd is shutting down).
-	errChan := make(chan error, 7)
+	errChan := make(chan error, 8)
 
 	htlcMonitor := newHtlcMonitor(lnd.Router, errChan)
 
@@ -116,7 +118,9 @@ func NewPrometheusExporter(cfg *PrometheusConfig, lnd *lndclient.LndServices,
 	}
 
 	if !monitoringCfg.DisableGraph {
-		collectors = append(collectors, NewGraphCollector(lnd.Client, errChan))
+		collectors = append(
+			collectors, NewGraphCollector(lnd.Client, errChan),
+		)
 	}
 
 	return &PrometheusExporter{
@@ -165,15 +169,19 @@ func (p *PrometheusExporter) Start() error {
 	// scape our metrics.
 	go func() {
 		errorLogger := log.New(
-			os.Stdout, "promhttp", log.Ldate|log.Ltime|log.Lshortfile,
+			os.Stdout, "promhttp",
+			log.Ldate|log.Ltime|log.Lshortfile,
 		)
 
 		promHandler := promhttp.InstrumentMetricHandler(
 			prometheus.DefaultRegisterer,
-			promhttp.HandlerFor(prometheus.DefaultGatherer, promhttp.HandlerOpts{
-				ErrorLog:      errorLogger,
-				ErrorHandling: promhttp.ContinueOnError,
-			}),
+			promhttp.HandlerFor(
+				prometheus.DefaultGatherer,
+				promhttp.HandlerOpts{
+					ErrorLog:      errorLogger,
+					ErrorHandling: promhttp.ContinueOnError,
+				},
+			),
 		)
 		http.Handle("/metrics", promHandler)
 		Logger.Info(http.ListenAndServe(p.cfg.ListenAddr, nil))
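The buffer-size comment in NewPrometheusExporter above is the key to the shutdown behaviour: only the first error is acted on, so the channel must have room for every collector to send without blocking. A minimal standalone sketch of that property (not from the lndmon codebase):

package main

import (
	"errors"
	"fmt"
)

func main() {
	// Buffer sized to the number of potential senders, as in the PR.
	errChan := make(chan error, 8)

	// Several collectors fail around the same time; none of them block,
	// because the channel has room for every sender.
	for i := 0; i < 3; i++ {
		errChan <- errors.New("collector failed")
	}

	// The consumer only needs the first error to decide to shut down; the
	// remaining errors are simply left in the buffer.
	fmt.Println("shutting down after:", <-errChan)
}
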