diff --git a/collectors/chain_collector.go b/collectors/chain_collector.go
index 8093207..d853913 100644
--- a/collectors/chain_collector.go
+++ b/collectors/chain_collector.go
@@ -70,8 +70,20 @@ func (c *ChainCollector) Describe(ch chan<- *prometheus.Desc) {
 func (c *ChainCollector) Collect(ch chan<- prometheus.Metric) {
 	resp, err := c.lnd.GetInfo(context.Background())
 	if err != nil {
-		c.errChan <- fmt.Errorf("ChainCollector GetInfo failed with: "+
-			"%v", err)
+		errWithContext := fmt.Errorf("ChainCollector GetInfo "+
+			"failed with: %w", err)
+		Logger.Error(errWithContext)
+
+		// If this isn't just a timeout, we'll want to exit to give the
+		// runtime (Docker/k8s/systemd) a chance to restart us, in case
+		// something with the lnd connection and/or credentials has
+		// changed. We just do this check for the GetInfo call, since
+		// that's known to sometimes randomly take way longer than on
+		// average (database interactions?).
+		if !IsDeadlineExceeded(err) {
+			c.errChan <- errWithContext
+		}
+
 		return
 	}
 
diff --git a/collectors/channels_collector.go b/collectors/channels_collector.go
index 138374a..f2ad2ea 100644
--- a/collectors/channels_collector.go
+++ b/collectors/channels_collector.go
@@ -311,8 +311,20 @@ func (c *ChannelsCollector) Collect(ch chan<- prometheus.Metric) {
 	// have open.
 	getInfoResp, err := c.lnd.GetInfo(context.Background())
 	if err != nil {
-		c.errChan <- fmt.Errorf("ChannelsCollector GetInfo failed "+
-			"with: %v", err)
+		errWithContext := fmt.Errorf("ChannelsCollector GetInfo "+
+			"failed with: %w", err)
+		Logger.Error(errWithContext)
+
+		// If this isn't just a timeout, we'll want to exit to give the
+		// runtime (Docker/k8s/systemd) a chance to restart us, in case
+		// something with the lnd connection and/or credentials has
+		// changed. We just do this check for the GetInfo call, since
+		// that's known to sometimes randomly take way longer than on
+		// average (database interactions?).
+		if !IsDeadlineExceeded(err) {
+			c.errChan <- errWithContext
+		}
+
 		return
 	}
 
diff --git a/collectors/errors.go b/collectors/errors.go
new file mode 100644
index 0000000..55cde5b
--- /dev/null
+++ b/collectors/errors.go
@@ -0,0 +1,39 @@
+package collectors
+
+import (
+	"context"
+	"strings"
+
+	"google.golang.org/grpc/codes"
+	"google.golang.org/grpc/status"
+)
+
+var (
+	// errRPCDeadlineExceeded is the error that is sent over the gRPC
+	// interface when it's coming from the server side. The
+	// status.FromContextError() function won't recognize it correctly
+	// since the error sent over the wire is a string and not a structured
+	// error anymore.
+	errRPCDeadlineExceeded = status.Error(
+		codes.DeadlineExceeded, context.DeadlineExceeded.Error(),
+	)
+)
+
+// IsDeadlineExceeded returns true if the passed error is a gRPC error with the
+// context.DeadlineExceeded error as the cause.
+func IsDeadlineExceeded(err error) bool {
+	if err == nil {
+		return false
+	}
+
+	st := status.FromContextError(err)
+	if st.Code() == codes.DeadlineExceeded {
+		return true
+	}
+
+	if strings.Contains(err.Error(), errRPCDeadlineExceeded.Error()) {
+		return true
+	}
+
+	return false
+}
diff --git a/collectors/prometheus.go b/collectors/prometheus.go
index 9781404..c4b61de 100644
--- a/collectors/prometheus.go
+++ b/collectors/prometheus.go
@@ -72,7 +72,8 @@ type MonitoringConfig struct {
 	// DisableHtlc disables collection of HTLCs metrics
 	DisableHtlc bool
 
-	// ProgramStartTime stores a best-effort estimate of when lnd/lndmon was started.
+	// ProgramStartTime stores a best-effort estimate of when lnd/lndmon was
+	// started.
 	ProgramStartTime time.Time
 }
 
@@ -88,13 +89,14 @@ func DefaultConfig() *PrometheusConfig {
 // NewPrometheusExporter makes a new instance of the PrometheusExporter given
 // the address to listen for Prometheus on and an lnd gRPC client.
 func NewPrometheusExporter(cfg *PrometheusConfig, lnd *lndclient.LndServices,
-	monitoringCfg *MonitoringConfig, quitChan chan struct{}) *PrometheusExporter {
+	monitoringCfg *MonitoringConfig,
+	quitChan chan struct{}) *PrometheusExporter {
 
 	// We have six collectors and a htlc monitor running, so we buffer our
-	// error channel by 7 so that we do not need to consume all errors from
+	// error channel by 8 so that we do not need to consume all errors from
 	// this channel (on the first one, we'll start shutting down, but a few
 	// could arrive quickly in the case where lnd is shutting down).
-	errChan := make(chan error, 7)
+	errChan := make(chan error, 8)
 
 	htlcMonitor := newHtlcMonitor(lnd.Router, errChan)
 
@@ -116,7 +118,9 @@ func NewPrometheusExporter(cfg *PrometheusConfig, lnd *lndclient.LndServices,
 	}
 
 	if !monitoringCfg.DisableGraph {
-		collectors = append(collectors, NewGraphCollector(lnd.Client, errChan))
+		collectors = append(
+			collectors, NewGraphCollector(lnd.Client, errChan),
+		)
 	}
 
 	return &PrometheusExporter{
@@ -165,15 +169,19 @@ func (p *PrometheusExporter) Start() error {
 	// scape our metrics.
 	go func() {
 		errorLogger := log.New(
-			os.Stdout, "promhttp", log.Ldate|log.Ltime|log.Lshortfile,
+			os.Stdout, "promhttp",
+			log.Ldate|log.Ltime|log.Lshortfile,
 		)
 
 		promHandler := promhttp.InstrumentMetricHandler(
 			prometheus.DefaultRegisterer,
-			promhttp.HandlerFor(prometheus.DefaultGatherer, promhttp.HandlerOpts{
-				ErrorLog:      errorLogger,
-				ErrorHandling: promhttp.ContinueOnError,
-			}),
+			promhttp.HandlerFor(
+				prometheus.DefaultGatherer,
+				promhttp.HandlerOpts{
+					ErrorLog:      errorLogger,
+					ErrorHandling: promhttp.ContinueOnError,
+				},
+			),
 		)
 		http.Handle("/metrics", promHandler)
 		Logger.Info(http.ListenAndServe(p.cfg.ListenAddr, nil))
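
For reviewers unfamiliar with the gRPC quirk that the new collectors/errors.go works around, here is a minimal standalone sketch (not part of the change set; the simulated "remote" error string and the demo wiring are illustrative assumptions) of why IsDeadlineExceeded needs both checks: a client-side timeout still wraps context.DeadlineExceeded as a structured error, while a server-side timeout crosses the wire only as a flat status string.

package main

import (
	"context"
	"errors"
	"fmt"
	"strings"
	"time"

	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
)

// isDeadlineExceeded mirrors the two-step check from collectors/errors.go.
func isDeadlineExceeded(err error) bool {
	if err == nil {
		return false
	}

	// Client-side timeout: the error still wraps context.DeadlineExceeded,
	// so status.FromContextError maps it to codes.DeadlineExceeded.
	if status.FromContextError(err).Code() == codes.DeadlineExceeded {
		return true
	}

	// Server-side timeout: only the rendered status string survives, so
	// fall back to a substring match on the canonical message.
	rpcDeadline := status.Error(
		codes.DeadlineExceeded, context.DeadlineExceeded.Error(),
	)
	return strings.Contains(err.Error(), rpcDeadline.Error())
}

func main() {
	// Client-side: a local context that has already expired.
	ctx, cancel := context.WithTimeout(context.Background(), time.Nanosecond)
	defer cancel()
	<-ctx.Done()
	fmt.Println(isDeadlineExceeded(ctx.Err())) // true

	// Server-side: the structured error was flattened into a string by the
	// remote end (simulated here).
	remote := errors.New(
		"rpc error: code = DeadlineExceeded desc = context deadline exceeded",
	)
	fmt.Println(isDeadlineExceeded(remote)) // true

	// Anything else should still be sent to errChan and shut lndmon down.
	fmt.Println(isDeadlineExceeded(errors.New("connection refused"))) // false
}

Note that the fallback comparison string is built from status.Error(codes.DeadlineExceeded, context.DeadlineExceeded.Error()) rather than a hard-coded literal, so it tracks however the gRPC library renders that status.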