From a240606384b29094256dfbbd3f554367974502b2 Mon Sep 17 00:00:00 2001 From: Kyle Huntsman <3432646+kylehuntsman@users.noreply.github.com> Date: Thu, 6 Oct 2022 03:31:11 -0700 Subject: [PATCH] Add booster-bitswap request and response count metrics to dashboard (#838) * feat: add bitswap request and response count metrics to dashboard * add configurable metrics http port; move http metrics server to main away from bitswap server Co-authored-by: Anton Evangelatov --- .../remoteblockstore/remoteblockstore.go | 17 ++++++ cmd/booster-bitswap/run.go | 24 +++++++- docker/monitoring/prometheus.yaml | 3 + metrics/metrics.go | 58 +++++++++++++++++++ 4 files changed, 101 insertions(+), 1 deletion(-) diff --git a/cmd/booster-bitswap/remoteblockstore/remoteblockstore.go b/cmd/booster-bitswap/remoteblockstore/remoteblockstore.go index 35b83ef64..ee5a9e733 100644 --- a/cmd/booster-bitswap/remoteblockstore/remoteblockstore.go +++ b/cmd/booster-bitswap/remoteblockstore/remoteblockstore.go @@ -6,12 +6,14 @@ import ( "fmt" "strings" + "github.com/filecoin-project/boost/metrics" "github.com/filecoin-project/boost/tracing" blocks "github.com/ipfs/go-block-format" "github.com/ipfs/go-cid" blockstore "github.com/ipfs/go-ipfs-blockstore" format "github.com/ipfs/go-ipld-format" logging "github.com/ipfs/go-log/v2" + "go.opencensus.io/stats" "go.opentelemetry.io/otel/attribute" ) @@ -40,14 +42,17 @@ func (ro *RemoteBlockstore) Get(ctx context.Context, c cid.Cid) (b blocks.Block, ctx, span := tracing.Tracer.Start(ctx, "rbls.get") defer span.End() span.SetAttributes(attribute.String("cid", c.String())) + stats.Record(ctx, metrics.BitswapRblsGetRequestCount.M(1)) log.Debugw("Get", "cid", c) data, err := ro.api.BlockstoreGet(ctx, c) err = normalizeError(err) log.Debugw("Get response", "cid", c, "error", err) if err != nil { + stats.Record(ctx, metrics.BitswapRblsGetFailResponseCount.M(1)) return nil, err } + stats.Record(ctx, metrics.BitswapRblsGetSuccessResponseCount.M(1)) return blocks.NewBlockWithCid(data, c) } @@ -55,10 +60,16 @@ func (ro *RemoteBlockstore) Has(ctx context.Context, c cid.Cid) (bool, error) { ctx, span := tracing.Tracer.Start(ctx, "rbls.has") defer span.End() span.SetAttributes(attribute.String("cid", c.String())) + stats.Record(ctx, metrics.BitswapRblsHasRequestCount.M(1)) log.Debugw("Has", "cid", c) has, err := ro.api.BlockstoreHas(ctx, c) log.Debugw("Has response", "cid", c, "has", has, "error", err) + if err != nil { + stats.Record(ctx, metrics.BitswapRblsHasFailResponseCount.M(1)) + } else { + stats.Record(ctx, metrics.BitswapRblsHasSuccessResponseCount.M(1)) + } return has, err } @@ -66,11 +77,17 @@ func (ro *RemoteBlockstore) GetSize(ctx context.Context, c cid.Cid) (int, error) ctx, span := tracing.Tracer.Start(ctx, "rbls.get_size") defer span.End() span.SetAttributes(attribute.String("cid", c.String())) + stats.Record(ctx, metrics.BitswapRblsGetSizeRequestCount.M(1)) log.Debugw("GetSize", "cid", c) size, err := ro.api.BlockstoreGetSize(ctx, c) err = normalizeError(err) log.Debugw("GetSize response", "cid", c, "size", size, "error", err) + if err != nil { + stats.Record(ctx, metrics.BitswapRblsGetSizeFailResponseCount.M(1)) + } else { + stats.Record(ctx, metrics.BitswapRblsGetSizeSuccessResponseCount.M(1)) + } return size, err } diff --git a/cmd/booster-bitswap/run.go b/cmd/booster-bitswap/run.go index 97829bff0..f5635db02 100644 --- a/cmd/booster-bitswap/run.go +++ b/cmd/booster-bitswap/run.go @@ -12,6 +12,7 @@ import ( cliutil "github.com/filecoin-project/boost/cli/util" "github.com/filecoin-project/boost/cmd/booster-bitswap/blockfilter" "github.com/filecoin-project/boost/cmd/booster-bitswap/remoteblockstore" + "github.com/filecoin-project/boost/metrics" "github.com/filecoin-project/boost/tracing" "github.com/filecoin-project/go-jsonrpc" lcli "github.com/filecoin-project/lotus/cli" @@ -28,11 +29,21 @@ var runCmd = &cli.Command{ Name: "pprof", Usage: "run pprof web server on localhost:6070", }, + &cli.UintFlag{ + Name: "pprof-port", + Usage: "the http port to serve pprof on", + Value: 6070, + }, &cli.UintFlag{ Name: "port", Usage: "the port to listen for bitswap requests on", Value: 8888, }, + &cli.UintFlag{ + Name: "metrics-port", + Usage: "the http port to serve prometheus metrics on", + Value: 9696, + }, &cli.StringFlag{ Name: "api-boost", Usage: "the endpoint for the boost API", @@ -55,8 +66,9 @@ var runCmd = &cli.Command{ }, Action: func(cctx *cli.Context) error { if cctx.Bool("pprof") { + pprofPort := cctx.Int("pprof-port") go func() { - err := http.ListenAndServe("localhost:6070", nil) + err := http.ListenAndServe(fmt.Sprintf("localhost:%d", pprofPort), nil) if err != nil { log.Error(err) } @@ -119,6 +131,16 @@ var runCmd = &cli.Command{ return err } + // Start the metrics web server + metricsPort := cctx.Int("metrics-port") + log.Infof("Starting booster-bitswap metrics web server on port %d", metricsPort) + http.Handle("/metrics", metrics.Exporter("booster_bitswap")) // metrics server + go func() { + if err := http.ListenAndServe(fmt.Sprintf("0.0.0.0:%d", metricsPort), nil); err != nil { + log.Errorf("could not start prometheus metric exporter server: %s", err) + } + }() + // Monitor for shutdown. <-ctx.Done() diff --git a/docker/monitoring/prometheus.yaml b/docker/monitoring/prometheus.yaml index e28e33659..0ecc44ceb 100644 --- a/docker/monitoring/prometheus.yaml +++ b/docker/monitoring/prometheus.yaml @@ -15,6 +15,9 @@ scrape_configs: - job_name: 'booster-http' static_configs: - targets: [ 'booster-http:7777' ] + - job_name: 'booster-bitswap' + static_configs: + - targets: [ 'booster-bitswap:9696' ] - job_name: 'lotus-miner' metrics_path: "/debug/metrics" static_configs: diff --git a/metrics/metrics.go b/metrics/metrics.go index a9c5d85cf..882c4f1dc 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -128,6 +128,17 @@ var ( HttpPieceByCid400ResponseCount = stats.Int64("http/piece_by_cid_400_response_count", "Counter of /piece/ 400 responses", stats.UnitDimensionless) HttpPieceByCid404ResponseCount = stats.Int64("http/piece_by_cid_404_response_count", "Counter of /piece/ 404 responses", stats.UnitDimensionless) HttpPieceByCid500ResponseCount = stats.Int64("http/piece_by_cid_500_response_count", "Counter of /piece/ 500 responses", stats.UnitDimensionless) + + // bitswap + BitswapRblsGetRequestCount = stats.Int64("bitswap/rbls_get_request_count", "Counter of RemoteBlockstore Get requests", stats.UnitDimensionless) + BitswapRblsGetSuccessResponseCount = stats.Int64("bitswap/rbls_get_success_response_count", "Counter of successful RemoteBlockstore Get responses", stats.UnitDimensionless) + BitswapRblsGetFailResponseCount = stats.Int64("bitswap/rbls_get_fail_response_count", "Counter of failed RemoteBlockstore Get responses", stats.UnitDimensionless) + BitswapRblsGetSizeRequestCount = stats.Int64("bitswap/rbls_getsize_request_count", "Counter of RemoteBlockstore GetSize requests", stats.UnitDimensionless) + BitswapRblsGetSizeSuccessResponseCount = stats.Int64("bitswap/rbls_getsize_success_response_count", "Counter of successful RemoteBlockstore GetSize responses", stats.UnitDimensionless) + BitswapRblsGetSizeFailResponseCount = stats.Int64("bitswap/rbls_getsize_fail_response_count", "Counter of failed RemoteBlockstore GetSize responses", stats.UnitDimensionless) + BitswapRblsHasRequestCount = stats.Int64("bitswap/rbls_has_request_count", "Counter of RemoteBlockstore Has requests", stats.UnitDimensionless) + BitswapRblsHasSuccessResponseCount = stats.Int64("bitswap/rbls_has_success_response_count", "Counter of successful RemoteBlockstore Has responses", stats.UnitDimensionless) + BitswapRblsHasFailResponseCount = stats.Int64("bitswap/rbls_has_fail_response_count", "Counter of failed RemoteBlockstore Has responses", stats.UnitDimensionless) ) var ( @@ -181,6 +192,44 @@ var ( Aggregation: view.Count(), } + // bitswap + BitswapRblsGetRequestCountView = &view.View{ + Measure: BitswapRblsGetRequestCount, + Aggregation: view.Count(), + } + BitswapRblsGetSuccessResponseCountView = &view.View{ + Measure: BitswapRblsGetSuccessResponseCount, + Aggregation: view.Count(), + } + BitswapRblsGetFailResponseCountView = &view.View{ + Measure: BitswapRblsGetFailResponseCount, + Aggregation: view.Count(), + } + BitswapRblsGetSizeRequestCountView = &view.View{ + Measure: BitswapRblsGetSizeRequestCount, + Aggregation: view.Count(), + } + BitswapRblsGetSizeSuccessResponseCountView = &view.View{ + Measure: BitswapRblsGetSizeSuccessResponseCount, + Aggregation: view.Count(), + } + BitswapRblsGetSizeFailResponseCountView = &view.View{ + Measure: BitswapRblsGetSizeFailResponseCount, + Aggregation: view.Count(), + } + BitswapRblsHasRequestCountView = &view.View{ + Measure: BitswapRblsHasRequestCount, + Aggregation: view.Count(), + } + BitswapRblsHasSuccessResponseCountView = &view.View{ + Measure: BitswapRblsHasSuccessResponseCount, + Aggregation: view.Count(), + } + BitswapRblsHasFailResponseCountView = &view.View{ + Measure: BitswapRblsHasFailResponseCount, + Aggregation: view.Count(), + } + InfoView = &view.View{ Name: "info", Description: "Lotus node information", @@ -463,6 +512,15 @@ var DefaultViews = func() []*view.View { HttpPieceByCid400ResponseCountView, HttpPieceByCid404ResponseCountView, HttpPieceByCid500ResponseCountView, + BitswapRblsGetRequestCountView, + BitswapRblsGetSuccessResponseCountView, + BitswapRblsGetFailResponseCountView, + BitswapRblsGetSizeRequestCountView, + BitswapRblsGetSizeSuccessResponseCountView, + BitswapRblsGetSizeFailResponseCountView, + BitswapRblsHasRequestCountView, + BitswapRblsHasSuccessResponseCountView, + BitswapRblsHasFailResponseCountView, lotusmetrics.DagStorePRBytesDiscardedView, lotusmetrics.DagStorePRBytesRequestedView, lotusmetrics.DagStorePRDiscardCountView,