
Add metrics to Elastic Package Registry #827

Merged
merged 31 commits from add_metrics_prometheus into elastic:main on Jul 7, 2022

Conversation

@mrodm (Contributor) commented Jul 1, 2022

Relates #797

Add metrics to the Elastic Package Registry. These metrics are sent to the Elastic Stack through the APM agent, and they are also exposed by the application itself at the "/metrics" endpoint.

These metrics are instrumented using the Prometheus client. The metrics exposed in this PR are:

  • epr_service_info (Gauge)
  • epr_in_flight_requests (Gauge)
  • epr_number_indexed_packages (Gauge)
  • epr_http_requests_total (Counter)
  • epr_http_request_duration_seconds_{bucket,sum,count} (Histogram)
  • epr_http_request_size_bytes_{bucket,sum,count} (Histogram)
  • epr_http_response_size_bytes_{bucket,sum,count} (Histogram)
  • epr_storage_indexer_updates_index_success_total (Counter)
  • epr_storage_indexer_updates_index_error_total (Counter)
  • epr_storage_requests_total (Counter)
  • epr_storage_indexer_update_index_duration_seconds_{bucket,sum,count} (Histogram)
  • epr_storage_indexer_get_duration_seconds_{bucket,sum,count} (Histogram)

Some examples of the metrics that will be exposed (bucket boundaries still to be reviewed):

# HELP epr_service_info Version information about this binary
# TYPE epr_service_info gauge
epr_service_info{instance="ec4bd25798d7",version="1.9.1"} 1

# HELP epr_in_flight_requests A gauge of requests currently being served by the http server.
# TYPE epr_in_flight_requests gauge
epr_in_flight_requests 0

# HELP epr_http_requests_total A counter for requests to the http server.
# TYPE epr_http_requests_total counter
epr_http_requests_total{code="200",method="get",path="/categories"} 10
epr_http_requests_total{code="200",method="get",path="/epr/{packageName}/{packageName:[a-z0-9_]+}-{packageVersion}.zip"} 30
epr_http_requests_total{code="200",method="get",path="/package/{packageName:[a-z0-9_]+}/{packageVersion}/"} 30
epr_http_requests_total{code="200",method="get",path="/search"} 40
epr_http_requests_total{code="301",method="get",path="/package/{packageName:[a-z0-9_]+}/{packageVersion}/"} 30
epr_http_requests_total{code="400",method="get",path="/package/{packageName:[a-z0-9_]+}/{packageVersion}/"} 1
epr_http_requests_total{code="404",method="get",path="/package/{packageName:[a-z0-9_]+}/{packageVersion}/"} 2


# HELP epr_http_response_size_bytes A histogram of response sizes for requests to the http server.
# TYPE epr_http_response_size_bytes histogram
epr_http_response_size_bytes_bucket{code="200",method="get",path="/search",le="16"} 0
epr_http_response_size_bytes_bucket{code="200",method="get",path="/search",le="32"} 0
epr_http_response_size_bytes_bucket{code="200",method="get",path="/search",le="64"} 0
epr_http_response_size_bytes_bucket{code="200",method="get",path="/search",le="128"} 0
epr_http_response_size_bytes_bucket{code="200",method="get",path="/search",le="256"} 0
epr_http_response_size_bytes_bucket{code="200",method="get",path="/search",le="512"} 0
epr_http_response_size_bytes_bucket{code="200",method="get",path="/search",le="1024"} 21
epr_http_response_size_bytes_bucket{code="200",method="get",path="/search",le="65536"} 31
epr_http_response_size_bytes_bucket{code="200",method="get",path="/search",le="262144"} 63
epr_http_response_size_bytes_bucket{code="200",method="get",path="/search",le="+Inf"} 63
epr_http_response_size_bytes_sum{code="200",method="get",path="/search"} 4.192611e+06
epr_http_response_size_bytes_count{code="200",method="get",path="/search"} 63

# HELP epr_http_request_duration_seconds A histogram of latencies for requests to the http server.
# TYPE epr_http_request_duration_seconds histogram
epr_http_request_duration_seconds_bucket{code="200",method="get",path="/search",le="0.005"} 57
epr_http_request_duration_seconds_bucket{code="200",method="get",path="/search",le="0.01"} 63
epr_http_request_duration_seconds_bucket{code="200",method="get",path="/search",le="0.025"} 63
epr_http_request_duration_seconds_bucket{code="200",method="get",path="/search",le="0.05"} 63
epr_http_request_duration_seconds_bucket{code="200",method="get",path="/search",le="0.1"} 63
epr_http_request_duration_seconds_bucket{code="200",method="get",path="/search",le="0.25"} 63
epr_http_request_duration_seconds_bucket{code="200",method="get",path="/search",le="0.5"} 63
epr_http_request_duration_seconds_bucket{code="200",method="get",path="/search",le="1"} 63
epr_http_request_duration_seconds_bucket{code="200",method="get",path="/search",le="2.5"} 63
epr_http_request_duration_seconds_bucket{code="200",method="get",path="/search",le="5"} 63
epr_http_request_duration_seconds_bucket{code="200",method="get",path="/search",le="10"} 63
epr_http_request_duration_seconds_bucket{code="200",method="get",path="/search",le="+Inf"} 63
epr_http_request_duration_seconds_sum{code="200",method="get",path="/search"} 0.13667412999999992
epr_http_request_duration_seconds_count{code="200",method="get",path="/search"} 63

# HELP epr_http_request_size_bytes A histogram of sizes of requests to the http server.
# TYPE epr_http_request_size_bytes histogram
epr_http_request_size_bytes_bucket{code="200",method="get",path="/search",le="16"} 0
epr_http_request_size_bytes_bucket{code="200",method="get",path="/search",le="32"} 0
epr_http_request_size_bytes_bucket{code="200",method="get",path="/search",le="64"} 31
epr_http_request_size_bytes_bucket{code="200",method="get",path="/search",le="128"} 63
epr_http_request_size_bytes_bucket{code="200",method="get",path="/search",le="256"} 63
epr_http_request_size_bytes_bucket{code="200",method="get",path="/search",le="512"} 63
epr_http_request_size_bytes_bucket{code="200",method="get",path="/search",le="1024"} 63
epr_http_request_size_bytes_bucket{code="200",method="get",path="/search",le="65536"} 63
epr_http_request_size_bytes_bucket{code="200",method="get",path="/search",le="262144"} 63
epr_http_request_size_bytes_bucket{code="200",method="get",path="/search",le="+Inf"} 63
epr_http_request_size_bytes_sum{code="200",method="get",path="/search"} 4348
epr_http_request_size_bytes_count{code="200",method="get",path="/search"} 63

@mrodm mrodm force-pushed the add_metrics_prometheus branch from 31cbca6 to 941d546 on July 1, 2022 21:36
@elasticmachine commented Jul 1, 2022

💚 Build Succeeded


Build stats

  • Start Time: 2022-07-07T08:05:30.580+0000

  • Duration: 6 min 25 sec

Test stats 🧪

Test Results: Failed 0, Passed 207, Skipped 0, Total 207

🤖 GitHub comments

To re-run your PR in the CI, just comment with:

  • /test : Re-trigger the build.

@mrodm mrodm force-pushed the add_metrics_prometheus branch 2 times, most recently from 7d8206b to 840b435 on July 1, 2022 22:15
@jlind23 jlind23 added the Team:Ecosystem Label for the Packages Ecosystem team label Jul 4, 2022
Add metrics for in-flight requests, response size, request size, and
duration for every request, labeled with code, method, and path.
The application uptime can be obtained from the
"process_start_time_seconds" metric.
@mrodm mrodm force-pushed the add_metrics_prometheus branch from 840b435 to 352170c on July 4, 2022 14:02
@mrodm (Contributor, Author) commented Jul 4, 2022

A new middleware has been added to collect these metrics for every request, and they are exposed at /metrics following the Prometheus exposition format (see the sketch after the list below).

These first metrics are:

  • Total requests.
  • In-flight requests.
  • Request size histogram (bytes).
  • Response size histogram (bytes).
  • Request duration histogram (seconds).
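
For illustration, this is roughly how such a middleware can be wired up with the Prometheus client and gorilla/mux. It is a minimal, self-contained sketch, not the code in this PR; the "/search" route and its handler are placeholders:

package main

import (
	"net/http"

	"github.com/gorilla/mux"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

var (
	inFlightRequests = prometheus.NewGauge(prometheus.GaugeOpts{
		Name: "epr_in_flight_requests",
		Help: "A gauge of requests currently being served by the http server.",
	})
	httpRequestsTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
		Name: "epr_http_requests_total",
		Help: "A counter for requests to the http server.",
	}, []string{"code", "method", "path"})
	httpRequestDurationSeconds = prometheus.NewHistogramVec(prometheus.HistogramOpts{
		Name:    "epr_http_request_duration_seconds",
		Help:    "A histogram of latencies for requests to the http server.",
		Buckets: prometheus.DefBuckets,
	}, []string{"code", "method", "path"})
)

// instrument wraps a handler with the metrics above, currying the route path
// template into the "path" label so promhttp only fills code and method.
func instrument(path string, next http.Handler) http.Handler {
	labels := prometheus.Labels{"path": path}
	handler := promhttp.InstrumentHandlerCounter(httpRequestsTotal.MustCurryWith(labels),
		promhttp.InstrumentHandlerDuration(httpRequestDurationSeconds.MustCurryWith(labels), next))
	return promhttp.InstrumentHandlerInFlight(inFlightRequests, handler)
}

func main() {
	prometheus.MustRegister(inFlightRequests, httpRequestsTotal, httpRequestDurationSeconds)

	router := mux.NewRouter()
	router.Handle("/search", instrument("/search", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.Write([]byte("[]")) // placeholder handler
	})))
	router.Handle("/metrics", promhttp.Handler())
	http.ListenAndServe(":8080", router)
}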

The Prometheus client also offers a set of default metrics. From these, at least the uptime can be derived from the "process_start_time_seconds" metric:

# HELP process_start_time_seconds Start time of the process since unix epoch in seconds.
# TYPE process_start_time_seconds gauge
process_start_time_seconds 1.65694381591e+09

All the metrics except "in flight requests" include the labels code, method, and path. That allows us to inspect the values in more detail, e.g. for specific endpoints.

As an example of these:

epr_http_request_duration_seconds_bucket{code="200",method="get",path="/epr/{packageName}/{packageName:[a-z0-9_]+}-{packageVersion}.zip",le="0.005"} 0
epr_http_request_duration_seconds_bucket{code="200",method="get",path="/epr/{packageName}/{packageName:[a-z0-9_]+}-{packageVersion}.zip",le="0.01"} 0
epr_http_request_duration_seconds_bucket{code="200",method="get",path="/epr/{packageName}/{packageName:[a-z0-9_]+}-{packageVersion}.zip",le="0.025"} 0
epr_http_request_duration_seconds_bucket{code="200",method="get",path="/epr/{packageName}/{packageName:[a-z0-9_]+}-{packageVersion}.zip",le="0.05"} 0
epr_http_request_duration_seconds_bucket{code="200",method="get",path="/epr/{packageName}/{packageName:[a-z0-9_]+}-{packageVersion}.zip",le="0.1"} 4
epr_http_request_duration_seconds_bucket{code="200",method="get",path="/epr/{packageName}/{packageName:[a-z0-9_]+}-{packageVersion}.zip",le="0.25"} 19
epr_http_request_duration_seconds_bucket{code="200",method="get",path="/epr/{packageName}/{packageName:[a-z0-9_]+}-{packageVersion}.zip",le="0.5"} 30
epr_http_request_duration_seconds_bucket{code="200",method="get",path="/epr/{packageName}/{packageName:[a-z0-9_]+}-{packageVersion}.zip",le="1"} 30
epr_http_request_duration_seconds_bucket{code="200",method="get",path="/epr/{packageName}/{packageName:[a-z0-9_]+}-{packageVersion}.zip",le="2.5"} 30
epr_http_request_duration_seconds_bucket{code="200",method="get",path="/epr/{packageName}/{packageName:[a-z0-9_]+}-{packageVersion}.zip",le="5"} 30
epr_http_request_duration_seconds_bucket{code="200",method="get",path="/epr/{packageName}/{packageName:[a-z0-9_]+}-{packageVersion}.zip",le="10"} 30
epr_http_request_duration_seconds_bucket{code="200",method="get",path="/epr/{packageName}/{packageName:[a-z0-9_]+}-{packageVersion}.zip",le="+Inf"} 30
epr_http_request_duration_seconds_sum{code="200",method="get",path="/epr/{packageName}/{packageName:[a-z0-9_]+}-{packageVersion}.zip"} 6.300512742
epr_http_request_duration_seconds_count{code="200",method="get",path="/epr/{packageName}/{packageName:[a-z0-9_]+}-{packageVersion}.zip"} 30

@elastic/ecosystem I was wondering if you could take a look at this PR before I add more metrics (I still need to look into metrics related to storage). I wanted to check whether you see any issue with the middleware or with how the metrics are being added.

main.go Outdated
@@ -186,6 +189,7 @@ func initAPMTracer(logger *zap.Logger) *apm.Tracer {
if err != nil {
logger.Fatal("Failed to initialize APM agent", zap.Error(err))
}
tracer.RegisterMetricsGatherer(apmprometheus.Wrap(prometheus.DefaultGatherer))
Contributor Author:

I was able to send the metrics to the Elastic Stack using the APM integration. However, metrics that have labels are stored as plain JSON strings by default, and new dashboards would need to be created (I'm still not sure how to achieve this with the APM integration).

As an alternative, I've added a new endpoint (/metrics) that exposes all the metrics in the Prometheus format. Based on that, we could use Metricbeat to create our own integration.

WDYT? Should we discard the APM integration for the time being in favor of the Prometheus endpoint and a new integration?

Member:

I think we can focus on the Prometheus endpoint for now. It is clearer how to build an integration around that. We can add the APM metrics gatherer later, especially since it is basically a one-liner.

For APM users, some of the request-related information can in any case be obtained in more detail from the APM data.

@mrodm mrodm marked this pull request as ready for review July 4, 2022 14:46
@jsoriano (Member) left a comment:

This is looking good, I added some comments for now.

main.go Outdated
@@ -281,7 +285,9 @@ func getRouter(logger *zap.Logger, config *Config, indexer Indexer) (*mux.Router
router.HandleFunc(signaturesRouterPath, signaturesHandler)
router.HandleFunc(packageIndexRouterPath, packageIndexHandler)
router.HandleFunc(staticRouterPath, staticHandler)
router.Handle("/metrics", promhttp.Handler())
Member:

For production environments we probably want to expose this endpoint on a different host/port, so the metrics are not publicly available. We may need to add a flag for this.

Contributor Author:

That's true, I'll add those flags and run the metrics endpoint in a separate server (e.g. its own http.Server); see the sketch below.

Doing it this way would also let me remove the filtering done in the logging middleware to avoid logging requests to /metrics.
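
A minimal sketch of that idea (the flag name and structure are assumptions, not the final code; the default address follows the README excerpt further below):

package main

import (
	"flag"
	"net/http"

	"github.com/prometheus/client_golang/prometheus/promhttp"
	"go.uber.org/zap"
)

func main() {
	metricsAddress := flag.String("metrics-address", "localhost:9000", "Address where Prometheus metrics are exposed.")
	flag.Parse()

	logger, _ := zap.NewProduction()

	// Serve /metrics on its own listener, separate from the public API server.
	go func() {
		mux := http.NewServeMux()
		mux.Handle("/metrics", promhttp.Handler())
		if err := http.ListenAndServe(*metricsAddress, mux); err != nil && err != http.ErrServerClosed {
			logger.Fatal("metrics server failed", zap.Error(err))
		}
	}()

	// ... the main API server would be started here on its own address.
	select {}
}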

util/metrics.go Outdated
}
labels := prometheus.Labels{"path": path}
handler = promhttp.InstrumentHandlerCounter(httpRequestsTotal.MustCurryWith(labels), handler)
handler = promhttp.InstrumentHandlerDuration(httpRequestDurationSeconds.MustCurryWith(labels), handler)
Member:

APM also collects this kind of duration information. Though I think it is ok to have this also as a metric.

util/metrics.go Outdated
512 * 1024, /* 512KiB */
1024 * 1024, /* 1MiB */
64 * 1024 * 1024, /* 64MiB */
512 * 1024 * 1024, /* 512MiB */
Member:

Is there any reasoning behind the selection of these buckets? Maybe we can use the APM data we already collect to select values according to our use case.

Contributor Author:

No reason for those buckets, I set them more or less arbitrarily and was checking how they fit by running some curl requests.

If APM exposes that data so the actual values can be checked, I'll look at it and update this list.

Contributor:

With the v2 changes, we don't expect any responses greater than a few KB. If Fleet wants to download a ZIP file, it will be redirected (HTTP 301) to https://package-storage.elastic.co/.

Contributor Author:

So, should I plan the buckets now with storage v2 in mind? Buckets with lower values would be needed for sure.

Contributor Author:

Regarding the http_request_duration_seconds metric, which uses the default bucket definition (see footnote 1):

var (
	DefBuckets = []float64{.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10}
)

I've been checking the data collected by APM, and it looks like they could fit well in our scenarios.

Footnotes

  1. https://pkg.go.dev/github.com/prometheus/client_golang/prometheus#section-readme

mrodm and others added 3 commits July 4, 2022 17:09
@mrodm mrodm force-pushed the add_metrics_prometheus branch from 717bfdd to 5e02260 on July 4, 2022 16:40
@mtojek (Contributor) left a comment:

It would be great if you post a sample /metrics response in the issue description.

main.go Outdated
Comment on lines 137 to 140
// If -dry-run=true is set, service stops here after validation
if dryRun {
os.Exit(0)
}
Contributor:

Hm.. I wouldn't hide this condition in the initMetricsServer function as it will be surprising to debug. Shall we move it to the main routine?

Contributor Author:

This function was based on the initServer one, and I must review the usage of dryRun here.

I think the call to os.Exit(0) in initServer could also be moved to main(). I'll check that too.

Contributor:

Sounds great!

Comment on lines -100 to -106
-	// Do not log requests to the health endpoint
-	if r.RequestURI == "/health" {
+	switch r.RequestURI {
+	case "/health":
+		// Do not log requests to these endpoints
 		next.ServeHTTP(w, r)
 		return
+	default:
+		logRequest(logger, next, w, r)
 	}
-
-	logRequest(logger, next, w, r)
Contributor:

Frankly speaking, I would log all requests. At least we would know that some script/bot/logic hits the healthcheck. WDYT?

Member:

I got tempted by this when migrating to the ECS logger, but it flooded the logs quite a bit. Let's leave it for a separate PR if we do it.

Contributor Author:

For context, I only changed the code here because I wanted to filter out logs from /metrics too. As a result of one of the comments above, /metrics is now served on another host:port (http.ListenAndServe()), so in the end this was not needed.

No strong opinion here. I see that filtering the /health endpoint could be advantageous when inspecting logs later, since it could flood Elastic with too many GET requests to /health (they could also be filtered out in a KQL query). However, it hides the actual requests (in the logs) to that endpoint.

I could remove the filter and log all requests, and check whether it causes problems when we try to diagnose or check logs. And if it does, we could add the filter back.
WDYT?

Contributor Author:

I got tempted by this when migrating to the ECS logger, but it flooded the logs quite a bit. Let's leave it for a separate PR if we do it.

Ok, I'll keep the same logic and let's try that in a separate PR.

@mtojek (Contributor) commented Jul 5, 2022

My fault, I haven't read the whole thread. I can see metrics here.

A few ideas to consider:

  1. Is it possible to collect also EPR version (dimension)? It would be nice to correlate metrics with rollouts.
  2. Do you plan to cover also indexers (GetPackages, number of indexed packages, errors, search time, etc.)?

@jsoriano (Member) commented Jul 5, 2022

Is it possible to collect also EPR version (dimension)? It would be nice to correlate metrics with rollouts.

This kind of "global" metadata uses to be added in Prometheus as labels of a dummy metric to avoid increasing a lot the size of responses. For example in Kubernetes you find "metrics" like the following ones, instead of adding this metadata to all the metrics of a service or a node. But assigning this info to the proper events in Metricbeat is a bit tricky.

kube_service_info{namespace="kube-system",service="kube-dns",cluster_ip="10.96.0.10",external_name="",load_balancer_ip=""} 1
kube_node_info{node="kind-worker2",kernel_version="5.10.25-linuxkit",os_image="Ubuntu 21.04",container_runtime_version="containerd://1.5.2",kubelet_version="v1.16.15",kubeproxy_version="v1.16.15",provider_id="kind://docker/kind/kind-worker2",pod_cidr="10.244.1.0/24",internal_ip="172.20.0.3"} 1

In Istio you can also find some versions this way: https://github.com/elastic/beats/blob/949d7ccd1f866ddbcd73edb8ba0cd7b8af7f918d/x-pack/metricbeat/module/istio/istiod/_meta/testdata/istiod.v1.7.1.plain#L357

The problem with this trick for Metricbeat is that the generic Metricbeat collector groups Prometheus metrics by labels, so it may be difficult to put the version in all documents unless we modify the Metricbeat collectors.
Kubernetes metricsets need Go code exactly for that, like here: https://github.com/elastic/beats/blob/949d7ccd1f866ddbcd73edb8ba0cd7b8af7f918d/metricbeat/module/kubernetes/state_service/state_service.go#L77-L87
But this doesn't seem to be available in the generic collector used in integrations.
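
As an illustration of that pattern applied to this service (a sketch only; the metric and label names follow the epr_service_info sample in the PR description):

package metrics

import "github.com/prometheus/client_golang/prometheus"

// serviceInfo is always set to 1; the interesting data travels in its labels.
var serviceInfo = prometheus.NewGaugeVec(prometheus.GaugeOpts{
	Name: "epr_service_info",
	Help: "Version information about this binary",
}, []string{"version", "instance"})

func init() {
	prometheus.MustRegister(serviceInfo)
}

// setServiceInfo records the version and instance labels once at startup.
func setServiceInfo(version, instance string) {
	serviceInfo.With(prometheus.Labels{"version": version, "instance": instance}).Set(1)
}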

@mrodm (Contributor, Author) commented Jul 5, 2022

It would be great if you post a sample /metrics response in the issue description.

Sure! I added some example metrics to the description.

Is it possible to collect also EPR version (dimension)? It would be nice to correlate metrics with rollouts.

This kind of "global" metadata uses to be added in Prometheus as labels of a dummy metric to avoid increasing a lot the size of responses. For example in Kubernetes you find "metrics" like the following ones, instead of adding this metadata to all the metrics of a service or a node. But assigning this info to the proper events in Metricbeat is a bit tricky.

I could try to create a metric with that info, at least to be exposed in /metrics. I also wanted to add a node_name or instance label to the metrics.

2. Do you plan to cover also indexers (GetPackages, number of indexed packages, errors, search time, etc.)?

I wanted to at least cover the number of indexed packages in this PR. I will take a look at the other metrics you mention to check how they can be added.

@mrodm mrodm force-pushed the add_metrics_prometheus branch from 812be9a to 73cca37 on July 5, 2022 09:47
main.go Outdated
Comment on lines 157 to 161
// If -dry-run=true is set, service stops here after validation
if dryRun {
os.Exit(0)
}

Contributor Author:

Moved to main().
It looks safe to execute the following lines:

	router := mustLoadRouter(logger, config, combinedIndexer)
	apmgorilla.Instrument(router, apmgorilla.WithTracer(apmTracer))

	return &http.Server{Addr: address, Handler: router}

and just do the exit call in main.go.
Do you think there could be any issue?

Contributor:

The idea for dry-run is to run the package validation, fail in case of an error, or os.Exit(0) when it's safe. To be 100% sure, you can try running it against a path with packages and see if it doesn't skip validation.

Contributor Author:

Tested in a container, and it seems to work as expected. Here is the output:

root@7f7596816c73:/package-registry# cat config.yml
package_paths:
  - /packages/development
  - /packages/production
  - /packages/staging
  - /packages/snapshot
root@7f7596816c73:/package-registry# ./package-registry --dry-run
{"log.level":"info","@timestamp":"2022-07-05T11:54:55.958Z","log.origin":{"file.name":"package-registry/main.go","file.line":95},"message":"Package registry started","ecs.version":"1.6.0"}
{"log.level":"info","@timestamp":"2022-07-05T11:54:55.961Z","log.origin":{"file.name":"package-registry/main.go","file.line":247},"message":"Packages paths: /packages/development, /packages/production, /packages/staging, /packages/snapshot","ecs.version":"1.6.0"}
{"log.level":"info","@timestamp":"2022-07-05T11:54:55.961Z","log.origin":{"file.name":"package-registry/main.go","file.line":249},"message":"Cache time for /search: 10m0s","ecs.version":"1.6.0"}
{"log.level":"info","@timestamp":"2022-07-05T11:54:55.961Z","log.origin":{"file.name":"package-registry/main.go","file.line":250},"message":"Cache time for /categories: 10m0s","ecs.version":"1.6.0"}
{"log.level":"info","@timestamp":"2022-07-05T11:54:55.961Z","log.origin":{"file.name":"package-registry/main.go","file.line":251},"message":"Cache time for all others: 10m0s","ecs.version":"1.6.0"}
{"log.level":"info","@timestamp":"2022-07-05T11:54:55.962Z","log.origin":{"file.name":"packages/packages.go","file.line":191},"message":"Searching packages in /packages/development","ecs.version":"1.6.0"}
{"log.level":"info","@timestamp":"2022-07-05T11:54:55.966Z","log.origin":{"file.name":"packages/packages.go","file.line":191},"message":"Searching packages in /packages/production","ecs.version":"1.6.0"}
{"log.level":"info","@timestamp":"2022-07-05T11:55:01.087Z","log.origin":{"file.name":"packages/packages.go","file.line":191},"message":"Searching packages in /packages/staging","ecs.version":"1.6.0"}
{"log.level":"info","@timestamp":"2022-07-05T11:55:01.309Z","log.origin":{"file.name":"packages/packages.go","file.line":191},"message":"Searching packages in /packages/snapshot","ecs.version":"1.6.0"}
{"log.level":"info","@timestamp":"2022-07-05T11:55:02.318Z","log.origin":{"file.name":"packages/packages.go","file.line":191},"message":"Searching packages in /packages/development","ecs.version":"1.6.0"}
{"log.level":"info","@timestamp":"2022-07-05T11:55:02.807Z","log.origin":{"file.name":"packages/packages.go","file.line":191},"message":"Searching packages in /packages/production","ecs.version":"1.6.0"}
{"log.level":"info","@timestamp":"2022-07-05T11:55:02.828Z","log.origin":{"file.name":"packages/packages.go","file.line":191},"message":"Searching packages in /packages/staging","ecs.version":"1.6.0"}
{"log.level":"info","@timestamp":"2022-07-05T11:55:02.932Z","log.origin":{"file.name":"packages/packages.go","file.line":191},"message":"Searching packages in /packages/snapshot","ecs.version":"1.6.0"}
{"log.level":"info","@timestamp":"2022-07-05T11:55:02.932Z","log.origin":{"file.name":"package-registry/main.go","file.line":269},"message":"1584 package manifests loaded","ecs.version":"1.6.0"}
root@7f7596816c73:/package-registry#

Member:

I find it confusing to now need so many if dryRun { return } checks.

Could we refactor this so it is more explicit what is and isn't run when dryRun is enabled?

If the idea is to run only the package validation, maybe we can move the initial package loading to its own function, and in main(), right after initializing the logger, just do:

if dryRun {
    initIndexers(logger)
    os.Exit(0)
}

(initIndexers would contain what initServer does before the current if dryRun; initServer would also call initIndexers.)

@mtojek (Contributor) commented Jul 5, 2022

I could try to create a metric with that info, at least to be exposed in /metrics. I also wanted to add a node_name or instance label to the metrics.

My intention was to fix the following use case: during the rollout of a new EPR deployment, we can easily determine whether errors are coming from the freshly deployed app.

@mrodm mrodm requested review from jsoriano and mtojek July 5, 2022 17:29
@mtojek (Contributor) left a comment:

I left a few minor comments, but in general, it's close to getting merged. We can add more metrics in follow-ups.

One thing that is missing is a CHANGELOG entry.

main.go Outdated
@@ -143,6 +152,11 @@ func initMetricsServer(logger *zap.Logger) {
return
}
logger.Info("Starting http metrics in " + metricsAddress)
hostname, found := os.LookupEnv("HOSTNAME")
Contributor:

I think that there is a specific function called os.Hostname.

Contributor Author:

Thanks! Updated to use that function.

util/metrics.go Outdated
@@ -104,6 +110,8 @@ var (
// MetricsMiddleware is a middleware used to measure every request received
func MetricsMiddleware() mux.MiddlewareFunc {
// Register all metrics
prometheus.MustRegister(ServiceInfo)
Contributor:

Should ServiceInfo be exported, or can it be private?

util/metrics.go Outdated
@@ -0,0 +1,140 @@
// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
Contributor:

I wouldn't put more stuff in util as it will become a trash can eventually. What about another package called metrics?

Contributor Author:

Sure! I will put this code in a new metrics package.
While working on the last changes I was thinking that a name like utils.ServiceInfo was not very meaningful, so it is better in its own package.

main.go Outdated

hostname, found := os.LookupEnv("HOSTNAME")
if !found {
hostname = defaultInstanceName
Member:

You can also try with os.Hostname().

Contributor Author:

Sure, changed

Member:

I meant that you could use both methods 🙂 the HOSTNAME environment variable, and os.Hostname().
Consider creating a function that obtains the hostname from one of these two methods, with localhost as the final fallback; a sketch follows.
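
A sketch of such a helper (the function name is just an assumption):

package main

import "os"

// getHostname tries the HOSTNAME environment variable first, then
// os.Hostname(), and finally falls back to "localhost".
func getHostname() string {
	if hostname, found := os.LookupEnv("HOSTNAME"); found && hostname != "" {
		return hostname
	}
	if hostname, err := os.Hostname(); err == nil && hostname != "" {
		return hostname
	}
	return "localhost"
}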

search.go Outdated
packages, err := indexer.Get(r.Context(), &opts)
if err != nil {
notFoundError(w, errors.Wrapf(err, "fetching package failed"))
return
}
util.SearchProcessDurationSeconds.Observe(time.Since(start).Seconds())
Member:

We are overlapping a bit with APM here. APM is intended for this kind of monitoring, and provides more contextual information, such as request data or information about parent and child spans.
Without some context this metric doesn't say much, but if we start adding context to it, such as the parameters used, we can end up with a metrics explosion.
I would remove it for now since this is covered by APM, and maybe we can reconsider it in the future.

util/metrics.go Outdated

const metricsNamespace = "epr"

// info metric
Member:

Use godoc style in this kind of comment, starting with the name of what is being described.

Suggested change
// info metric
// ServiceInfo is a metric used to report information about the service. Labels are used for that, its value is not relevant.

Contributor Author (@mrodm, Jul 6, 2022):

I added those comments just for some kind of grouping. I'll delete them; since every metric variable already has a help message, I don't think an extra comment is needed.

main.go Outdated
if !found {
hostname = defaultInstanceName
}
util.ServiceInfo.WithLabelValues(version, hostname).Set(1)
Member:

Nit. I don't find WithLabelValues very intuitive, I had to double-check where the label names are defined (I see it is in the definition of ServiceInfo).
Maybe we could use With with explicit label names.

Suggested change
util.ServiceInfo.WithLabelValues(version, hostname).Set(1)
util.ServiceInfo.With(prometheus.Labels{"version": version, "hostname": hostname}).Set(1)

Or given the particularity of this "metric", consider adding a method for this, so here we can do something like:

Suggested change
util.ServiceInfo.WithLabelValues(version, hostname).Set(1)
util.SetServiceInfo(version, hostname)

Contributor Author:

Agreed, it's not very intuitive.
As new metrics using labels have been added (e.g. StorageRequestsTotal), I would keep the first approach you proposed (using With()), and set/increment all the metrics in the same way.

@@ -160,6 +160,7 @@ func (i *Indexer) updateIndex(ctx context.Context) error {
defer i.m.Unlock()
i.cursor = storageCursor.Current
Member:

If this is a string, we could collect it with something like the service info.

Something else that we could measure here is the count of updates; this may be an interesting metric to see how frequently we are requesting an update. A sketch of such counters follows.

(Btw, we could instrument the storage indexer more with APM, but let's leave that for other PRs.)
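
For illustration, counters for the index updates could look like this (a sketch using the metric names from the PR description; the help texts are assumptions):

package metrics

import "github.com/prometheus/client_golang/prometheus"

var (
	storageIndexerUpdateIndexSuccessTotal = prometheus.NewCounter(prometheus.CounterOpts{
		Name: "epr_storage_indexer_updates_index_success_total",
		Help: "Counter of successful updates of the storage index.",
	})
	storageIndexerUpdateIndexErrorTotal = prometheus.NewCounter(prometheus.CounterOpts{
		Name: "epr_storage_indexer_updates_index_error_total",
		Help: "Counter of failed updates of the storage index.",
	})
)

func init() {
	prometheus.MustRegister(storageIndexerUpdateIndexSuccessTotal, storageIndexerUpdateIndexErrorTotal)
}

// Inside updateIndex, one of the counters would be incremented depending on
// whether the update succeeded or returned an error.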

* `/epr/{name}/{name}-{version}.tar.gz`: Download a package
* `/epr/{name}/{name}-{version}.zip`: Download a package
Contributor Author (@mrodm, Jul 6, 2022):

According to the artifact handler, only the zip extension is allowed in the regex.

Contributor:

Yes, good catch :)

mrodm added 4 commits July 6, 2022 12:43
Removed epr_search_process_duration_seconds metric in favor of creating
specific metrics for storage indexer get and update functions.
@mrodm mrodm requested review from mtojek and jsoriano July 6, 2022 12:39
README.md Outdated
@@ -236,6 +236,19 @@ It will be listening in the given address.

You can read more about this profiler and the available endpoints in the [pprof documentation](https://pkg.go.dev/net/http/pprof).

## Metrics
Package registry is instrumented to expose Prometheus metrics. These metrics are exposed under the `/metrics` endpoint.
By default this endpoint listens on `localhost:9000`. These metrics can be scraped like:
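
For example, an illustrative scrape against the default address (the exact snippet from the README is not shown in this diff excerpt) would be:

curl -s http://localhost:9000/metrics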
Member:

So far APM and pprof are disabled by default, should we do the same with metrics? This would also help to avoid backwards compatibility issues.

Contributor Author:

Ok, I'll keep the same approach to avoid any possible backwards compatibility issue.

Comment on lines 38 to 39
path, _ := route.GetPathTemplate()
labels := prometheus.Labels{"path": path}
Member:

Should we handle the error of route.GetPathTemplate(), and set the path label only when available?

Suggested change
path, _ := route.GetPathTemplate()
labels := prometheus.Labels{"path": path}
path, err := route.GetPathTemplate()
if err == nil {
labels := prometheus.Labels{"path": path}
}

Contributor Author:

Good catch, I missed handling this error.

As the prometheus.Labels value must include all the labels defined in the metrics (path should have some value), I think I'll go with this:

			path, err := route.GetPathTemplate()
			if err != nil {
				path = "unknown"
			}
			labels := prometheus.Labels{"path": path}

When I try to query URLs that are not defined, like https://localhost:8080/notfound, the router.NotFoundHandler is executed and there is no metric about that in the metrics endpoint. That's why I think that error was never raised in my tests.

@mrodm mrodm force-pushed the add_metrics_prometheus branch from b560dc6 to 8492d33 on July 6, 2022 18:01
@mtojek (Contributor) left a comment:

It seems to be complete, all my comments are addressed. Well done!

Once you get a "green light" from Jaime, feel free to merge this PR.

@@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

* Update Go version and base Ubuntu image. [#821](https://github.com/elastic/package-registry/pull/821)
* Add support for "threat_intel" category. [#841](https://github.com/elastic/package-registry/pull/841)
* Instrument package registry with Prometheus metrics. [#827](https://github.com/elastic/package-registry/pull/827)
Contributor Author (@mrodm, Jul 6, 2022):

Should I mark this as experimental here and in the README? I was thinking of the case where some metrics (e.g. names or buckets) need to be updated later.

Contributor:

Nah, I would say that we can go without special labels like experimental or beta. It's just yet another feature. If it starts failing, we will bugfix it.

@jsoriano (Member) left a comment:

👍

@mrodm mrodm merged commit 5090808 into elastic:main Jul 7, 2022
Labels: Team:Ecosystem (Label for the Packages Ecosystem team)
5 participants