From 584713cb079f4ab2618bc0b2822102d3b2ecc8f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Paolo=20Chil=C3=A0?= Date: Mon, 6 May 2024 11:12:57 +0200 Subject: [PATCH] Reduce agent logs by default (#4633) * set intermediate verification error logs to debug * Drop non-zero metrics periodic logs in monitoring config * Add script for elastic-agent logs and metrics disk size comparison * changelog --- ...14717717-Reduce-agent-logs-by-default.yaml | 32 ++ .../es/elastic-agent-logs-comparison.http | 407 ++++++++++++++++++ .../application/monitoring/v1_monitor.go | 10 + .../artifact/download/composed/verifier.go | 3 +- 4 files changed, 450 insertions(+), 2 deletions(-) create mode 100644 changelog/fragments/1714717717-Reduce-agent-logs-by-default.yaml create mode 100644 docs/scripts/es/elastic-agent-logs-comparison.http diff --git a/changelog/fragments/1714717717-Reduce-agent-logs-by-default.yaml b/changelog/fragments/1714717717-Reduce-agent-logs-by-default.yaml new file mode 100644 index 00000000000..99436a70a45 --- /dev/null +++ b/changelog/fragments/1714717717-Reduce-agent-logs-by-default.yaml @@ -0,0 +1,32 @@ +# Kind can be one of: +# - breaking-change: a change to previously-documented behavior +# - deprecation: functionality that is being removed in a later release +# - bug-fix: fixes a problem in a previous version +# - enhancement: extends functionality but does not break or fix existing behavior +# - feature: new functionality +# - known-issue: problems that we are aware of in a given version +# - security: impacts on the security of a product or a user’s deployment. +# - upgrade: important information for someone upgrading from a prior version +# - other: does not fit into any of the other categories +kind: enhancement + +# Change summary; a 80ish characters long description of the change. +summary: Reduce agent logs by default by dropping "Non-zero metrics..." 
logs before ES ingestion + +# Long description; in case the summary is not enough to describe the change +# this field accommodates a description without length limits. +# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment. +#description: + +# Affected component; a word indicating the component this changeset affects. +component: elastic-agent + +# PR URL; optional; the PR number that added the changeset. +# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added. +# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. +# Please provide it if you are adding a fragment for a different PR. +pr: https://github.com/elastic/elastic-agent/pull/4633 + +# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of). +# If not present is automatically filled by the tooling with the issue linked to the PR number. +issue: https://github.com/elastic/elastic-agent/issues/4252 diff --git a/docs/scripts/es/elastic-agent-logs-comparison.http b/docs/scripts/es/elastic-agent-logs-comparison.http new file mode 100644 index 00000000000..8f6fd8ec4c1 --- /dev/null +++ b/docs/scripts/es/elastic-agent-logs-comparison.http @@ -0,0 +1,407 @@ +# This script is used to measure the disk size of elastic-agent logs and metrics. +# We need to run a baseline and an updated version of the agent for comparison. +# After ingesting the logs and metrics from both runs we reindex the documents based on +# agent ids and timestamp range and store them in dedicated indices so we can measure +# disk size (the new indices are created with the same mappings as the elastic-agent +# logs and metrics datastreams) + +# In order to simplify running the script, a few variables have been defined. 
+# Note: The *_start_ts variables seem to not always render correctly, we may +# just replace the values as a workaround + +# Variables (with some sample values) +# +# Variable name Sample value +# +# baseline_agent_id acaf7ee8-defa-476a-bd57-2ef565809470 +# updated_agent_id e9a7c222-172e-4d39-9dbb-311be5e24a75 +# baseline_start_ts 2024-04-29T06:40:39 +# updated_start_ts 2024-04-29T08:53:23 +# time_interval +1h + + + +# Cleanup + +# Test log indices and templates +DELETE /_data_stream/logs-elastic_agent-disksize.baseline +DELETE /_data_stream/logs-elastic_agent-disksize.updated +DELETE /_data_stream/logs-elastic_agent.filebeat-disksize.baseline +DELETE /_data_stream/logs-elastic_agent.filebeat-disksize.updated +DELETE /_data_stream/logs-elastic_agent.metricbeat-disksize.baseline +DELETE /_data_stream/logs-elastic_agent.metricbeat-disksize.updated + +# Test metrics indices and templates +DELETE /metrics-elastic_agent.elastic_agent-disksize.baseline +DELETE /metrics-elastic_agent.elastic_agent-disksize.updated +DELETE /_index_template/metrics-elastic_agent.elastic_agent-disksize + +DELETE /metrics-elastic_agent.filebeat-disksize.baseline +DELETE /metrics-elastic_agent.filebeat-disksize.updated +DELETE /_index_template/metrics-elastic_agent.filebeat-disksize + +DELETE /metrics-elastic_agent.metricbeat-disksize.baseline +DELETE /metrics-elastic_agent.metricbeat-disksize.updated +DELETE /_index_template/metrics-elastic_agent.metricbeat-disksize + + +# Recreate index templates + +# Mappings we want to use in our test index templates +# GET /_index_template/logs-elastic_agent +# GET /_index_template/metrics-elastic_agent.elastic_agent +# GET /_index_template/metrics-elastic_agent.filebeat +# GET /_index_template/metrics-elastic_agent.metricbeat + + +PUT /_index_template/metrics-elastic_agent.elastic_agent-disksize +{ + "index_patterns": [ + "metrics-elastic_agent.elastic_agent-disksize*" + ], + "template": { + "settings": { + "index": { + "mode": "time_series", + 
"routing_path": [ + "component.id", + "agent.id", + "metricset.name" + ] + } + }, + "mappings": { + } + }, + "composed_of": [ + "metrics@tsdb-settings", + "metrics-elastic_agent.elastic_agent@package", + "metrics-elastic_agent.elastic_agent@custom", + "ecs@mappings", + ".fleet_globals-1", + ".fleet_agent_id_verification-1" + ], + "priority": 201, + "ignore_missing_component_templates": [ + "metrics-elastic_agent.elastic_agent@custom" + ] +} + +PUT /_index_template/metrics-elastic_agent.filebeat-disksize +{ + "index_patterns": [ + "metrics-elastic_agent.filebeat-disksize*" + ], + "template": { + "settings": { + "index": { + "mode": "time_series", + "routing_path": [ + "component.id", + "agent.id", + "metricset.name" + ] + } + }, + "mappings": { + } + }, + "composed_of": [ + "metrics@tsdb-settings", + "metrics-elastic_agent.filebeat@package", + "metrics-elastic_agent.filebeat@custom", + "ecs@mappings", + ".fleet_globals-1", + ".fleet_agent_id_verification-1" + ], + "priority": 201, + "ignore_missing_component_templates": [ + "metrics-elastic_agent.filebeat@custom" + ] +} + +PUT /_index_template/metrics-elastic_agent.metricbeat-disksize +{ + "index_patterns": [ + "metrics-elastic_agent.metricbeat-disksize*" + ], + "template": { + "settings": { + "index": { + "mode": "time_series", + "routing_path": [ + "component.id", + "agent.id", + "metricset.name" + ] + } + }, + "mappings": { + } + }, + "composed_of": [ + "metrics@tsdb-settings", + "metrics-elastic_agent.metricbeat@package", + "metrics-elastic_agent.metricbeat@custom", + "ecs@mappings", + ".fleet_globals-1", + ".fleet_agent_id_verification-1" + ], + "priority": 201, + "ignore_missing_component_templates": [ + "metrics-elastic_agent.metricbeat@custom" + ] +} + +# Reindex a subset of the elastic-agent logs in the new indices +POST _reindex +{ + "source": { + "index": "logs-elastic_agent-default", + "query": { + "bool": { + "filter": [ + {"term": {"agent.id":"${baseline_agent_id}"}}, + {"range": {"@timestamp": {"gte": 
"${baseline_start_ts}", "lte": "${baseline_start_ts}||${time_interval}"}}} + ] + } + } + }, + "dest": { + "index": "logs-elastic_agent-disksize.baseline", + "op_type": "create" + } +} + +POST _reindex +{ + "source": { + "index": "logs-elastic_agent-default", + "query": { + "bool": { + "filter": [ + {"term": {"agent.id":"${updated_agent_id}"}}, + {"range": {"@timestamp": {"gte": "${updated_start_ts}", "lte": "${updated_start_ts}||${time_interval}"}}} + ] + } + } + }, + "dest": { + "index": "logs-elastic_agent-disksize.updated", + "op_type": "create" + } +} + +POST _reindex +{ + "source": { + "index": "logs-elastic_agent.filebeat-default", + "query": { + "bool": { + "filter": [ + {"term": {"agent.id":"${baseline_agent_id}"}}, + {"range": {"@timestamp": {"gte": "${baseline_start_ts}", "lte": "${baseline_start_ts}||${time_interval}"}}} + ] + } + } + }, + "dest": { + "index": "logs-elastic_agent.filebeat-disksize.baseline", + "op_type": "create" + } +} + +POST _reindex +{ + "source": { + "index": "logs-elastic_agent.filebeat-default", + "query": { + "bool": { + "filter": [ + {"term": {"agent.id":"${updated_agent_id}"}}, + {"range": {"@timestamp": {"gte": "${updated_start_ts}", "lte": "${updated_start_ts}||${time_interval}"}}} + ] + } + } + }, + "dest": { + "index": "logs-elastic_agent.filebeat-disksize.updated", + "op_type": "create" + } +} + +POST _reindex +{ + "source": { + "index": "logs-elastic_agent.metricbeat-default", + "query": { + "bool": { + "filter": [ + {"term": {"agent.id":"${baseline_agent_id}"}}, + {"range": {"@timestamp": {"gte": "${baseline_start_ts}", "lte": "${baseline_start_ts}||${time_interval}"}}} + ] + } + } + }, + "dest": { + "index": "logs-elastic_agent.metricbeat-disksize.baseline", + "op_type": "create" + } +} + +POST _reindex +{ + "source": { + "index": "logs-elastic_agent.metricbeat-default", + "query": { + "bool": { + "filter": [ + {"term": {"agent.id":"${updated_agent_id}"}}, + {"range": {"@timestamp": {"gte": "${updated_start_ts}", "lte": 
"${updated_start_ts}||${time_interval}"}}} + ] + } + } + }, + "dest": { + "index": "logs-elastic_agent.metricbeat-disksize.updated", + "op_type": "create" + } +} + +# Reindex a subset of the elastic-agent metrics in the new indices +POST _reindex +{ + "source": { + "index": "metrics-elastic_agent.elastic_agent-default", + "query": { + "bool": { + "filter": [ + {"term": {"agent.id":"${baseline_agent_id}"}}, + {"range": {"@timestamp": {"gte": "${baseline_start_ts}", "lte": "${baseline_start_ts}||${time_interval}"}}} + ] + } + } + }, + "dest": { + "index": "metrics-elastic_agent.elastic_agent-disksize.baseline", + "op_type": "create" + } +} + +POST _reindex +{ + "source": { + "index": "metrics-elastic_agent.elastic_agent-default", + "query": { + "bool": { + "filter": [ + {"term": {"agent.id":"${updated_agent_id}"}}, + {"range": {"@timestamp": {"gte": "${updated_start_ts}", "lte": "${updated_start_ts}||${time_interval}"}}} + ] + } + } + }, + "dest": { + "index": "metrics-elastic_agent.elastic_agent-disksize.updated", + "op_type": "create" + } +} + +POST _reindex +{ + "source": { + "index": "metrics-elastic_agent.filebeat-default", + "query": { + "bool": { + "filter": [ + {"term": {"agent.id":"${baseline_agent_id}"}}, + {"range": {"@timestamp": {"gte": "${baseline_start_ts}", "lte": "${baseline_start_ts}||${time_interval}"}}} + ] + } + } + }, + "dest": { + "index": "metrics-elastic_agent.filebeat-disksize.baseline", + "op_type": "create" + } +} + + +POST _reindex +{ + "source": { + "index": "metrics-elastic_agent.filebeat-default", + "query": { + "bool": { + "filter": [ + {"term": {"agent.id":"${updated_agent_id}"}}, + {"range": {"@timestamp": {"gte": "${updated_start_ts}", "lte": "${updated_start_ts}||${time_interval}"}}} + ] + } + } + }, + "dest": { + "index": "metrics-elastic_agent.filebeat-disksize.updated", + "op_type": "create" + } +} + +POST _reindex +{ + "source": { + "index": "metrics-elastic_agent.metricbeat-default", + "query": { + "bool": { + "filter": [ + 
{"term": {"agent.id":"${baseline_agent_id}"}}, + {"range": {"@timestamp": {"gte": "${baseline_start_ts}", "lte": "${baseline_start_ts}||${time_interval}"}}} + ] + } + } + }, + "dest": { + "index": "metrics-elastic_agent.metricbeat-disksize.baseline", + "op_type": "create" + } +} + +POST _reindex +{ + "source": { + "index": "metrics-elastic_agent.metricbeat-default", + "query": { + "bool": { + "filter": [ + {"term": {"agent.id":"${updated_agent_id}"}}, + {"range": {"@timestamp": {"gte": "${updated_start_ts}", "lte": "${updated_start_ts}||${time_interval}"}}} + ] + } + } + }, + "dest": { + "index": "metrics-elastic_agent.metricbeat-disksize.updated", + "op_type": "create" + } +} + +# Check indices disk usage + +## Logs +POST /logs-elastic_agent-disksize.baseline/_disk_usage?run_expensive_tasks=true +POST /logs-elastic_agent-disksize.updated/_disk_usage?run_expensive_tasks=true + +POST /logs-elastic_agent.filebeat-disksize.baseline/_disk_usage?run_expensive_tasks=true +POST /logs-elastic_agent.filebeat-disksize.updated/_disk_usage?run_expensive_tasks=true + +POST /logs-elastic_agent.metricbeat-disksize.baseline/_disk_usage?run_expensive_tasks=true +POST /logs-elastic_agent.metricbeat-disksize.updated/_disk_usage?run_expensive_tasks=true + +## Metrics +POST /metrics-elastic_agent.elastic_agent-disksize.baseline/_disk_usage?run_expensive_tasks=true +POST /metrics-elastic_agent.elastic_agent-disksize.updated/_disk_usage?run_expensive_tasks=true + +POST /metrics-elastic_agent.filebeat-disksize.baseline/_disk_usage?run_expensive_tasks=true +POST /metrics-elastic_agent.filebeat-disksize.updated/_disk_usage?run_expensive_tasks=true + +POST /metrics-elastic_agent.metricbeat-disksize.baseline/_disk_usage?run_expensive_tasks=true +POST /metrics-elastic_agent.metricbeat-disksize.updated/_disk_usage?run_expensive_tasks=true \ No newline at end of file diff --git a/internal/pkg/agent/application/monitoring/v1_monitor.go b/internal/pkg/agent/application/monitoring/v1_monitor.go 
index 2b278f5a7c2..82fce6ff29b 100644 --- a/internal/pkg/agent/application/monitoring/v1_monitor.go +++ b/internal/pkg/agent/application/monitoring/v1_monitor.go @@ -349,6 +349,16 @@ func (b *BeatsMonitor) injectLogsInput(cfg map[string]interface{}, components [] }, }, }, + // drop periodic metrics logs (those are useful mostly in diagnostic dumps where we collect log files) + map[string]interface{}{ + "drop_event": map[string]interface{}{ + "when": map[string]interface{}{ + "regexp": map[string]interface{}{ + "message": "^Non-zero metrics in the last", + }, + }, + }, + }, // copy original dataset so we can drop the dataset field map[string]interface{}{ "copy_fields": map[string]interface{}{ diff --git a/internal/pkg/agent/application/upgrade/artifact/download/composed/verifier.go b/internal/pkg/agent/application/upgrade/artifact/download/composed/verifier.go index bfb305c8cf0..b9240cce638 100644 --- a/internal/pkg/agent/application/upgrade/artifact/download/composed/verifier.go +++ b/internal/pkg/agent/application/upgrade/artifact/download/composed/verifier.go @@ -49,9 +49,8 @@ func (v *Verifier) Verify(a artifact.Artifact, version agtversion.ParsedSemVer, return nil } + v.log.Debugw("Verifier failed!", "verifier", verifier.Name(), "error", e) err = multierror.Append(err, e) - - v.log.Warnw("Verifier failed!", "verifier", verifier.Name(), "error", e) } return err