Skip to content

Commit

Permalink
[Stack Monitoring] Collect metrics errors in health api (#148750)
Browse files Browse the repository at this point in the history
Closes #140358

## Summary
This PR adds integration package errors to the `_health` endpoint. The
response looks similar to the metricbeat one we already have. The
difference is that packages use `data_stream.dataset` instead of
`metricset.name`.

## Testing

1. Setup an integration
- Follow this
[guide](https://github.com/klacabane/observability-dev/blob/0e924730cc5d9ae16f67420eb6fa3b2337b98dc9/docs/infra-obs-ui/stack-monitoring_integration-packages.md)
to install the package. ~~⚠️ Note that you need to have `metricset.name`
mapping to see the errors added in this
[PR](https://github.com/elastic/integrations/pull/4973)~~ - No longer
needed, as we use `data_stream.dataset` for packages.
2. Installing the package: 
- In this case, we want to enable metrics and set the Agent policy to
Elastic-Agent (elastic-package). To see an error I used the wrong host
URL (inside `Change Defaults`) when I was configuring the elasticsearch
integration:
 
<img width="1313" alt="image"
src="https://user-images.githubusercontent.com/14139027/212313537-f1f6f5b1-6e4d-40f2-8f78-9f0e5f48c434.png">

3. To see the package errors, run `curl --user elastic:changeme
http://localhost:5602/ftw/api/monitoring/v1/_health` or open
http://localhost:5602/ftw/api/monitoring/v1/_health in a browser where
you are logged in to Kibana. (Kibana is on port 5602 here because, as
mentioned in the
[guide](https://github.com/klacabane/observability-dev/blob/0e924730cc5d9ae16f67420eb6fa3b2337b98dc9/docs/infra-obs-ui/stack-monitoring_integration-packages.md#connecting-a-local-kibana),
this is your local Kibana. I recommend adding the package
to the elastic-package Kibana running on 5601 and starting your local
Kibana after the setup is complete.)
 
Example response:
```
{
    "metricbeatErrors": {
        ...
    },
    "packageErrors": {
        "products": {
            "elasticsearch": {
                "elasticsearch.stack_monitoring.node": [
                    {
                        "message": "error making http request: Get \"http://localhost:9200/_nodes/_local\": dial tcp 127.0.0.1:9200: connect: connection refused",
                        "lastSeen": "2023-01-12T17:27:01.862Z"
                    }
                ],
                "elasticsearch.stack_monitoring.node_stats": [
                    {
                        "message": "error making http request: Get \"http://localhost:9200/_nodes/_local/stats\": dial tcp [::1]:9200: connect: cannot assign requested address",
                        "lastSeen": "2023-01-12T17:26:31.883Z"
                    }
                ],
                .....
            },
            "execution": {
                "timedOut": false,
                "errors": []
            }
        }
    }
}
```
  • Loading branch information
jennypavlova authored Jan 17, 2023
1 parent 4bdf1d4 commit 066ee1c
Show file tree
Hide file tree
Showing 17 changed files with 1,975 additions and 105 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,4 @@ The response includes sections that can provide useful informations in a debuggi
- settings: a subset of the kibana.yml settings relevant to stack monitoring
- monitoredClusters: a representation of the monitoring documents available to the running kibana. It exposes which metricsets are collected, by which collection mode, and when each was last ingested. The query groups the metricsets by product and can help identify missing documents that could explain why a page is not loading or is crashing
- metricbeatErrors: a list of errors encountered by metricbeat processes when collecting data
- packageErrors: a list of errors encountered by integration package processes when collecting data
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

import { buildErrors } from './build_errors';
import assert from 'assert';

/**
 * Unit tests for `buildErrors`.
 *
 * Each case feeds the function a single product bucket holding one dataset
 * bucket with three error documents, two of which carry the same message.
 * The expected result lists that repeated message once, stamped with the
 * timestamp of the first hit in the fixture, followed by the distinct message.
 */
describe(__filename, () => {
  // Wraps a timestamp and an error message in the `_source` shape used by the fixtures.
  const errorHit = (timestamp: string, message: string) => ({
    _source: {
      '@timestamp': timestamp,
      error: {
        message,
      },
    },
  });

  // Builds the bucket list handed to buildErrors: one product containing one dataset.
  const productBucket = (
    product: string,
    dataset: string,
    hits: Array<ReturnType<typeof errorHit>>
  ) => [
    {
      key: product,
      errors_by_dataset: {
        buckets: [
          {
            key: dataset,
            latest_docs: {
              hits: {
                hits,
              },
            },
          },
        ],
      },
    },
  ];

  const genericError = 'Generic random error';

  describe('buildErrors', () => {
    test('Metricbeat: it should build an object containing dedup error messages per event.dataset', () => {
      const connectionRefused =
        'error making http request: Get "http://host.docker.internal:5067/state": dial tcp 192.168.65.2:5067: connect: connection refused';

      const buckets = productBucket('beat', 'state', [
        errorHit('2022-07-26T08:43:32.625Z', connectionRefused),
        errorHit('2022-07-26T08:42:32.625Z', connectionRefused),
        errorHit('2022-07-26T08:41:32.625Z', genericError),
      ]);

      const expected = {
        beat: {
          state: [
            { lastSeen: '2022-07-26T08:43:32.625Z', message: connectionRefused },
            { lastSeen: '2022-07-26T08:41:32.625Z', message: genericError },
          ],
        },
      };

      const builtErrors = buildErrors(buckets);
      assert.deepEqual(builtErrors, expected);
    });

    test('Packages: it should build an object containing dedup error messages per event.dataset', () => {
      const cannotAssignAddress =
        'error making http request: Get "https://localhost:9200/_nodes/_local": dial tcp [::1]:9200: connect: cannot assign requested address';

      const buckets = productBucket('elasticsearch', 'state', [
        errorHit('2023-01-10T14:39:37.114Z', cannotAssignAddress),
        errorHit('2023-01-10T14:39:27.114Z', cannotAssignAddress),
        errorHit('2022-07-26T08:41:32.625Z', genericError),
      ]);

      const expected = {
        elasticsearch: {
          state: [
            { lastSeen: '2023-01-10T14:39:37.114Z', message: cannotAssignAddress },
            { lastSeen: '2022-07-26T08:41:32.625Z', message: genericError },
          ],
        },
      };

      const builtErrors = buildErrors(buckets);
      assert.deepEqual(builtErrors, expected);
    });
  });
});
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@
* 2.0.
*/

import type { MetricbeatMonitoredProduct } from '../types';
import type { MonitoredProduct } from '../types';

export type MetricbeatProducts = {
[product in MetricbeatMonitoredProduct]?: ErrorsByMetricset;
export type Products = {
[product in MonitoredProduct]?: ErrorsByMetricset;
};

interface ErrorsByMetricset {
Expand All @@ -21,14 +21,14 @@ interface ErrorDetails {
}

/**
* builds a normalized representation of the metricbeat errors from the provided
* builds a normalized representation of the metricbeat and integration package errors from the provided
* query buckets with a product->metricset hierarchy where
* product: the monitored products (eg elasticsearch)
* metricset: the collected metricsets for a given entity
*
* example:
* {
* "product": {
* "products": {
* "logstash": {
* "node": {
* "message": "some error message",
Expand All @@ -38,7 +38,7 @@ interface ErrorDetails {
* }
* }
*/
export const buildMetricbeatErrors = (modulesBucket: any[]): MetricbeatProducts => {
export const buildErrors = (modulesBucket: any[]): Products => {
return (modulesBucket ?? []).reduce((module, { key, errors_by_dataset: errorsByDataset }) => {
const datasets = buildMetricsets(errorsByDataset.buckets);
if (Object.keys(datasets).length === 0) {
Expand All @@ -49,7 +49,7 @@ export const buildMetricbeatErrors = (modulesBucket: any[]): MetricbeatProducts
...module,
[key]: datasets,
};
}, {} as MetricbeatProducts);
}, {} as Products);
};

const buildMetricsets = (errorsByDataset: any[]): ErrorsByMetricset => {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,24 +5,28 @@
* 2.0.
*/

import type { MetricbeatMonitoredProduct, QueryOptions } from '../types';
import type { MetricbeatMonitoredProduct, PackagesMonitoredProduct, QueryOptions } from '../types';

const MAX_BUCKET_SIZE = 50;

/**
* Returns a nested aggregation of error messages per event.datasets.
* Each module (beats, kibana...) can contain one or multiple metricsets with error messages
*/
interface MetricbeatErrorsQueryOptions extends QueryOptions {
products: MetricbeatMonitoredProduct[];
interface ErrorsQueryOptions extends QueryOptions {
products: MetricbeatMonitoredProduct[] | PackagesMonitoredProduct[];
errorQueryType: 'metricbeatErrorsQuery' | 'packageErrorsQuery';
errorQueryIsDataStream?: boolean;
}

export const metricbeatErrorsQuery = ({
export const errorsQuery = ({
timeRange,
timeout,
products,
}: MetricbeatErrorsQueryOptions) => {
if (!timeRange) throw new Error('metricbeatErrorsQuery: missing timeRange parameter');
errorQueryType,
errorQueryIsDataStream,
}: ErrorsQueryOptions) => {
if (!timeRange) throw new Error(`${errorQueryType}: missing timeRange parameter`);
return {
timeout: `${timeout}s`,
query: {
Expand All @@ -37,7 +41,8 @@ export const metricbeatErrorsQuery = ({
},
{
terms: {
'event.module': Object.values(products),
[errorQueryIsDataStream ? 'service.type' : 'event.module']:
Object.values(products),
},
},
{
Expand All @@ -54,7 +59,7 @@ export const metricbeatErrorsQuery = ({
},
},
aggs: {
errors_aggregation: errorsAggregation,
errors_aggregation: errorsAggregation(errorQueryIsDataStream),
},
};
};
Expand Down Expand Up @@ -82,11 +87,34 @@ const errorsByMetricset = {
},
};

const errorsAggregation = {
const errorsByDataStream = {
terms: {
field: 'event.module',
field: 'data_stream.dataset',
},
aggs: {
errors_by_dataset: errorsByMetricset,
latest_docs: {
top_hits: {
sort: [
{
'@timestamp': {
order: 'desc',
},
},
],
size: MAX_BUCKET_SIZE,
_source: {
includes: ['@timestamp', 'error', 'data_stream'],
},
},
},
},
};

/**
 * Builds the top-level terms aggregation for the errors query.
 *
 * When `errorQueryIsDataStream` is truthy the buckets are keyed on
 * `service.type` and the nested `errors_by_dataset` aggregation is
 * `errorsByDataStream`; otherwise they are keyed on `event.module` and the
 * nested aggregation is `errorsByMetricset`.
 */
const errorsAggregation = (errorQueryIsDataStream?: boolean) => {
  if (errorQueryIsDataStream) {
    return {
      terms: {
        field: 'service.type',
      },
      aggs: {
        errors_by_dataset: errorsByDataStream,
      },
    };
  }

  return {
    terms: {
      field: 'event.module',
    },
    aggs: {
      errors_by_dataset: errorsByMetricset,
    },
  };
};
25 changes: 22 additions & 3 deletions x-pack/plugins/monitoring/server/routes/api/v1/_health/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,14 @@
import type { LegacyRequest, MonitoringCore } from '../../../../types';
import type { MonitoringConfig } from '../../../../config';
import { createValidationFunction } from '../../../../lib/create_route_validation_function';
import { getIndexPatterns } from '../../../../lib/cluster/get_index_patterns';
import { getIndexPatterns, getDsIndexPattern } from '../../../../lib/cluster/get_index_patterns';
import { getHealthRequestQueryRT } from '../../../../../common/http_api/_health';
import type { TimeRange } from '../../../../../common/http_api/shared';

import { fetchMonitoredClusters } from './monitored_clusters';
import { fetchMetricbeatErrors } from './metricbeat';
import type { FetchParameters } from './types';
import { fetchPackageErrors } from './package/fetch_package_errors';

const DEFAULT_QUERY_TIMERANGE = { min: 'now-15m', max: 'now' };
const DEFAULT_QUERY_TIMEOUT_SECONDS = 15;
Expand Down Expand Up @@ -53,6 +54,14 @@ export function registerV1HealthRoute(server: MonitoringCore) {
getIndexPatterns({ config, moduleType: 'logstash' }),
getIndexPatterns({ config, moduleType: 'beats' }),
].join(',');

const metricsPackageIndex = [
getDsIndexPattern({ config, moduleType: 'elasticsearch' }),
getDsIndexPattern({ config, moduleType: 'kibana' }),
getDsIndexPattern({ config, moduleType: 'logstash' }),
getDsIndexPattern({ config, moduleType: 'beats' }),
].join(',');

const entSearchIndex = getIndexPatterns({ config, moduleType: 'enterprise_search' });

const monitoredClustersFn = () =>
Expand All @@ -74,12 +83,22 @@ export function registerV1HealthRoute(server: MonitoringCore) {
return { error: err.message };
});

const [monitoredClusters, metricbeatErrors] = await Promise.all([
const packageErrorsFn = () =>
fetchPackageErrors({
...fetchArgs,
packageIndex: metricsPackageIndex,
}).catch((err: Error) => {
logger.error(`_health: failed to retrieve package data:\n${err.stack}`);
return { error: err.message };
});

const [monitoredClusters, metricbeatErrors, packageErrors] = await Promise.all([
monitoredClustersFn(),
metricbeatErrorsFn(),
packageErrorsFn(),
]);

return { monitoredClusters, metricbeatErrors, settings };
return { monitoredClusters, metricbeatErrors, packageErrors, settings };
},
});
}
Expand Down
Loading

0 comments on commit 066ee1c

Please sign in to comment.