Skip to content

Commit

Permalink
[Detection Engine] Addresses Flakiness in ML FTR tests (#188155)
Browse files Browse the repository at this point in the history
## Summary

The full chronicle of this endeavor can be found in
[#182183](https://github.com/elastic/kibana/issues/182183); a comment in
that thread summarizes the identified issue:

> I [finally
found](https://buildkite.com/elastic/kibana-flaky-test-suite-runner/builds/6516#01909dde-a3e8-4e47-b255-b1ff7cac8f8d/6-2368)
the cause of these failures in the response to our "setup modules"
request to ML. Attaching here for posterity:
>
> <details>
> <summary>Setup Modules Failure Response</summary>
> 
> ```json
> {
>   "jobs": [
> { "id": "v3_linux_anomalous_network_port_activity", "success": true },
>     {
>       "id": "v3_linux_anomalous_network_activity",
>       "success": false,
>       "error": {
>         "error": {
>           "root_cause": [
>             {
>               "type": "no_shard_available_action_exception",
> "reason":
"[ftr][127.0.0.1:9300][indices:data/read/search[phase/query]]"
>             }
>           ],
>           "type": "search_phase_execution_exception",
>           "reason": "all shards failed",
>           "phase": "query",
>           "grouped": true,
>           "failed_shards": [
>             {
>               "shard": 0,
> "index":
".ml-anomalies-custom-v3_linux_network_configuration_discovery",
>               "node": "dKzpvp06ScO0OxqHilETEA",
>               "reason": {
>                 "type": "no_shard_available_action_exception",
> "reason":
"[ftr][127.0.0.1:9300][indices:data/read/search[phase/query]]"
>               }
>             }
>           ]
>         },
>         "status": 503
>       }
>     }
>   ],
>   "datafeeds": [
>     {
>       "id": "datafeed-v3_linux_anomalous_network_port_activity",
>       "success": true,
>       "started": false,
>       "awaitingMlNodeAllocation": false
>     },
>     {
>       "id": "datafeed-v3_linux_anomalous_network_activity",
>       "success": false,
>       "started": false,
>       "awaitingMlNodeAllocation": false,
>       "error": {
>         "error": {
>           "root_cause": [
>             {
>               "type": "resource_not_found_exception",
> "reason": "No known job with id 'v3_linux_anomalous_network_activity'"
>             }
>           ],
>           "type": "resource_not_found_exception",
> "reason": "No known job with id 'v3_linux_anomalous_network_activity'"
>         },
>         "status": 404
>       }
>     }
>   ],
>   "kibana": {}
> }
> 
> ```
> </details>

This branch, then, fixes said issue by (relatively simply) retrying the
failed API call until it succeeds.

### Related Issues
Addresses:
- #171426
- #187478
- #187614
- #182009
- #171426

### Checklist

- [x] [Unit or functional
tests](https://www.elastic.co/guide/en/kibana/master/development-tests.html)
were updated or added to match the most common scenarios
- [x] [Flaky Test
Runner](https://ci-stats.kibana.dev/trigger_flaky_test_runner/1) was
used on any tests changed
- [x] [ESS Rule Execution FTR x
200](https://buildkite.com/elastic/kibana-flaky-test-suite-runner/builds/6528)
- [x] [Serverless Rule Execution FTR x
200](https://buildkite.com/elastic/kibana-flaky-test-suite-runner/builds/6529)


### For maintainers

- [x] This was checked for breaking API changes and was [labeled
appropriately](https://www.elastic.co/guide/en/kibana/master/contributing.html#kibana-release-notes-process)
  • Loading branch information
rylnd authored Jul 12, 2024
1 parent 4c0db61 commit 3df635e
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,12 @@ import {
importFile,
} from '../../../../../lists_and_exception_lists/utils';
import {
executeSetupModuleRequest,
forceStartDatafeeds,
getAlerts,
getPreviewAlerts,
previewRule,
previewRuleWithExceptionEntries,
setupMlModulesWithRetry,
} from '../../../../utils';
import {
createRule,
Expand Down Expand Up @@ -86,13 +86,12 @@ export default ({ getService }: FtrProviderContext) => {
rule_id: 'ml-rule-id',
};

// FLAKY: https://github.com/elastic/kibana/issues/171426
describe.skip('@ess @serverless @serverlessQA Machine learning type rules', () => {
describe('@ess @serverless @serverlessQA Machine learning type rules', () => {
before(async () => {
// Order is critical here: auditbeat data must be loaded before attempting to start the ML job,
// as the job looks for certain indices on start
await esArchiver.load(auditPath);
await executeSetupModuleRequest({ module: siemModule, rspCode: 200, supertest });
await setupMlModulesWithRetry({ module: siemModule, supertest, retry });
await forceStartDatafeeds({ jobId: mlJobId, rspCode: 200, supertest });
await esArchiver.load('x-pack/test/functional/es_archives/security_solution/anomalies');
});
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ import { EsArchivePathBuilder } from '../../../../../../es_archive_path_builder'
import { FtrProviderContext } from '../../../../../../ftr_provider_context';
import {
dataGeneratorFactory,
executeSetupModuleRequest,
forceStartDatafeeds,
getAlerts,
getOpenAlerts,
Expand All @@ -36,6 +35,7 @@ import {
previewRule,
previewRuleWithExceptionEntries,
setAlertStatus,
setupMlModulesWithRetry,
} from '../../../../utils';
import {
createRule,
Expand All @@ -51,6 +51,7 @@ export default ({ getService }: FtrProviderContext) => {
const es = getService('es');
const log = getService('log');
const config = getService('config');
const retry = getService('retry');

const isServerless = config.get('serverless');
const dataPathBuilder = new EsArchivePathBuilder(isServerless);
Expand Down Expand Up @@ -93,7 +94,7 @@ export default ({ getService }: FtrProviderContext) => {
// Order is critical here: auditbeat data must be loaded before attempting to start the ML job,
// as the job looks for certain indices on start
await esArchiver.load(auditbeatArchivePath);
await executeSetupModuleRequest({ module: mlModuleName, rspCode: 200, supertest });
await setupMlModulesWithRetry({ module: mlModuleName, retry, supertest });
await forceStartDatafeeds({ jobId: mlJobId, rspCode: 200, supertest });
await esArchiver.load('x-pack/test/functional/es_archives/security_solution/anomalies');
await deleteAllAnomalies(log, es);
Expand All @@ -112,8 +113,7 @@ export default ({ getService }: FtrProviderContext) => {
await deleteAllAnomalies(log, es);
});

// FLAKY: https://github.com/elastic/kibana/issues/187478
describe.skip('with per-execution suppression duration', () => {
describe('with per-execution suppression duration', () => {
beforeEach(() => {
ruleProps = {
...baseRuleProps,
Expand Down Expand Up @@ -245,8 +245,7 @@ export default ({ getService }: FtrProviderContext) => {
});
});

// FLAKY: https://github.com/elastic/kibana/issues/187614
describe.skip('with interval suppression duration', () => {
describe('with interval suppression duration', () => {
beforeEach(() => {
ruleProps = {
...baseRuleProps,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,18 @@
*/

import type SuperTest from 'supertest';
import { RetryService } from '@kbn/ftr-common-functional-services';
import { ML_GROUP_ID } from '@kbn/security-solution-plugin/common/constants';
import { getCommonRequestHeader } from '../../../../../functional/services/ml/common_api';

/**
 * One entry of the `jobs` array in the module setup response, as used in the
 * return type of `executeSetupModuleRequest`.
 */
interface ModuleJob {
/** Identifier of the job. */
id: string;
/** Whether this job was set up successfully. */
success: boolean;
/**
 * Error details for the job. Optional — presumably only populated when
 * `success` is false (TODO confirm against the ML API response).
 */
error?: {
/** Status number of the error; `setupMlModulesWithRetry` compares it against 500. */
status: number;
};
}

export const executeSetupModuleRequest = async ({
module,
rspCode,
Expand All @@ -17,7 +26,7 @@ export const executeSetupModuleRequest = async ({
module: string;
rspCode: number;
supertest: SuperTest.Agent;
}) => {
}): Promise<{ jobs: ModuleJob[] }> => {
const { body } = await supertest
.post(`/internal/ml/modules/setup/${module}`)
.set(getCommonRequestHeader('1'))
Expand All @@ -34,6 +43,35 @@ export const executeSetupModuleRequest = async ({
return body;
};

/**
 * Sets up the given ML module, wrapping the setup request in `retry.try`.
 *
 * The wrapped callback sends the request expecting a 200 response and then
 * inspects every job in the returned `jobs` array. A job is considered
 * acceptable when it reports `success`, or when it failed with an error status
 * below 500. If any job is not acceptable (for example a 503 failure), the
 * callback throws an error that embeds the serialized response, which lets
 * `retry.try` attempt the request again.
 *
 * @param module - name of the ML module to set up
 * @param retry - retry service whose `try` method wraps the request
 * @param supertest - agent used to send the setup request
 * @returns whatever `retry.try` yields for the callback, i.e. the setup
 *   response once all of its jobs are acceptable
 */
export const setupMlModulesWithRetry = async ({
  module,
  retry,
  supertest,
}: {
  module: string;
  retry: RetryService;
  supertest: SuperTest.Agent;
}) =>
  retry.try(async () => {
    const setupResponse = await executeSetupModuleRequest({ module, rspCode: 200, supertest });

    const everyJobAcceptable = setupResponse?.jobs.every(({ success, error }) => {
      if (success) {
        return true;
      }
      // Failures with a status below 500 are tolerated; only 5xx triggers a retry
      const status = error?.status;
      return status && status < 500;
    });

    if (everyJobAcceptable) {
      return setupResponse;
    }

    throw new Error(
      `Expected all jobs to set up successfully, but got ${JSON.stringify(setupResponse)}`
    );
  });

export const forceStartDatafeeds = async ({
jobId,
rspCode,
Expand Down

0 comments on commit 3df635e

Please sign in to comment.