Skip to content

Commit

Permalink
Retry failed ML Job setup in DE FTR tests
Browse files Browse the repository at this point in the history
The flakiness here ends up being caused by sporadic unavailability of
shards during module setup. The underlying cause of that unavailability
is likely a race condition between ML, ES, and/or FTR, but luckily we
don't need to worry about that because simply retrying the API call
causes it to eventually succeed.

On those retries, some of the jobs will report a 4xx status, but that's
expected, so job errors with a status below 500 are tolerated.

This is the result of a lot of prodding and CPU cycles on CI; see elastic#182183 for
the full details.
  • Loading branch information
rylnd committed Jul 11, 2024
1 parent 48e2b57 commit d63a91e
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,12 @@ import {
importFile,
} from '../../../../../lists_and_exception_lists/utils';
import {
executeSetupModuleRequest,
forceStartDatafeeds,
getAlerts,
getPreviewAlerts,
previewRule,
previewRuleWithExceptionEntries,
setupMlModulesWithRetry,
} from '../../../../utils';
import {
createRule,
Expand Down Expand Up @@ -92,7 +92,7 @@ export default ({ getService }: FtrProviderContext) => {
// Order is critical here: auditbeat data must be loaded before attempting to start the ML job,
// as the job looks for certain indices on start
await esArchiver.load(auditPath);
await executeSetupModuleRequest({ module: siemModule, rspCode: 200, supertest });
await setupMlModulesWithRetry({ module: siemModule, supertest, retry });
await forceStartDatafeeds({ jobId: mlJobId, rspCode: 200, supertest });
await esArchiver.load('x-pack/test/functional/es_archives/security_solution/anomalies');
});
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ import { EsArchivePathBuilder } from '../../../../../../es_archive_path_builder'
import { FtrProviderContext } from '../../../../../../ftr_provider_context';
import {
dataGeneratorFactory,
executeSetupModuleRequest,
forceStartDatafeeds,
getAlerts,
getOpenAlerts,
Expand All @@ -36,6 +35,7 @@ import {
previewRule,
previewRuleWithExceptionEntries,
setAlertStatus,
setupMlModulesWithRetry,
} from '../../../../utils';
import {
createRule,
Expand All @@ -51,6 +51,7 @@ export default ({ getService }: FtrProviderContext) => {
const es = getService('es');
const log = getService('log');
const config = getService('config');
const retry = getService('retry');

const isServerless = config.get('serverless');
const dataPathBuilder = new EsArchivePathBuilder(isServerless);
Expand Down Expand Up @@ -93,7 +94,7 @@ export default ({ getService }: FtrProviderContext) => {
// Order is critical here: auditbeat data must be loaded before attempting to start the ML job,
// as the job looks for certain indices on start
await esArchiver.load(auditbeatArchivePath);
await executeSetupModuleRequest({ module: mlModuleName, rspCode: 200, supertest });
await setupMlModulesWithRetry({ module: mlModuleName, retry, supertest });
await forceStartDatafeeds({ jobId: mlJobId, rspCode: 200, supertest });
await esArchiver.load('x-pack/test/functional/es_archives/security_solution/anomalies');
await deleteAllAnomalies(log, es);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,18 @@
*/

import type SuperTest from 'supertest';
import { RetryService } from '@kbn/ftr-common-functional-services';
import { ML_GROUP_ID } from '@kbn/security-solution-plugin/common/constants';
import { getCommonRequestHeader } from '../../../../../functional/services/ml/common_api';

/**
 * Shape of a single entry in the `jobs` array returned by
 * `executeSetupModuleRequest`.
 */
interface ModuleJob {
// Job identifier — presumably the ML job id; confirm against the ML setup API.
id: string;
// Whether this job was reported as set up successfully.
success: boolean;
// Optional error details; `setupMlModulesWithRetry` reads this when `success` is false.
error?: {
// Compared against 500 by `setupMlModulesWithRetry`; presumably an HTTP status code — confirm.
status: number;
};
}

export const executeSetupModuleRequest = async ({
module,
rspCode,
Expand All @@ -17,7 +26,7 @@ export const executeSetupModuleRequest = async ({
module: string;
rspCode: number;
supertest: SuperTest.Agent;
}) => {
}): Promise<{ jobs: ModuleJob[] }> => {
const { body } = await supertest
.post(`/internal/ml/modules/setup/${module}`)
.set(getCommonRequestHeader('1'))
Expand All @@ -34,6 +43,33 @@ export const executeSetupModuleRequest = async ({
return body;
};

/**
 * Sets up the given ML module via `executeSetupModuleRequest`, wrapped in
 * `retry.try` so that the request is attempted again when a job fails to set up.
 *
 * A job is considered acceptable when it reports `success`, or when it reports
 * an error whose status is below 500. Any other outcome makes the attempt
 * throw, with the full setup response included in the error message.
 *
 * @param module - Name of the ML module to set up.
 * @param retry - FTR retry service used to re-run the setup request.
 * @param supertest - Agent used to issue the setup request.
 * @returns The setup response (including its `jobs`) from the first acceptable attempt.
 */
export const setupMlModulesWithRetry = async ({
  module,
  retry,
  supertest,
}: {
  module: string;
  retry: RetryService;
  supertest: SuperTest.Agent;
}) =>
  retry.try(async () => {
    const response = await executeSetupModuleRequest({
      module,
      rspCode: 200,
      supertest,
    });

    // Jobs with an error status below 500 are tolerated; anything else fails the attempt.
    const allJobsSucceeded = response?.jobs.every((job) => {
      return job.success || (job.error?.status && job.error.status < 500);
    });

    if (!allJobsSucceeded) {
      // Report the actual response. The previous message referenced an
      // undeclared `body` identifier, which masked the real failure.
      // NOTE(review): throwing presumably makes `retry.try` run the callback again — confirm.
      throw new Error(
        `Expected all jobs to set up successfully, but got ${JSON.stringify(response)}`
      );
    }

    return response;
  });

export const forceStartDatafeeds = async ({
jobId,
rspCode,
Expand Down

0 comments on commit d63a91e

Please sign in to comment.