diff --git a/x-pack/plugins/alerting/server/lib/rule_execution_status.test.ts b/x-pack/plugins/alerting/server/lib/rule_execution_status.test.ts index 9eb66bbf7dbb3..1742554065f06 100644 --- a/x-pack/plugins/alerting/server/lib/rule_execution_status.test.ts +++ b/x-pack/plugins/alerting/server/lib/rule_execution_status.test.ts @@ -17,6 +17,7 @@ import { import { ErrorWithReason } from './error_with_reason'; import { translations } from '../constants/translations'; import { RuleRunMetrics, RuleRunMetricsStore } from './rule_run_metrics_store'; +import { RuleResultService } from '../monitoring/rule_result_service'; const MockLogger = loggingSystemMock.create().get(); const executionMetrics = { @@ -56,7 +57,10 @@ describe('RuleExecutionStatus', () => { describe('executionStatusFromState()', () => { test('empty task state', () => { const emptyRuleRunState = new RuleRunMetricsStore().getMetrics(); - const { status, metrics } = executionStatusFromState({ metrics: emptyRuleRunState }); + const { status, metrics } = executionStatusFromState({ + stateWithMetrics: { metrics: emptyRuleRunState }, + ruleResultService: new RuleResultService(), + }); checkDateIsNearNow(status.lastExecutionDate); expect(status.status).toBe('ok'); expect(status.error).toBe(undefined); @@ -67,8 +71,11 @@ describe('RuleExecutionStatus', () => { test('task state with no instances', () => { const { status, metrics } = executionStatusFromState({ - alertInstances: {}, - metrics: executionMetrics, + stateWithMetrics: { + alertInstances: {}, + metrics: executionMetrics, + }, + ruleResultService: new RuleResultService(), }); checkDateIsNearNow(status.lastExecutionDate); expect(status.status).toBe('ok'); @@ -80,8 +87,11 @@ describe('RuleExecutionStatus', () => { test('task state with one instance', () => { const { status, metrics } = executionStatusFromState({ - alertInstances: { a: {} }, - metrics: executionMetrics, + stateWithMetrics: { + alertInstances: { a: {} }, + metrics: executionMetrics, + }, + ruleResultService: new RuleResultService(), }); checkDateIsNearNow(status.lastExecutionDate); expect(status.status).toBe('active'); @@ -93,8 +103,11 @@ describe('RuleExecutionStatus', () => { test('task state with max executable actions warning', () => { const { status, metrics } = executionStatusFromState({ - alertInstances: { a: {} }, - metrics: { ...executionMetrics, triggeredActionsStatus: ActionsCompletion.PARTIAL }, + stateWithMetrics: { + alertInstances: { a: {} }, + metrics: { ...executionMetrics, triggeredActionsStatus: ActionsCompletion.PARTIAL }, + }, + ruleResultService: new RuleResultService(), }); checkDateIsNearNow(status.lastExecutionDate); expect(status.warning).toEqual({ @@ -112,12 +125,15 @@ describe('RuleExecutionStatus', () => { test('task state with max queued actions warning', () => { const { status, metrics } = executionStatusFromState({ - alertInstances: { a: {} }, - metrics: { - ...executionMetrics, - triggeredActionsStatus: ActionsCompletion.PARTIAL, - hasReachedQueuedActionsLimit: true, + stateWithMetrics: { + alertInstances: { a: {} }, + metrics: { + ...executionMetrics, + triggeredActionsStatus: ActionsCompletion.PARTIAL, + hasReachedQueuedActionsLimit: true, + }, }, + ruleResultService: new RuleResultService(), }); checkDateIsNearNow(status.lastExecutionDate); expect(status.warning).toEqual({ @@ -136,8 +152,11 @@ describe('RuleExecutionStatus', () => { test('task state with max alerts warning', () => { const { status, metrics } = executionStatusFromState({ - alertInstances: { a: {} }, - metrics: { ...executionMetrics, hasReachedAlertLimit: true }, + stateWithMetrics: { + alertInstances: { a: {} }, + metrics: { ...executionMetrics, hasReachedAlertLimit: true }, + }, + ruleResultService: new RuleResultService(), }); checkDateIsNearNow(status.lastExecutionDate); expect(status.warning).toEqual({ @@ -152,6 +171,23 @@ describe('RuleExecutionStatus', () => { hasReachedAlertLimit: true, }); }); + + test('task state with lastRun error', () => { + const ruleResultService = new RuleResultService(); + const lastRunSetters = ruleResultService.getLastRunSetters(); + lastRunSetters.addLastRunError('an error'); + + const { status } = executionStatusFromState({ + stateWithMetrics: { + alertInstances: {}, + metrics: executionMetrics, + }, + ruleResultService, + }); + expect(status.status).toBe('error'); + expect(status.error).toEqual({ message: 'an error', reason: 'unknown' }); + expect(status.warning).toBe(undefined); + }); }); describe('executionStatusFromError()', () => { diff --git a/x-pack/plugins/alerting/server/lib/rule_execution_status.ts b/x-pack/plugins/alerting/server/lib/rule_execution_status.ts index 2fea90c2410ff..4a8869fac452e 100644 --- a/x-pack/plugins/alerting/server/lib/rule_execution_status.ts +++ b/x-pack/plugins/alerting/server/lib/rule_execution_status.ts @@ -7,6 +7,7 @@ import { Logger } from '@kbn/core/server'; import { ActionsCompletion } from '@kbn/alerting-state-types'; +import { RuleResultService } from '../monitoring/rule_result_service'; import { RuleExecutionStatus, RuleExecutionStatusValues, @@ -14,6 +15,7 @@ import { RawRuleExecutionStatus, RawRule, Rule, + RuleExecutionStatusErrorReasons, } from '../types'; import { getReasonFromError } from './error_with_reason'; import { getEsErrorMessage } from './errors'; @@ -27,10 +29,15 @@ export interface IExecutionStatusAndMetrics { metrics: RuleRunMetrics | null; } -export function executionStatusFromState( - stateWithMetrics: RuleTaskStateAndMetrics, - lastExecutionDate?: Date -): IExecutionStatusAndMetrics { +export function executionStatusFromState({ + stateWithMetrics, + ruleResultService, + lastExecutionDate, +}: { + stateWithMetrics: RuleTaskStateAndMetrics; + ruleResultService: RuleResultService; + lastExecutionDate?: Date; +}): IExecutionStatusAndMetrics { const alertIds = Object.keys(stateWithMetrics.alertInstances ?? {}); let status: RuleExecutionStatuses = @@ -38,6 +45,8 @@ export function executionStatusFromState( // Check for warning states let warning = null; + let error = null; + // We only have a single warning field so prioritizing the alert circuit breaker over the actions circuit breaker if (stateWithMetrics.metrics.hasReachedAlertLimit) { status = RuleExecutionStatusValues[5]; @@ -60,11 +69,23 @@ export function executionStatusFromState( } } + // Overwrite status to be error if last run reported any errors + const { errors: errorsFromLastRun } = ruleResultService.getLastRunResults(); + if (errorsFromLastRun.length > 0) { + status = RuleExecutionStatusValues[2]; + // These errors are reported by ruleResultService.addLastRunError, therefore they are landed in successful execution map + error = { + reason: RuleExecutionStatusErrorReasons.Unknown, + message: errorsFromLastRun.join(','), + }; + } + return { status: { lastExecutionDate: lastExecutionDate ?? new Date(), status, ...(warning ? { warning } : {}), + ...(error ? { error } : {}), }, metrics: stateWithMetrics.metrics, }; diff --git a/x-pack/plugins/alerting/server/task_runner/task_runner.test.ts b/x-pack/plugins/alerting/server/task_runner/task_runner.test.ts index cd9ca14d07a1e..47f8a3a83716c 100644 --- a/x-pack/plugins/alerting/server/task_runner/task_runner.test.ts +++ b/x-pack/plugins/alerting/server/task_runner/task_runner.test.ts @@ -88,6 +88,8 @@ import { alertsClientMock } from '../alerts_client/alerts_client.mock'; import { MaintenanceWindow } from '../application/maintenance_window/types'; import { RULE_SAVED_OBJECT_TYPE } from '../saved_objects'; import { getErrorSource } from '@kbn/task-manager-plugin/server/task_running'; +import { RuleResultService } from '../monitoring/rule_result_service'; +import { ruleResultServiceMock } from '../monitoring/rule_result_service.mock'; jest.mock('uuid', () => ({ v4: () => '5f6aa57d-3e22-484e-bae8-cbed868f4d28', @@ -99,6 +101,8 @@ jest.mock('../lib/wrap_scoped_cluster_client', () => ({ jest.mock('../lib/alerting_event_logger/alerting_event_logger'); +jest.mock('../monitoring/rule_result_service'); + let fakeTimer: sinon.SinonFakeTimers; const logger: ReturnType = loggingSystemMock.createLogger(); @@ -106,6 +110,7 @@ const mockUsageCountersSetup = usageCountersServiceMock.createSetupContract(); const mockUsageCounter = mockUsageCountersSetup.createUsageCounter('test'); const alertingEventLogger = alertingEventLoggerMock.create(); const alertsClient = alertsClientMock.create(); +const ruleResultService = ruleResultServiceMock.create(); describe('Task Runner', () => { let mockedTaskInstance: ConcreteTaskInstance; @@ -245,6 +250,13 @@ describe('Task Runner', () => { ruleType.executor.mockResolvedValue({ state: {} }); actionsClient.bulkEnqueueExecution.mockResolvedValue({ errors: false, items: [] }); + + ruleResultService.getLastRunResults.mockImplementation(() => ({ + errors: [], + warnings: [], + outcomeMessage: '', + })); + (RuleResultService as jest.Mock).mockImplementation(() => ruleResultService); }); test('successfully executes the task', async () => { @@ -2743,7 +2755,6 @@ describe('Task Runner', () => { const taskRunner = new TaskRunner({ ruleType, taskInstance: mockedTaskInstance, - context: taskRunnerFactoryInitializerParams, inMemoryMetrics, }); @@ -3285,6 +3296,39 @@ describe('Task Runner', () => { expect(mockUsageCounter.incrementCounter).not.toHaveBeenCalled(); }); + test('reports error to eventLogger when ruleResultService.addLastRunError adds an error', async () => { + rulesClient.getAlertFromRaw.mockReturnValue(mockedRuleTypeSavedObject as Rule); + encryptedSavedObjectsClient.getDecryptedAsInternalUser.mockResolvedValue(mockedRawRuleSO); + + const taskRunner = new TaskRunner({ + ruleType, + taskInstance: mockedTaskInstance, + context: taskRunnerFactoryInitializerParams, + inMemoryMetrics, + }); + + ruleResultService.getLastRunResults.mockImplementation(() => ({ + errors: ['an error occurred'], + warnings: [], + outcomeMessage: '', + })); + + const runnerResult = await taskRunner.run(); + + expect(inMemoryMetrics.increment).toHaveBeenCalledTimes(2); + expect(inMemoryMetrics.increment.mock.calls[0][0]).toBe(IN_MEMORY_METRICS.RULE_EXECUTIONS); + expect(inMemoryMetrics.increment.mock.calls[1][0]).toBe(IN_MEMORY_METRICS.RULE_FAILURES); + + testAlertingEventLogCalls({ + status: 'error', + softErrorFromLastRun: true, + errorMessage: 'an error occurred', + errorReason: 'unknown', + }); + + expect(getErrorSource(runnerResult.taskRunError as Error)).toBe(TaskErrorSource.FRAMEWORK); + }); + function testAlertingEventLogCalls({ ruleContext = alertingEventLoggerInitializer, activeAlerts = 0, @@ -3302,6 +3346,7 @@ describe('Task Runner', () => { logAction = 0, hasReachedAlertLimit = false, hasReachedQueuedActionsLimit = false, + softErrorFromLastRun = false, }: { status: string; ruleContext?: RuleContextOpts; @@ -3319,6 +3364,7 @@ describe('Task Runner', () => { errorMessage?: string; hasReachedAlertLimit?: boolean; hasReachedQueuedActionsLimit?: boolean; + softErrorFromLastRun?: boolean; }) { expect(alertingEventLogger.initialize).toHaveBeenCalledWith(ruleContext); if (status !== 'skip') { @@ -3338,27 +3384,42 @@ describe('Task Runner', () => { ); } if (status === 'error') { - expect(alertingEventLogger.done).toHaveBeenCalledWith({ - metrics: null, - status: { - lastExecutionDate: new Date('1970-01-01T00:00:00.000Z'), - status, - error: { - message: errorMessage, - reason: errorReason, + if (softErrorFromLastRun) { + expect(alertingEventLogger.done).toHaveBeenCalledWith( + expect.objectContaining({ + status: { + lastExecutionDate: new Date('1970-01-01T00:00:00.000Z'), + status, + error: { + message: errorMessage, + reason: errorReason, + }, + }, + }) + ); + } else { + expect(alertingEventLogger.done).toHaveBeenCalledWith({ + metrics: null, + status: { + lastExecutionDate: new Date('1970-01-01T00:00:00.000Z'), + status, + error: { + message: errorMessage, + reason: errorReason, + }, }, - }, - timings: { - claim_to_start_duration_ms: 0, - persist_alerts_duration_ms: 0, - prepare_rule_duration_ms: 0, - process_alerts_duration_ms: 0, - process_rule_duration_ms: 0, - rule_type_run_duration_ms: 0, - total_run_duration_ms: 0, - trigger_actions_duration_ms: 0, - }, - }); + timings: { + claim_to_start_duration_ms: 0, + persist_alerts_duration_ms: 0, + prepare_rule_duration_ms: 0, + process_alerts_duration_ms: 0, + process_rule_duration_ms: 0, + rule_type_run_duration_ms: 0, + total_run_duration_ms: 0, + trigger_actions_duration_ms: 0, + }, + }); + } } else if (status === 'warning') { expect(alertingEventLogger.done).toHaveBeenCalledWith({ metrics: { diff --git a/x-pack/plugins/alerting/server/task_runner/task_runner.ts b/x-pack/plugins/alerting/server/task_runner/task_runner.ts index 2831ad0657b61..28922187f902c 100644 --- a/x-pack/plugins/alerting/server/task_runner/task_runner.ts +++ b/x-pack/plugins/alerting/server/task_runner/task_runner.ts @@ -13,12 +13,19 @@ import { Logger } from '@kbn/core/server'; import { ConcreteTaskInstance, createTaskRunError, + TaskErrorSource, throwUnrecoverableError, } from '@kbn/task-manager-plugin/server'; import { nanosToMillis } from '@kbn/event-log-plugin/server'; import { getErrorSource } from '@kbn/task-manager-plugin/server/task_running'; import { ExecutionHandler, RunResult } from './execution_handler'; -import { TaskRunnerContext } from './types'; +import { + RuleTaskInstance, + RuleTaskRunResult, + RuleTaskStateAndMetrics, + RunRuleParams, + TaskRunnerContext, +} from './types'; import { getExecutorServices } from './get_executor_services'; import { ElasticsearchError, @@ -56,12 +63,6 @@ import { import { NormalizedRuleType, UntypedNormalizedRuleType } from '../rule_type_registry'; import { getEsErrorMessage } from '../lib/errors'; import { IN_MEMORY_METRICS, InMemoryMetrics } from '../monitoring'; -import { - RuleTaskInstance, - RuleTaskRunResult, - RuleTaskStateAndMetrics, - RunRuleParams, -} from './types'; import { IExecutionStatusAndMetrics } from '../lib/rule_execution_status'; import { RuleRunMetricsStore } from '../lib/rule_run_metrics_store'; import { AlertingEventLogger } from '../lib/alerting_event_logger/alerting_event_logger'; @@ -72,7 +73,7 @@ import { ILastRun, lastRunFromState, lastRunToRaw } from '../lib/last_run_status import { RunningHandler } from './running_handler'; import { RuleResultService } from '../monitoring/rule_result_service'; import { MaintenanceWindow } from '../application/maintenance_window/types'; -import { getMaintenanceWindows, filterMaintenanceWindowsIds } from './get_maintenance_windows'; +import { filterMaintenanceWindowsIds, getMaintenanceWindows } from './get_maintenance_windows'; import { RuleTypeRunner } from './rule_type_runner'; import { initializeAlertsClient } from '../alerts_client'; @@ -574,7 +575,11 @@ export class TaskRunner< >( stateWithMetrics, (ruleRunStateWithMetrics) => - executionStatusFromState(ruleRunStateWithMetrics, this.runDate), + executionStatusFromState({ + stateWithMetrics: ruleRunStateWithMetrics, + lastExecutionDate: this.runDate, + ruleResultService: this.ruleResult, + }), (err: ElasticsearchError) => executionStatusFromError(err, this.runDate) ); @@ -702,11 +707,23 @@ export class TaskRunner< }; const getTaskRunError = (state: Result) => { - return isErr(state) - ? { - taskRunError: createTaskRunError(state.error, getErrorSource(state.error)), - } - : {}; + if (isErr(state)) { + return { + taskRunError: createTaskRunError(state.error, getErrorSource(state.error)), + }; + } + + const { errors: errorsFromLastRun } = this.ruleResult.getLastRunResults(); + if (errorsFromLastRun.length > 0) { + return { + taskRunError: createTaskRunError( + new Error(errorsFromLastRun.join(',')), + TaskErrorSource.FRAMEWORK + ), + }; + } + + return {}; }; return {