Skip to content

Commit

Permalink
[8.6] Reduce startup time by skipping update mappings step when possi…
Browse files Browse the repository at this point in the history
…ble (#145604) (#146637)

# Backport

This will backport the following commits from `main` to `8.6`:
- [Reduce startup time by skipping update mappings step when possible
(#145604)](#145604)

<!--- Backport version: 8.9.7 -->

### Questions ?
Please refer to the [Backport tool
documentation](https://github.com/sqren/backport)

<!--BACKPORT [{"author":{"name":"Gerard
Soldevila","email":"gerard.soldevila@elastic.co"},"sourceCommit":{"committedDate":"2022-11-28T14:34:58Z","message":"Reduce
startup time by skipping update mappings step when possible
(#145604)\n\nThe goal of this PR is to reduce the startup times of
Kibana server by\r\nimproving the migration logic.\r\n\r\nFixes
https://github.com/elastic/kibana/issues/145743\r\nRelated
https://github.com/elastic/kibana/issues/144035)\r\n\r\nThe migration
logic is run systematically at startup, whether the\r\ncustomers are
upgrading or not.\r\nHistorically, these steps have been very quick, but
we recently found\r\nout about some customers that have more than **one
million** Saved\r\nObjects stored, making the overall startup process
slow, even when there\r\nare no migrations to perform.\r\n\r\nThis PR
specifically targets the case where there are no migrations
to\r\nperform, aka a Kibana node is started against an ES cluster that
is\r\nalready up to date wrt stack version and list of
plugins.\r\n\r\nIn this scenario, we aim at skipping the
`UPDATE_TARGET_MAPPINGS` step\r\nof the migration logic, which
internally runs the\r\n`updateAndPickupMappings` method, which turns out
to be expensive if the\r\nsystem indices contain lots of
SO.\r\n\r\n\r\nI locally tested the following scenarios too:\r\n\r\n-
**Fresh install.** The step is not even run, as the `.kibana`
index\r\ndid not exist ✅\r\n- **Stack version + list of plugins up to
date.** Simply restarting\r\nKibana after the fresh install. The step is
run and leads to `DONE`, as\r\nthe md5 hashes match those stored in
`.kibana._mapping._meta` ✅\r\n- **Faking re-enabling an old plugin.** I
manually removed one of the\r\nMD5 hashes from the stored
.kibana._mapping._meta through `curl`, and\r\nthen restarted Kibana. The
step is run and leads to\r\n`UPDATE_TARGET_MAPPINGS` as it used to
before the PR ✅\r\n- **Faking updating a plugin.** Same as the previous
one, but altering\r\nan existing md5 stored in the metas. ✅\r\n\r\nAnd
that is the curl command used to tamper with the stored
_meta:\r\n```bash\r\ncurl -X PUT
\"kibana:changeme@localhost:9200/.kibana/_mapping?pretty\" -H
'Content-Type: application/json' -d'\r\n{\r\n \"_meta\": {\r\n
\"migrationMappingPropertyHashes\": {\r\n \"references\":
\"7997cf5a56cc02bdc9c93361bde732b0\",\r\n }\r\n
}\r\n}\r\n'\r\n```","sha":"b1e18a0414ed99456706119d15173b687c6e7366","branchLabelMapping":{"^v8.7.0$":"main","^v(\\d+).(\\d+).\\d+$":"$1.$2"}},"sourcePullRequest":{"labels":["Team:Core","enhancement","release_note:skip","Feature:Migrations","backport:prev-minor","v8.7.0"],"number":145604,"url":"https://github.com/elastic/kibana/pull/145604","mergeCommit":{"message":"Reduce
startup time by skipping update mappings step when possible
(#145604)\n\nThe goal of this PR is to reduce the startup times of
Kibana server by\r\nimproving the migration logic.\r\n\r\nFixes
https://github.com/elastic/kibana/issues/145743\r\nRelated
https://github.com/elastic/kibana/issues/144035)\r\n\r\nThe migration
logic is run systematically at startup, whether the\r\ncustomers are
upgrading or not.\r\nHistorically, these steps have been very quick, but
we recently found\r\nout about some customers that have more than **one
million** Saved\r\nObjects stored, making the overall startup process
slow, even when there\r\nare no migrations to perform.\r\n\r\nThis PR
specifically targets the case where there are no migrations
to\r\nperform, aka a Kibana node is started against an ES cluster that
is\r\nalready up to date wrt stack version and list of
plugins.\r\n\r\nIn this scenario, we aim at skipping the
`UPDATE_TARGET_MAPPINGS` step\r\nof the migration logic, which
internally runs the\r\n`updateAndPickupMappings` method, which turns out
to be expensive if the\r\nsystem indices contain lots of
SO.\r\n\r\n\r\nI locally tested the following scenarios too:\r\n\r\n-
**Fresh install.** The step is not even run, as the `.kibana`
index\r\ndid not exist ✅\r\n- **Stack version + list of plugins up to
date.** Simply restarting\r\nKibana after the fresh install. The step is
run and leads to `DONE`, as\r\nthe md5 hashes match those stored in
`.kibana._mapping._meta` ✅\r\n- **Faking re-enabling an old plugin.** I
manually removed one of the\r\nMD5 hashes from the stored
.kibana._mapping._meta through `curl`, and\r\nthen restarted Kibana. The
step is run and leads to\r\n`UPDATE_TARGET_MAPPINGS` as it used to
before the PR ✅\r\n- **Faking updating a plugin.** Same as the previous
one, but altering\r\nan existing md5 stored in the metas. ✅\r\n\r\nAnd
that is the curl command used to tamper with the stored
_meta:\r\n```bash\r\ncurl -X PUT
\"kibana:changeme@localhost:9200/.kibana/_mapping?pretty\" -H
'Content-Type: application/json' -d'\r\n{\r\n \"_meta\": {\r\n
\"migrationMappingPropertyHashes\": {\r\n \"references\":
\"7997cf5a56cc02bdc9c93361bde732b0\",\r\n }\r\n
}\r\n}\r\n'\r\n```","sha":"b1e18a0414ed99456706119d15173b687c6e7366"}},"sourceBranch":"main","suggestedTargetBranches":[],"targetPullRequestStates":[{"branch":"main","label":"v8.7.0","labelRegex":"^v8.7.0$","isSourceBranch":true,"state":"MERGED","url":"https://github.com/elastic/kibana/pull/145604","number":145604,"mergeCommit":{"message":"Reduce
startup time by skipping update mappings step when possible
(#145604)\n\nThe goal of this PR is to reduce the startup times of
Kibana server by\r\nimproving the migration logic.\r\n\r\nFixes
https://github.com/elastic/kibana/issues/145743\r\nRelated
https://github.com/elastic/kibana/issues/144035)\r\n\r\nThe migration
logic is run systematically at startup, whether the\r\ncustomers are
upgrading or not.\r\nHistorically, these steps have been very quick, but
we recently found\r\nout about some customers that have more than **one
million** Saved\r\nObjects stored, making the overall startup process
slow, even when there\r\nare no migrations to perform.\r\n\r\nThis PR
specifically targets the case where there are no migrations
to\r\nperform, aka a Kibana node is started against an ES cluster that
is\r\nalready up to date wrt stack version and list of
plugins.\r\n\r\nIn this scenario, we aim at skipping the
`UPDATE_TARGET_MAPPINGS` step\r\nof the migration logic, which
internally runs the\r\n`updateAndPickupMappings` method, which turns out
to be expensive if the\r\nsystem indices contain lots of
SO.\r\n\r\n\r\nI locally tested the following scenarios too:\r\n\r\n-
**Fresh install.** The step is not even run, as the `.kibana`
index\r\ndid not exist ✅\r\n- **Stack version + list of plugins up to
date.** Simply restarting\r\nKibana after the fresh install. The step is
run and leads to `DONE`, as\r\nthe md5 hashes match those stored in
`.kibana._mapping._meta` ✅\r\n- **Faking re-enabling an old plugin.** I
manually removed one of the\r\nMD5 hashes from the stored
.kibana._mapping._meta through `curl`, and\r\nthen restarted Kibana. The
step is run and leads to\r\n`UPDATE_TARGET_MAPPINGS` as it used to
before the PR ✅\r\n- **Faking updating a plugin.** Same as the previous
one, but altering\r\nan existing md5 stored in the metas. ✅\r\n\r\nAnd
that is the curl command used to tamper with the stored
_meta:\r\n```bash\r\ncurl -X PUT
\"kibana:changeme@localhost:9200/.kibana/_mapping?pretty\" -H
'Content-Type: application/json' -d'\r\n{\r\n \"_meta\": {\r\n
\"migrationMappingPropertyHashes\": {\r\n \"references\":
\"7997cf5a56cc02bdc9c93361bde732b0\",\r\n }\r\n
}\r\n}\r\n'\r\n```","sha":"b1e18a0414ed99456706119d15173b687c6e7366"}}]}]
BACKPORT-->
  • Loading branch information
gsoldevila authored Nov 30, 2022
1 parent c4c316e commit 42bf33f
Show file tree
Hide file tree
Showing 16 changed files with 812 additions and 73 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ export {
cloneIndex,
waitForTask,
updateAndPickupMappings,
updateTargetMappingsMeta,
updateAliases,
transformDocs,
setWriteBlock,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,51 +21,63 @@
- [LEGACY_DELETE](#legacy_delete)
- [Next action](#next-action-6)
- [New control state](#new-control-state-6)
- [WAIT_FOR_YELLOW_SOURCE](#wait_for_yellow_source)
- [WAIT_FOR_MIGRATION_COMPLETION](#wait_for_migration_completion)
- [Next action](#next-action-7)
- [New control state](#new-control-state-7)
- [SET_SOURCE_WRITE_BLOCK](#set_source_write_block)
- [WAIT_FOR_YELLOW_SOURCE](#wait_for_yellow_source)
- [Next action](#next-action-8)
- [New control state](#new-control-state-8)
- [CREATE_REINDEX_TEMP](#create_reindex_temp)
- [SET_SOURCE_WRITE_BLOCK](#set_source_write_block)
- [Next action](#next-action-9)
- [New control state](#new-control-state-9)
- [REINDEX_SOURCE_TO_TEMP_OPEN_PIT](#reindex_source_to_temp_open_pit)
- [CREATE_REINDEX_TEMP](#create_reindex_temp)
- [Next action](#next-action-10)
- [New control state](#new-control-state-10)
- [REINDEX_SOURCE_TO_TEMP_READ](#reindex_source_to_temp_read)
- [REINDEX_SOURCE_TO_TEMP_OPEN_PIT](#reindex_source_to_temp_open_pit)
- [Next action](#next-action-11)
- [New control state](#new-control-state-11)
- [REINDEX_SOURCE_TO_TEMP_TRANSFORM](#REINDEX_SOURCE_TO_TEMP_TRANSFORM)
- [REINDEX_SOURCE_TO_TEMP_READ](#reindex_source_to_temp_read)
- [Next action](#next-action-12)
- [New control state](#new-control-state-12)
- [REINDEX_SOURCE_TO_TEMP_INDEX_BULK](#reindex_source_to_temp_index_bulk)
- [REINDEX_SOURCE_TO_TEMP_TRANSFORM](#reindex_source_to_temp_transform)
- [Next action](#next-action-13)
- [New control state](#new-control-state-13)
- [REINDEX_SOURCE_TO_TEMP_CLOSE_PIT](#reindex_source_to_temp_close_pit)
- [REINDEX_SOURCE_TO_TEMP_INDEX_BULK](#reindex_source_to_temp_index_bulk)
- [Next action](#next-action-14)
- [New control state](#new-control-state-14)
- [SET_TEMP_WRITE_BLOCK](#set_temp_write_block)
- [REINDEX_SOURCE_TO_TEMP_CLOSE_PIT](#reindex_source_to_temp_close_pit)
- [Next action](#next-action-15)
- [New control state](#new-control-state-15)
- [CLONE_TEMP_TO_TARGET](#clone_temp_to_target)
- [SET_TEMP_WRITE_BLOCK](#set_temp_write_block)
- [Next action](#next-action-16)
- [New control state](#new-control-state-16)
- [OUTDATED_DOCUMENTS_SEARCH](#outdated_documents_search)
- [CLONE_TEMP_TO_TARGET](#clone_temp_to_target)
- [Next action](#next-action-17)
- [New control state](#new-control-state-17)
- [OUTDATED_DOCUMENTS_TRANSFORM](#outdated_documents_transform)
- [OUTDATED_DOCUMENTS_SEARCH](#outdated_documents_search)
- [Next action](#next-action-18)
- [New control state](#new-control-state-18)
- [UPDATE_TARGET_MAPPINGS](#update_target_mappings)
- [OUTDATED_DOCUMENTS_TRANSFORM](#outdated_documents_transform)
- [Next action](#next-action-19)
- [New control state](#new-control-state-19)
- [UPDATE_TARGET_MAPPINGS_WAIT_FOR_TASK](#update_target_mappings_wait_for_task)
- [CHECK_TARGET_MAPPINGS](#check_target_mappings)
- [Next action](#next-action-20)
- [New control state](#new-control-state-20)
- [MARK_VERSION_INDEX_READY_CONFLICT](#mark_version_index_ready_conflict)
- [UPDATE_TARGET_MAPPINGS](#update_target_mappings)
- [Next action](#next-action-21)
- [New control state](#new-control-state-21)
- [UPDATE_TARGET_MAPPINGS_WAIT_FOR_TASK](#update_target_mappings_wait_for_task)
- [Next action](#next-action-22)
- [New control state](#new-control-state-22)
- [CHECK_VERSION_INDEX_READY_ACTIONS](#check_version_index_ready_actions)
- [Next action](#next-action-23)
- [New control state](#new-control-state-23)
- [MARK_VERSION_INDEX_READY](#mark_version_index_ready)
- [Next action](#next-action-24)
- [New control state](#new-control-state-24)
- [MARK_VERSION_INDEX_READY_CONFLICT](#mark_version_index_ready_conflict)
- [Next action](#next-action-25)
- [New control state](#new-control-state-25)
- [Manual QA Test Plan](#manual-qa-test-plan)
- [1. Legacy pre-migration](#1-legacy-pre-migration)
- [2. Plugins enabled/disabled](#2-plugins-enableddisabled)
Expand Down Expand Up @@ -98,7 +110,7 @@ The design goals for the algorithm was to keep downtime below 10 minutes for
and explicit as possible.

The algorithm is implemented as a state-action machine based on https://www.microsoft.com/en-us/research/uploads/prod/2016/12/Computation-and-State-Machines.pdf

The state-action machine defines its behaviour in steps. Each step is a
transition from a control state s_i to the control state s_i+1 caused by an
action a_i.
Expand Down Expand Up @@ -152,10 +164,10 @@ index.
1. The Elasticsearch shard allocation cluster setting `cluster.routing.allocation.enable` needs to be unset or set to 'all'. When set to 'primaries', 'new_primaries' or 'none', the migration will timeout when waiting for index green status before bulk indexing because the replica cannot be allocated.

As per the Elasticsearch docs https://www.elastic.co/guide/en/elasticsearch/reference/8.2/restart-cluster.html#restart-cluster-rolling when Cloud performs a rolling restart such as during an upgrade, it will temporarily disable shard allocation. Kibana therefore keeps retrying the INIT step to wait for shard allocation to be enabled again.

The check only considers persistent and transient settings and does not take static configuration in `elasticsearch.yml` into account since there are no known use cases for doing so. If `cluster.routing.allocation.enable` is configured in `elasticsearch.yml` and not set to the default of 'all', the migration will time out. Static settings can only be returned from the `nodes/info` API.
`INIT`

2. If `.kibana` is pointing to an index that belongs to a later version of
Kibana .e.g. a 7.11.0 instance found the `.kibana` alias pointing to
`.kibana_7.12.0_001` fail the migration
Expand Down Expand Up @@ -271,8 +283,8 @@ new `.kibana` alias that points to `.kibana_pre6.5.0_001`.
1. If the ui node finished the migration
`DONE`
2. Otherwise wait 2s and check again
→ WAIT_FOR_MIGRATION_COMPLETION
`WAIT_FOR_MIGRATION_COMPLETION`

## WAIT_FOR_YELLOW_SOURCE
### Next action
`waitForIndexStatus` (status='yellow')
Expand All @@ -284,7 +296,7 @@ Wait for the source index to become yellow. This means the index's primary has b
`SET_SOURCE_WRITE_BLOCK`
2. If the action fails with a `index_not_yellow_timeout`
`WAIT_FOR_YELLOW_SOURCE`

## SET_SOURCE_WRITE_BLOCK
### Next action
`setWriteBlock`
Expand All @@ -298,7 +310,7 @@ Set a write block on the source index to prevent any older Kibana instances from
### Next action
`createIndex`

This operation is idempotent, if the index already exist, we wait until its status turns green.
This operation is idempotent: if the index already exists, we wait until its status turns green.

- Because we will be transforming documents before writing them into this index, we can already set the mappings to the target mappings for this version. The source index might contain documents belonging to a disabled plugin. So set `dynamic: false` mappings for any unknown saved object types.
- (Since we never query the temporary index we can potentially disable refresh to speed up indexing performance. Profile to see if gains justify complexity)
Expand Down Expand Up @@ -334,7 +346,7 @@ Read the next batch of outdated documents from the source index by using search
`transformRawDocs`

Transform the current batch of documents

In order to support sharing saved objects to multiple spaces in 8.0, the
transforms will also regenerate document `_id`'s. To ensure that this step
remains idempotent, the new `_id` is deterministically generated using UUIDv5
Expand All @@ -361,7 +373,7 @@ completed this step:
`REINDEX_SOURCE_TO_TEMP_READ`
2. If there are more batches left in `transformedDocBatches`
`REINDEX_SOURCE_TO_TEMP_INDEX_BULK`

## REINDEX_SOURCE_TO_TEMP_CLOSE_PIT
### Next action
`closePIT`
Expand All @@ -376,7 +388,7 @@ completed this step:
Set a write block on the temporary index so that we can clone it.
### New control state
`CLONE_TEMP_TO_TARGET`

## CLONE_TEMP_TO_TARGET
### Next action
`cloneIndex`
Expand Down Expand Up @@ -419,6 +431,21 @@ Once transformed we use an index operation to overwrite the outdated document wi
### New control state
`OUTDATED_DOCUMENTS_SEARCH`

## CHECK_TARGET_MAPPINGS

### Next action

`checkTargetMappings`

Compare the calculated mappings' hashes against those stored in the `<index>.mappings._meta`.

### New control state

1. If calculated mappings don't match, we must update them.
`UPDATE_TARGET_MAPPINGS`
2. If calculated mappings and stored mappings match, we can skip directly to the next step.
`CHECK_VERSION_INDEX_READY_ACTIONS`

## UPDATE_TARGET_MAPPINGS
### Next action
`updateAndPickupMappings`
Expand All @@ -436,6 +463,21 @@ update the mappings and then use an update_by_query to ensure that all fields ar
### New control state
`MARK_VERSION_INDEX_READY`

## CHECK_VERSION_INDEX_READY_ACTIONS

Check if the state contains some `versionIndexReadyActions` from the `INIT` action.

### Next action

None

### New control state

1. If there are some `versionIndexReadyActions`, we performed a full migration and need to point the aliases to our newly migrated index.
`MARK_VERSION_INDEX_READY`
2. If there are no `versionIndexReadyActions`, another instance already completed this migration and we only transformed outdated documents and updated the mappings in case a new plugin was enabled.
`DONE`

## MARK_VERSION_INDEX_READY
### Next action
`updateAliases`
Expand Down Expand Up @@ -552,4 +594,3 @@ have data loss when there's a user error.
other half.
5. Ensure that the document from step (2) has been migrated
(`migrationVersion` contains 7.11.0)

Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/

import * as Either from 'fp-ts/lib/Either';
import type { IndexMapping } from '@kbn/core-saved-objects-base-server-internal';
import { checkTargetMappings } from './check_target_mappings';
import { diffMappings } from '../core/build_active_mappings';

// Replace the real diffMappings implementation with a jest auto-mock so the
// tests below can control whether a mappings difference is reported.
jest.mock('../core/build_active_mappings');

// Typed handle to the mocked diffMappings function.
const diffMappingsMock = diffMappings as jest.MockedFn<typeof diffMappings>;

// Fixture: mappings currently stored in the index, with a single integer field.
const sourceIndexMappings: IndexMapping = {
properties: {
field: { type: 'integer' },
},
};

// Fixture: calculated target mappings where the same field has a different type.
const targetIndexMappings: IndexMapping = {
properties: {
field: { type: 'long' },
},
};

// Unit tests for the checkTargetMappings task.
// diffMappings is mocked (see jest.mock above), so these tests only verify
// how checkTargetMappings delegates to it and maps its result.
describe('checkTargetMappings', () => {
  beforeEach(() => {
    jest.clearAllMocks();
  });

  it('returns match=false if source mappings are not defined', async () => {
    // No sourceIndexMappings provided at all: comparison is impossible.
    const outcome = await checkTargetMappings({ targetIndexMappings })();

    expect(diffMappings).not.toHaveBeenCalled();
    expect(outcome).toEqual(Either.right({ match: false }));
  });

  it('calls diffMappings() with the source and target mappings', async () => {
    await checkTargetMappings({ sourceIndexMappings, targetIndexMappings })();

    expect(diffMappings).toHaveBeenCalledTimes(1);
    expect(diffMappings).toHaveBeenCalledWith(sourceIndexMappings, targetIndexMappings);
  });

  it('returns match=true if diffMappings() match', async () => {
    // diffMappings resolves to undefined when no differences are found.
    diffMappingsMock.mockReturnValueOnce(undefined);

    const outcome = await checkTargetMappings({ sourceIndexMappings, targetIndexMappings })();

    expect(outcome).toEqual(Either.right({ match: true }));
  });

  it('returns match=false if diffMappings() finds differences', async () => {
    diffMappingsMock.mockReturnValueOnce({ changedProp: 'field' });

    const outcome = await checkTargetMappings({ sourceIndexMappings, targetIndexMappings })();

    expect(outcome).toEqual(Either.right({ match: false }));
  });
});
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/
import * as Either from 'fp-ts/lib/Either';
import * as TaskEither from 'fp-ts/lib/TaskEither';

import { IndexMapping } from '@kbn/core-saved-objects-base-server-internal';
import { diffMappings } from '../core/build_active_mappings';

/**
 * Parameters for the {@link checkTargetMappings} task.
 * @internal
 */
export interface CheckTargetMappingsParams {
// Mappings currently stored in the index; optional — presumably absent when the
// index does not exist yet (TODO confirm against callers).
sourceIndexMappings?: IndexMapping;
// Mappings calculated for the current state to compare against.
targetIndexMappings: IndexMapping;
}

/**
 * Result of comparing the source index mappings against the target mappings.
 * @internal
 */
export interface TargetMappingsCompareResult {
// True when diffMappings() found no differences between source and target.
match: boolean;
}

/**
 * Task that compares the mappings stored in the source index against the
 * calculated target mappings.
 *
 * Never fails: always resolves to a right() with `{ match: true }` when the
 * two mappings have no differences, and `{ match: false }` otherwise —
 * including when no source mappings are available to compare at all.
 */
export const checkTargetMappings =
  ({
    sourceIndexMappings,
    targetIndexMappings,
  }: CheckTargetMappingsParams): TaskEither.TaskEither<never, TargetMappingsCompareResult> =>
  async () => {
    // Without stored source mappings there is nothing to compare against.
    if (!sourceIndexMappings) {
      return Either.right({ match: false });
    }
    // diffMappings() returns undefined when the mappings are identical.
    return Either.right({ match: !diffMappings(sourceIndexMappings, targetIndexMappings) });
  };
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,9 @@
* Side Public License, v 1.
*/

import { RetryableEsClientError } from './catch_retryable_es_client_errors';
import { DocumentsTransformFailed } from '../core/migrate_raw_docs';
import { type Either, right } from 'fp-ts/lib/Either';
import type { RetryableEsClientError } from './catch_retryable_es_client_errors';
import type { DocumentsTransformFailed } from '../core/migrate_raw_docs';

export {
BATCH_SIZE,
Expand Down Expand Up @@ -78,6 +79,12 @@ export { updateAliases } from './update_aliases';
export type { CreateIndexParams } from './create_index';
export { createIndex } from './create_index';

export { checkTargetMappings } from './check_target_mappings';

export { updateTargetMappingsMeta } from './update_target_mappings_meta';

export const noop = async (): Promise<Either<never, 'noop'>> => right('noop' as const);

export type {
UpdateAndPickupMappingsResponse,
UpdateAndPickupMappingsParams,
Expand Down
Loading

0 comments on commit 42bf33f

Please sign in to comment.