Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix stale management cluster resources #8224

Merged
merged 3 commits into from
Feb 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions shell/detail/provisioning.cattle.io.cluster.vue
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ import Socket, {
import { get } from '@shell/utils/object';
import CapiMachineDeployment from '@shell/models/cluster.x-k8s.io.machinedeployment';
import { isAlternate } from '@shell/utils/platform';
import { defaultTableSortGenerationFn } from '@shell/components/ResourceTable.vue';

let lastId = 1;
const ansiup = new AnsiUp();
Expand Down Expand Up @@ -638,6 +639,26 @@ export default {
return day(time).format(this.dateTimeFormatStr);
}
},

machineSortGenerationFn() {
// The sort generation function creates a unique value and is used to create a key including sort details.
// The unique key determines if the list is redrawn or a cached version is shown.
// Because we ensure the 'not in a pool' group is there via a row, and timing issues, the unqiue key doesn't change
// after a machine is added/removed... so the list won't update... so we need to inject a string to ensure the key is fresh
const base = defaultTableSortGenerationFn(this.machineSchema, this.$store);

return base + (!!this.fakeMachines.length ? '-fake' : '');
},

nodeSortGenerationFn() {
// The sort generation function creates a unique value and is used to create a key including sort details.
// The unique key determines if the list is redrawn or a cached version is shown.
// Because we ensure the 'not in a pool' group is there via a row, and timing issues, the unqiue key doesn't change
// after a machine is added/removed... so the list won't update... so we need to inject a string to ensure the key is fresh
const base = defaultTableSortGenerationFn(this.mgmtNodeSchema, this.$store);

return base + (!!this.fakeNodes.length ? '-fake' : '');
},
}
};
</script>
Expand Down Expand Up @@ -681,6 +702,7 @@ export default {
:group-by="value.isCustom ? null : 'poolId'"
group-ref="pool"
:group-sort="['pool.nameDisplay']"
:sort-generation-fn="machineSortGenerationFn"
>
<template #main-row:isFake="{fullColspan}">
<tr class="main-row">
Expand Down Expand Up @@ -765,6 +787,7 @@ export default {
:group-by="value.isCustom ? null : 'spec.nodePoolName'"
group-ref="pool"
:group-sort="['pool.nameDisplay']"
:sort-generation-fn="nodeSortGenerationFn"
>
<template #main-row:isFake="{fullColspan}">
<tr class="main-row">
Expand Down
4 changes: 2 additions & 2 deletions shell/models/provisioning.cattle.io.cluster.js
Original file line number Diff line number Diff line change
Expand Up @@ -457,9 +457,9 @@ export default class ProvCluster extends SteveModel {
return names.join('<br>');
} else {
const names = this.machines.filter((machine) => {
return machine.status.conditions.find(c => c.error && c.type === 'NodeHealthy');
return machine.status?.conditions?.find(c => c.error && c.type === 'NodeHealthy');
}).map((machine) => {
if (machine.status.nodeRef?.name) {
if (machine.status?.nodeRef?.name) {
return this.t('cluster.availabilityWarnings.node', { name: machine.status.nodeRef.name });
}

Expand Down
6 changes: 4 additions & 2 deletions shell/plugins/dashboard-store/actions.js
Original file line number Diff line number Diff line change
Expand Up @@ -314,7 +314,8 @@ export default {
dispatch('watch', {
type,
revision: out.revision,
namespace: opt.watchNamespace
namespace: opt.watchNamespace,
force: opt.forceWatch === true,
});
}

Expand Down Expand Up @@ -377,7 +378,8 @@ export default {
dispatch('watch', {
type,
selector,
revision: res.revision
revision: res.revision,
force: opt.forceWatch === true,
});
}

Expand Down
73 changes: 29 additions & 44 deletions shell/plugins/steve/subscribe.js
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import Socket, {
EVENT_DISCONNECT_ERROR,
NO_WATCH,
NO_SCHEMA,
REVISION_TOO_OLD
} from '@shell/utils/socket';
import { normalizeType } from '@shell/plugins/dashboard-store/normalize';
import day from 'dayjs';
Expand Down Expand Up @@ -306,15 +307,18 @@ const sharedActions = {

// If socket is in error don't try to watch.... unless we `force` it
if ( !stop && !force && !getters.canWatch(params) ) {
console.error(`Cannot Watch [${ getters.storeName }]`, JSON.stringify(params)); // eslint-disable-line no-console
console.error(`Aborting Watch Request [${ getters.storeName }] (socket probably in error)`, JSON.stringify(params)); // eslint-disable-line no-console

return;
}

if ( !stop && getters.watchStarted({
type, id, selector, namespace
}) ) {
state.debugSocket && console.debug(`Already Watching [${ getters.storeName }]`, JSON.stringify(params)); // eslint-disable-line no-console
// eslint-disable-next-line no-console
state.debugSocket && console.debug(`Already Watching [${ getters.storeName }]`, {
type, id, selector, namespace
});

return;
}
Expand Down Expand Up @@ -693,19 +697,26 @@ const defaultActions = {
} else if ( err.includes('failed to find schema') ) {
commit('setInError', { type: msg.resourceType, reason: NO_SCHEMA });
} else if ( err.includes('too old') ) {
// Set an error for (all) subs of this type. This..
// 1) blocks attempts by resource.stop to resub (as type is in error)
// 2) will be cleared when resyncWatch --> watch (with force) --> resource.start completes
commit('setInError', { type: msg.resourceType, reason: REVISION_TOO_OLD });
dispatch('resyncWatch', msg);
}
},

/**
* Steve only event
*
* Steve only seems to send out `resource.stop` messages for two reasons
* - We have requested that the resource watch should be stopped and we receive this event as confirmation
* - Steve tells us that the resource is no longer watched
*
* Steve has stopped watching this resource. This happens for a couple of reasons
* - We have requested that the resource watch should be stopped (and we receive this event as confirmation)
* - Steve tells us that the resource watch has been stopped. Possible reasons
* - The rancher <--> k8s socket closed (happens every ~30 mins on mgmt socket)
* - Permissions has changed for the subscribed resource, so rancher closes socket
*/
'ws.resource.stop'({ getters, commit, dispatch }, msg) {
'ws.resource.stop'({
state, getters, commit, dispatch
}, msg) {
const type = msg.resourceType;
const obj = {
type,
Expand All @@ -714,51 +725,23 @@ const defaultActions = {
selector: msg.selector
};

// console.warn(`Resource stop: [${ getters.storeName }]`, msg); // eslint-disable-line no-console
state.debugSocket && console.info(`Resource Stop [${ getters.storeName }]`, type, msg); // eslint-disable-line no-console

if (!type) {
console.error(`Resource Stop [${ getters.storeName }]. Received resource.stop with an empty resourceType, aborting`, msg); // eslint-disable-line no-console

return;
}

// If we're trying to watch this event, attempt to re-watch
if ( getters['schemaFor'](type) && getters['watchStarted'](obj) ) {
commit('setWatchStopped', obj);

// In summary, we need to re-watch but with a reliable `revision` (to avoid `too old` message kicking off a full re-fetch of all
// resources). To get a reliable `revision` go out and fetch the latest for that resource type, in theory our local cache should be
// up to date with that revision.

const revisionExisting = getters.nextResourceVersion(type, obj.id);

let revisionLatest;

if (revisionExisting) {
// Attempt to fetch the latest revision at the time the resource watch was stopped, in theory our local cache should be up to
// date with this
// Ideally we shouldn't need to fetch here and supply `0`, `-1` or `null` to start watching from the latest revision, however steve
// will send the current state of each resource via a `resource.created` event.
const opt = { limit: 1 };

opt.url = getters.urlFor(type, null, opt);
revisionLatest = dispatch('request', { opt, type } )
.then(res => res.revision)
.catch((err) => {
// For some reason we can't fetch a reasonable revision, so force a re-fetch
console.warn(`Resource error retrieving resourceVersion, forcing re-fetch`, type, ':', err); // eslint-disable-line no-console
dispatch('resyncWatch', msg);
throw err;
});
} else {
// Some v1 resource types don't have revisions (either at the collection or resource level), so we avoided making an API request
// for them
revisionLatest = Promise.resolve(null); // Null to ensure we don't go through `nextResourceVersion` again
}

setTimeout(() => {
// Delay a bit so that immediate start/error/stop causes
// only a slow infinite loop instead of a tight one.
revisionLatest.then(revision => dispatch('watch', { ...obj, revision }));
}, 5000);
dispatch('watch', obj);
}
},

'ws.resource.create'(ctx, msg) {
ctx.state.debugSocket && console.info(`Resource Create [${ ctx.getters.storeName }]`, msg.resourceType, msg); // eslint-disable-line no-console
queueChange(ctx, msg, true, 'Create');
},

Expand Down Expand Up @@ -809,6 +792,8 @@ const defaultActions = {
const data = msg.data;
const type = data.type;

ctx.state.debugSocket && console.info(`Resource Remove [${ ctx.getters.storeName }]`, type, msg); // eslint-disable-line no-console

if (type === SCHEMA) {
const worker = (this.$workers || {})[ctx.getters.storeName];

Expand Down
1 change: 1 addition & 0 deletions shell/utils/socket.js
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ export const EVENT_DISCONNECT_ERROR = 'disconnect_error';

export const NO_WATCH = 'NO_WATCH';
export const NO_SCHEMA = 'NO_SCHEMA';
export const REVISION_TOO_OLD = 'TOO_OLD';

export default class Socket extends EventTarget {
url;
Expand Down