Skip to content

Commit

Permalink
[ML] Extra logging for debugging rolling upgrade test failure elastic…
Browse files Browse the repository at this point in the history
…#100800

For investigating elastic#100371
  • Loading branch information
davidkyle authored Oct 13, 2023
1 parent cb30096 commit 2ce5392
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,8 @@ public void clusterChanged(ClusterChangedEvent event) {
}

if (eventStateMinTransportVersionIsBeforeDistributedModelAllocationTransportVersion(event)) {
logger.trace("min transport version is before assignment change on " + event.state().nodes().getAllNodes().size() + " nodes");

// we should not try to rebalance assignments while there may be nodes running on a version
// prior to introducing distributed model allocation.
// But we should remove routing to removed or shutting down nodes.
Expand Down Expand Up @@ -238,6 +240,7 @@ public void onFailure(Exception e) {
}

private void removeRoutingToRemovedOrShuttingDownNodes(ClusterChangedEvent event) {
logger.trace("remove routing to removed or shutting down nodes ");
if (areAssignedNodesRemoved(event)) {
submitUnbatchedTask("removing routing entries for removed or shutting down nodes", new ClusterStateUpdateTask() {
@Override
Expand Down Expand Up @@ -282,6 +285,7 @@ static boolean areAssignedNodesRemoved(ClusterChangedEvent event) {

// Visible for testing
static ClusterState removeRoutingToUnassignableNodes(ClusterState currentState) {
logger.trace("remove routing to unassignable nodes");
Set<String> assignableNodes = getAssignableNodes(currentState).stream().map(DiscoveryNode::getId).collect(Collectors.toSet());
TrainedModelAssignmentMetadata metadata = TrainedModelAssignmentMetadata.fromState(currentState);
TrainedModelAssignmentMetadata.Builder builder = TrainedModelAssignmentMetadata.builder(currentState);
Expand Down Expand Up @@ -431,6 +435,7 @@ public void createNewModelAssignment(
}

public void setModelAssignmentToStopping(String modelId, ActionListener<AcknowledgedResponse> listener) {
logger.trace("set to stopping");
submitUnbatchedTask("set model assignment stopping", new ClusterStateUpdateTask() {
@Override
public ClusterState execute(ClusterState currentState) {
Expand All @@ -450,6 +455,7 @@ public void clusterStateProcessed(ClusterState oldState, ClusterState newState)
}

public void removeModelAssignment(String deploymentId, ActionListener<AcknowledgedResponse> listener) {
logger.trace("remove model assignments");
submitUnbatchedTask("delete model deployment assignment", new ClusterStateUpdateTask() {
@Override
public ClusterState execute(ClusterState currentState) {
Expand Down Expand Up @@ -486,6 +492,7 @@ public void clusterStateProcessed(ClusterState oldState, ClusterState newState)

// Used by the reset action directly
public void removeAllModelAssignments(ActionListener<AcknowledgedResponse> listener) {
logger.trace("remove all assignments");
submitUnbatchedTask("delete all model assignments", new ClusterStateUpdateTask() {
@Override
public ClusterState execute(ClusterState currentState) {
Expand Down Expand Up @@ -518,9 +525,11 @@ private static ClusterState forceUpdate(ClusterState currentState, TrainedModelA
logger.debug(() -> format("updated assignments: %s", modelAssignments.build()));
Metadata.Builder metadata = Metadata.builder(currentState.metadata());
if (currentState.getMinTransportVersion().onOrAfter(RENAME_ALLOCATION_TO_ASSIGNMENT_TRANSPORT_VERSION)) {
logger.trace("putting custom new name");
metadata.putCustom(TrainedModelAssignmentMetadata.NAME, modelAssignments.build())
.removeCustom(TrainedModelAssignmentMetadata.DEPRECATED_NAME);
} else {
logger.trace("putting custom old name");
metadata.putCustom(TrainedModelAssignmentMetadata.DEPRECATED_NAME, modelAssignments.buildOld());
}
return ClusterState.builder(currentState).metadata(metadata).build();
Expand Down Expand Up @@ -616,6 +625,7 @@ ClusterState stopPlatformSpecificModelsInHeterogeneousClusters(
modelToAdd.get().getModelId(),
mlNodesArchitectures
);
logger.info(reasonToStop);
updatedState = callSetToStopping(reasonToStop, modelToAdd.get().getDeploymentId(), clusterState);
}
return updatedState;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ public void setUpLogging() throws IOException {
{
"persistent": {
"logger.org.elasticsearch.xpack.ml.inference": "TRACE",
"logger.org.elasticsearch.xpack.ml.inference.assignments": "DEBUG",
"logger.org.elasticsearch.xpack.ml.inference.assignments": "TRACE",
"logger.org.elasticsearch.xpack.ml.process": "DEBUG",
"logger.org.elasticsearch.xpack.ml.action": "TRACE"
}
Expand All @@ -97,6 +97,7 @@ public void removeLogging() throws IOException {
client().performRequest(request);
}

@AwaitsFix(bugUrl = "mute to try and reproduce https://github.com/elastic/elasticsearch/issues/100379")
public void testTrainedModelDeployment() throws Exception {
assumeTrue("NLP model deployments added in 8.0", UPGRADE_FROM_VERSION.onOrAfter(Version.V_8_0_0));

Expand Down

0 comments on commit 2ce5392

Please sign in to comment.