Don't allow downscales if some shards are unassigned (#3883) (#3887)
* Don't allow downscales if some shards are unassigned

In some conditions, for example when a Pod gets killed/restarted right
before a downscale happens, the shards on that Pod are reported as UNASSIGNED.
At this point it is dangerous to definitively remove the Pod (downscale the cluster),
since we can't know for sure that the Pod to remove isn't supposed to hold any of the
unassigned shards.

To avoid that situation, this commit disallows any downscale while some
shards don't have a node assigned to them (regardless of their reported state:
what matters is the missing node assignment). The logic to allow a node to be
downscaled is rather simple:
- all shards must have a node assigned to them
- the Pod to remove must not have any shard assigned to it

This is a rather conservative/safe approach that could be optimized in the future.
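
To make the rule concrete, here is a minimal, self-contained Go sketch of the check. It is an illustration only: the Shard type and the nodeRemovalIsSafe helper are simplified stand-ins for the esclient shard listing and the NodeMayHaveShard function changed below, not the actual implementation.

package main

import "fmt"

// Shard is a simplified stand-in for the shards returned by the ES client.
// NodeName is empty when the shard is not assigned to any node.
type Shard struct {
	Index    string
	Shard    string
	NodeName string
}

// nodeRemovalIsSafe applies the conservative rule: a Pod may only be removed
// if every shard is assigned to a node and none of them lives on that Pod.
func nodeRemovalIsSafe(shards []Shard, podName string) bool {
	for _, shard := range shards {
		if shard.NodeName == "" || shard.NodeName == podName {
			return false
		}
	}
	return true
}

func main() {
	shards := []Shard{
		{Index: "index-1", Shard: "0", NodeName: ""}, // unassigned: could belong anywhere
		{Index: "index-1", Shard: "0", NodeName: "C"},
	}
	// The unassigned shard blocks any downscale, so removing Pod "A" is not safe.
	fmt.Println(nodeRemovalIsSafe(shards, "A")) // false
}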

I tried implementing an e2e test for this, but it's a bit tricky to set up in a way
that consistently fails without this commit. I consider the unit test good enough.

* Improve comments

Co-authored-by: Anya Sabo <anya@sabolicio.us>

sebgl and anyasabo authored Oct 30, 2020
1 parent 2e2a853 commit 696ceb5
Showing 3 changed files with 29 additions and 8 deletions.
pkg/controller/elasticsearch/driver/downscale.go (2 changes: 1 addition & 1 deletion)
@@ -181,7 +181,7 @@ func calculatePerformableDownscale(
 	}
 	// iterate on all leaving nodes (ordered by highest ordinal first)
 	for _, node := range downscale.leavingNodeNames() {
-		migrating, err := migration.NodeHasShard(ctx.parentCtx, ctx.shardLister, node)
+		migrating, err := migration.NodeMayHaveShard(ctx.parentCtx, ctx.es, ctx.shardLister, node)
 		if err != nil {
 			return performableDownscale, err
 		}
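
For orientation, a hedged sketch of how the loop above presumably consumes the migrating flag. The hunk stops right after the error check, so the control flow and the nodesToRemove field below are hypothetical illustrations, not the actual ECK implementation.

// Sketch only: the real calculatePerformableDownscale is only partially
// visible in the hunk above; the field names here are hypothetical.
for _, node := range downscale.leavingNodeNames() {
	migrating, err := migration.NodeMayHaveShard(ctx.parentCtx, ctx.es, ctx.shardLister, node)
	if err != nil {
		return performableDownscale, err
	}
	if migrating {
		// The node may still hold data, or some shards are unassigned:
		// stop the downscale here and retry at a later reconciliation.
		return performableDownscale, nil
	}
	// Otherwise this node is safe to schedule for removal.
	performableDownscale.nodesToRemove = append(performableDownscale.nodesToRemove, node)
}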
pkg/controller/elasticsearch/migration/migrate_data.go (16 changes: 13 additions & 3 deletions)
@@ -15,17 +15,27 @@ import (
 
 var log = logf.Log.WithName("migrate-data")
 
-// NodeHasShard returns true if the given ES Pod is holding at least one shard (primary or replica).
-func NodeHasShard(ctx context.Context, shardLister esclient.ShardLister, podName string) (bool, error) {
+// NodeMayHaveShard returns true if one of those condition is met:
+// - the given ES Pod is holding at least one shard (primary or replica)
+// - some shards in the cluster don't have a node assigned, in which case we can't be sure about the 1st condition
+// this may happen if the node was just restarted: the shards it is holding appear unassigned
+func NodeMayHaveShard(ctx context.Context, es esv1.Elasticsearch, shardLister esclient.ShardLister, podName string) (bool, error) {
 	shards, err := shardLister.GetShards(ctx)
 	if err != nil {
 		return false, err
 	}
-	// filter shards affected by node removal
 	for _, shard := range shards {
+		// shard still on the node
 		if shard.NodeName == podName {
 			return true, nil
 		}
+		// shard node undefined (likely unassigned)
+		if shard.NodeName == "" {
+			log.Info("Found orphan shard, preventing data migration",
+				"namespace", es.Namespace, "es_name", es.Name,
+				"index", shard.Index, "shard", shard.Shard, "shard_state", shard.State)
+			return true, nil
+		}
 	}
 	return false, nil
 }
pkg/controller/elasticsearch/migration/migrate_data_test.go (19 changes: 15 additions & 4 deletions)
@@ -16,7 +16,7 @@ import (
 	"github.com/stretchr/testify/require"
 )
 
-func TestNodeHasShard(t *testing.T) {
+func TestNodeMayHaveShard(t *testing.T) {
 	type args struct {
 		shardLister client.ShardLister
 		podName     string
@@ -61,16 +61,27 @@ func TestNodeHasShard(t *testing.T) {
 			},
 			want: false,
 		},
+		{
+			name: "Some shards have no node assigned",
+			args: args{
+				podName: "A",
+				shardLister: NewFakeShardLister([]client.Shard{
+					{Index: "index-1", Shard: "0", NodeName: ""},
+					{Index: "index-1", Shard: "0", NodeName: "C"},
+				}),
+			},
+			want: true,
+		},
 	}
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
-			got, err := NodeHasShard(context.Background(), tt.args.shardLister, tt.args.podName)
+			got, err := NodeMayHaveShard(context.Background(), esv1.Elasticsearch{}, tt.args.shardLister, tt.args.podName)
 			if (err != nil) != tt.wantErr {
-				t.Errorf("NodeHasShard() error = %v, wantErr %v", err, tt.wantErr)
+				t.Errorf("NodeMayHaveShard() error = %v, wantErr %v", err, tt.wantErr)
 				return
 			}
 			if got != tt.want {
-				t.Errorf("NodeHasShard() = %v, want %v", got, tt.want)
+				t.Errorf("NodeMayHaveShard() = %v, want %v", got, tt.want)
 			}
 		})
 	}
