Skip to content

Commit

Permalink
valkey-cli make source node ignores NOREPLICAS when doing the last CL…
Browse files Browse the repository at this point in the history
…USTER SETSLOT (valkey-io#928)

This fixes valkey-io#899. In that issue, the primary is cluster-allow-replica-migration no
and its replica is cluster-allow-replica-migration yes.

And during the slot migration:
1. Primary calling blockClientForReplicaAck, waiting its replica.
2. Its replica reconfiguring itself as a replica of other shards due to
replica migration and disconnect from the old primary.
3. The old primary never got the chance to receive the ack, so it got a
timeout and got a NOREPLICAS error.

In this case, the replicas might automatically migrate to another primary,
resulting in the client being unblocked with the NOREPLICAS error. In this
case, since the configuration will eventually propagate itself, we can safely
ignore this error on the source node.

Signed-off-by: Binbin <binloveplay1314@qq.com>
Signed-off-by: Ping Xie <pingxie@google.com>
  • Loading branch information
PingXie committed Sep 14, 2024
1 parent 44bb9f1 commit fb30516
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 353 deletions.
217 changes: 23 additions & 194 deletions tests/unit/cluster/replica-migration.tcl
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,8 @@ proc my_slot_allocation {masters replicas} {
R [expr $masters-1] cluster addslots 0
}

proc get_my_primary_peer {srv_idx} {
set role_response [R $srv_idx role]
set primary_ip [lindex $role_response 1]
set primary_port [lindex $role_response 2]
set primary_peer "$primary_ip:$primary_port"
return $primary_peer
}

proc test_migrated_replica {type} {
test "Migrated replica reports zero repl offset and rank, and fails to win election - $type" {
start_cluster 4 4 {tags {external:skip cluster} overrides {cluster-node-timeout 1000 cluster-migration-barrier 999}} {
test "Migrated replica reports zero repl offset and rank, and fails to win election" {
# Write some data to primary 0, slot 1, make a small repl_offset.
for {set i 0} {$i < 1024} {incr i} {
R 0 incr key_991803
Expand Down Expand Up @@ -53,14 +45,8 @@ proc test_migrated_replica {type} {
R 3 config set cluster-allow-replica-migration yes
R 7 config set cluster-allow-replica-migration yes

if {$type == "shutdown"} {
# Shutdown primary 0.
catch {R 0 shutdown nosave}
} elseif {$type == "sigstop"} {
# Pause primary 0.
set primary0_pid [s 0 process_id]
pause_process $primary0_pid
}
# Shutdown primary 0.
catch {R 0 shutdown nosave}

# Wait for the replica to become a primary, and make sure
# the other primary become a replica.
Expand Down Expand Up @@ -107,30 +93,11 @@ proc test_migrated_replica {type} {
puts "R 7: [R 7 keys *]"
fail "Key not consistent"
}

if {$type == "sigstop"} {
resume_process $primary0_pid

# Wait for the old primary to go online and become a replica.
wait_for_condition 1000 50 {
[s 0 role] eq {slave}
} else {
fail "The old primary was not converted into replica"
}
}
}
} ;# proc

start_cluster 4 4 {tags {external:skip cluster} overrides {cluster-node-timeout 1000 cluster-migration-barrier 999}} {
test_migrated_replica "shutdown"
} my_slot_allocation cluster_allocate_replicas ;# start_cluster

start_cluster 4 4 {tags {external:skip cluster} overrides {cluster-node-timeout 1000 cluster-migration-barrier 999}} {
test_migrated_replica "sigstop"
} my_slot_allocation cluster_allocate_replicas ;# start_cluster

proc test_nonempty_replica {type} {
test "New non-empty replica reports zero repl offset and rank, and fails to win election - $type" {
test "New non-empty replica reports zero repl offset and rank, and fails to win election" {
# Write some data to primary 0, slot 1, make a small repl_offset.
for {set i 0} {$i < 1024} {incr i} {
R 0 incr key_991803
Expand All @@ -151,14 +118,8 @@ proc test_nonempty_replica {type} {
R 7 config set cluster-allow-replica-migration yes
R 7 cluster replicate [R 0 cluster myid]

if {$type == "shutdown"} {
# Shutdown primary 0.
catch {R 0 shutdown nosave}
} elseif {$type == "sigstop"} {
# Pause primary 0.
set primary0_pid [s 0 process_id]
pause_process $primary0_pid
}
# Shutdown primary 0.
catch {R 0 shutdown nosave}

# Wait for the replica to become a primary.
wait_for_condition 1000 50 {
Expand Down Expand Up @@ -194,151 +155,18 @@ proc test_nonempty_replica {type} {
puts "R 7: [R 7 get key_991803]"
fail "Key not consistent"
}

if {$type == "sigstop"} {
resume_process $primary0_pid

# Wait for the old primary to go online and become a replica.
wait_for_condition 1000 50 {
[s 0 role] eq {slave}
} else {
fail "The old primary was not converted into replica"
}
}
}
} ;# proc

start_cluster 4 4 {tags {external:skip cluster} overrides {cluster-node-timeout 1000 cluster-migration-barrier 999}} {
test_nonempty_replica "shutdown"
} my_slot_allocation cluster_allocate_replicas ;# start_cluster

start_cluster 4 4 {tags {external:skip cluster} overrides {cluster-node-timeout 1000 cluster-migration-barrier 999}} {
test_nonempty_replica "sigstop"
} my_slot_allocation cluster_allocate_replicas ;# start_cluster

proc test_sub_replica {type} {
test "Sub-replica reports zero repl offset and rank, and fails to win election - $type" {
# Write some data to primary 0, slot 1, make a small repl_offset.
for {set i 0} {$i < 1024} {incr i} {
R 0 incr key_991803
}
assert_equal {1024} [R 0 get key_991803]

# Write some data to primary 3, slot 0, make a big repl_offset.
for {set i 0} {$i < 10240} {incr i} {
R 3 incr key_977613
}
assert_equal {10240} [R 3 get key_977613]

R 3 config set cluster-replica-validity-factor 0
R 7 config set cluster-replica-validity-factor 0
R 3 config set cluster-allow-replica-migration yes
R 7 config set cluster-allow-replica-migration no

# 10s, make sure primary 0 will hang in the save.
R 0 config set rdb-key-save-delay 100000000

# Move slot 0 from primary 3 to primary 0.
set addr "[srv 0 host]:[srv 0 port]"
set myid [R 3 CLUSTER MYID]
set code [catch {
exec src/valkey-cli {*}[valkeycli_tls_config "./tests"] --cluster rebalance $addr --cluster-weight $myid=0
} result]
if {$code != 0} {
fail "valkey-cli --cluster rebalance returns non-zero exit code, output below:\n$result"
}

# Make sure server 3 and server 7 becomes a replica of primary 0.
wait_for_condition 1000 50 {
[get_my_primary_peer 3] eq $addr &&
[get_my_primary_peer 7] eq $addr
} else {
puts "R 3 role: [R 3 role]"
puts "R 7 role: [R 7 role]"
fail "Server 3 and 7 role response has not changed"
}

# Make sure server 7 got a sub-replica log.
verify_log_message -7 "*I'm a sub-replica!*" 0

if {$type == "shutdown"} {
# Shutdown primary 0.
catch {R 0 shutdown nosave}
} elseif {$type == "sigstop"} {
# Pause primary 0.
set primary0_pid [s 0 process_id]
pause_process $primary0_pid
}

# Wait for the replica to become a primary, and make sure
# the other primary become a replica.
wait_for_condition 1000 50 {
[s -4 role] eq {master} &&
[s -3 role] eq {slave} &&
[s -7 role] eq {slave}
} else {
puts "s -4 role: [s -4 role]"
puts "s -3 role: [s -3 role]"
puts "s -7 role: [s -7 role]"
fail "Failover does not happened"
}

# Make sure the offset of server 3 / 7 is 0.
verify_log_message -3 "*Start of election*offset 0*" 0
verify_log_message -7 "*Start of election*offset 0*" 0

# Wait for the cluster to be ok.
wait_for_condition 1000 50 {
[CI 3 cluster_state] eq "ok" &&
[CI 4 cluster_state] eq "ok" &&
[CI 7 cluster_state] eq "ok"
} else {
puts "R 3: [R 3 cluster info]"
puts "R 4: [R 4 cluster info]"
puts "R 7: [R 7 cluster info]"
fail "Cluster is down"
}

# Make sure the key exists and is consistent.
R 3 readonly
R 7 readonly
wait_for_condition 1000 50 {
[R 3 get key_991803] == 1024 && [R 3 get key_977613] == 10240 &&
[R 4 get key_991803] == 1024 && [R 4 get key_977613] == 10240 &&
[R 7 get key_991803] == 1024 && [R 7 get key_977613] == 10240
} else {
puts "R 3: [R 3 keys *]"
puts "R 4: [R 4 keys *]"
puts "R 7: [R 7 keys *]"
fail "Key not consistent"
}

if {$type == "sigstop"} {
resume_process $primary0_pid

# Wait for the old primary to go online and become a replica.
wait_for_condition 1000 50 {
[s 0 role] eq {slave}
} else {
fail "The old primary was not converted into replica"
}
}
}
}

start_cluster 4 4 {tags {external:skip cluster} overrides {cluster-node-timeout 1000 cluster-migration-barrier 999}} {
test_sub_replica "shutdown"
} my_slot_allocation cluster_allocate_replicas ;# start_cluster

start_cluster 4 4 {tags {external:skip cluster} overrides {cluster-node-timeout 1000 cluster-migration-barrier 999}} {
test_sub_replica "sigstop"
} my_slot_allocation cluster_allocate_replicas ;# start_cluster

start_cluster 4 4 {tags {external:skip cluster} overrides {cluster-node-timeout 1000 cluster-migration-barrier 999}} {
test "valkey-cli make source node ignores NOREPLICAS error when doing the last CLUSTER SETSLOT" {
R 3 config set cluster-allow-replica-migration no
R 7 config set cluster-allow-replica-migration yes

# Record the current primary node, server 7 will be migrated later.
set old_primary_ip [lindex [R 7 role] 1]
set old_primary_port [lindex [R 7 role] 2]

# Move slot 0 from primary 3 to primary 0.
set addr "[srv 0 host]:[srv 0 port]"
set myid [R 3 CLUSTER MYID]
Expand All @@ -349,16 +177,17 @@ start_cluster 4 4 {tags {external:skip cluster} overrides {cluster-node-timeout
fail "valkey-cli --cluster rebalance returns non-zero exit code, output below:\n$result"
}

# Make sure server 3 lost its replica (server 7) and server 7 becomes a replica of primary 0.
wait_for_condition 1000 50 {
[s -3 role] eq {master} &&
[s -3 connected_slaves] eq 0 &&
[s -7 role] eq {slave} &&
[get_my_primary_peer 7] eq $addr
} else {
puts "R 3 role: [R 3 role]"
puts "R 7 role: [R 7 role]"
fail "Server 3 and 7 role response has not changed"
}
wait_for_cluster_propagation
wait_for_cluster_state "ok"

# Make sure server 3 is still a primary and has no replicas.
assert_equal [s -3 role] {master}
assert_equal [lindex [R 3 role] 2] {}

# And server 7 becomes a replica of another primary.
set new_primary_ip [lindex [R 7 role] 1]
set new_primary_port [lindex [R 7 role] 2]
assert_equal [s -7 role] {slave}
assert_not_equal "$old_primary_ip:$old_primary_port" "new_primary_ip:new_primary_port"
}
} my_slot_allocation cluster_allocate_replicas ;# start_cluster
Loading

0 comments on commit fb30516

Please sign in to comment.