dm/scheduler: fix inconsistent relay status #3474

Merged — 8 commits merged on Nov 16, 2021
10 changes: 10 additions & 0 deletions dm/dm/master/scheduler/scheduler.go
@@ -1753,6 +1753,16 @@ func (s *Scheduler) handleWorkerOnline(ev ha.WorkerEvent, toLock bool) error {

	// 3. change the stage (from Offline) to Free or Relay.
	lastRelaySource := w.RelaySourceID()
	if lastRelaySource == "" {
		// when the worker was just removed (for example, it lost keepalive while the master's
		// scheduler was booting up), w.RelaySourceID() is naturally empty, so look up the relay
		// source in the scheduler's relayWorkers map instead
		for source, workerM := range s.relayWorkers {
			if _, ok2 := workerM[w.BaseInfo().Name]; ok2 {
				lastRelaySource = source
				break
			}
		}
	}
	w.ToFree()
	// TODO: rename ToFree to Online and move below logic inside it
	if lastRelaySource != "" {
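For context, the fallback above scans s.relayWorkers, which maps a relay source ID to the set of worker names serving relay for it, to recover the source a returning worker should resume relay for. Below is a minimal, self-contained sketch of that lookup; the types and sample data are illustrative, not the scheduler's exact definitions.

package main

import "fmt"

// findRelaySource returns the relay source that the named worker serves,
// or "" if the worker is not registered as a relay worker for any source.
// relayWorkers maps a relay source ID to the set of worker names serving it.
func findRelaySource(relayWorkers map[string]map[string]struct{}, workerName string) string {
	for source, workers := range relayWorkers {
		if _, ok := workers[workerName]; ok {
			return source
		}
	}
	return ""
}

func main() {
	// hypothetical data mirroring the test below: worker2 and worker3 both relay for mysql-replica-02
	relayWorkers := map[string]map[string]struct{}{
		"mysql-replica-02": {"worker2": {}, "worker3": {}},
	}
	fmt.Println(findRelaySource(relayWorkers, "worker3")) // prints: mysql-replica-02
}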
14 changes: 14 additions & 0 deletions dm/tests/new_relay/conf/source2.yaml
@@ -0,0 +1,14 @@
source-id: mysql-replica-02
server-id: 123456
flavor: 'mysql'
enable-gtid: true
relay-binlog-name: ''
relay-binlog-gtid: ''
enable-relay: false
from:
  host: 127.0.0.1
  user: root
  password: /Q7B9DizNLLTTfiZHv9WoEAKamfpIUs=
  port: 3307
checker:
  check-enable: false
79 changes: 78 additions & 1 deletion dm/tests/new_relay/run.sh
@@ -83,6 +83,82 @@ function test_cant_dail_downstream() {
	cleanup_data $TEST_NAME
	cleanup_process

function test_restart_relay_status() {
	cleanup_data $TEST_NAME
	cleanup_process

	run_dm_master $WORK_DIR/master $MASTER_PORT $cur/conf/dm-master.toml
	check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT
	run_dm_worker $WORK_DIR/worker1 $WORKER1_PORT $cur/conf/dm-worker1.toml
	check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER1_PORT

	dmctl_operate_source create $cur/conf/source1.yaml $SOURCE_ID1

	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"start-relay -s $SOURCE_ID1 worker1"
	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"query-status -s $SOURCE_ID1" \
		"\"result\": true" 2 \
		"\"worker\": \"worker1\"" 1

	run_dm_worker $WORK_DIR/worker2 $WORKER2_PORT $cur/conf/dm-worker2.toml
	check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER2_PORT

	dmctl_operate_source create $cur/conf/source2.yaml $SOURCE_ID2

	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"start-relay -s $SOURCE_ID2 worker2"
	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"query-status -s $SOURCE_ID2" \
		"\"result\": true" 2 \
		"\"worker\": \"worker2\"" 1

	run_dm_worker $WORK_DIR/worker3 $WORKER3_PORT $cur/conf/dm-worker3.toml
	check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER3_PORT

	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"start-relay -s $SOURCE_ID2 worker3"
	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"query-status -s $SOURCE_ID2" \
		"\"result\": true" 3 \
		"\"worker\": \"worker2\"" 1 \
		"\"worker\": \"worker3\"" 1

	run_dm_ctl $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"list-member -n worker3" \
		"relay" 1

	kill_dm_worker
	kill_dm_master

	run_dm_master $WORK_DIR/master $MASTER_PORT $cur/conf/dm-master.toml
	check_rpc_alive $cur/../bin/check_master_online 127.0.0.1:$MASTER_PORT

	run_dm_worker $WORK_DIR/worker1 $WORKER1_PORT $cur/conf/dm-worker1.toml
	run_dm_worker $WORK_DIR/worker3 $WORKER3_PORT $cur/conf/dm-worker3.toml
	check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER1_PORT
	check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER3_PORT

	run_dm_worker $WORK_DIR/worker2 $WORKER2_PORT $cur/conf/dm-worker2.toml
	check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER2_PORT

	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"query-status -s $SOURCE_ID1" \
		"\"result\": true" 2 \
		"\"worker\": \"worker1\"" 1

	run_dm_ctl_with_retry $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"query-status -s $SOURCE_ID2" \
		"\"result\": true" 3 \
		"\"worker\": \"worker2\"" 1 \
		"\"worker\": \"worker3\"" 1

	run_dm_ctl $WORK_DIR "127.0.0.1:$MASTER_PORT" \
		"list-member --worker" \
		"relay" 1 \
		"bound" 2
}

function test_kill_dump_connection() {
	cleanup_data $TEST_NAME
	cleanup_process
@@ -108,7 +184,7 @@ function test_kill_dump_connection() {
		"\"worker\": \"worker1\"" 1
	run_sql_source1 "show processlist"

	# kill dumop connection to test wheather relay will auto reconnect db
	# kill dump connection to test whether relay will auto reconnect db
	dump_conn_id=$(cat $TEST_DIR/sql_res.$TEST_NAME.txt | grep Binlog -B 4 | grep Id | cut -d : -f2)
	run_sql_source1 "kill ${dump_conn_id}"

@@ -123,6 +199,7 @@ function test_kill_dump_connection() {
}

function run() {
	test_restart_relay_status
	test_cant_dail_downstream
	test_cant_dail_upstream
