From 6c3e00ad1f851891244b61220b30a034076d0efd Mon Sep 17 00:00:00 2001 From: Binbin Date: Sat, 13 Jul 2024 17:52:37 +0800 Subject: [PATCH 1/2] Improve cluster cant failover log conditions This PR adjusts the logging conditions of clusterLogCantFailover in these two ways. 1. For the same cant_failover_reason, we will print the log once in CLUSTER_CANT_FAILOVER_RELOG_PERIOD, but its value is 10s, which is a bit long, shorten it to 5s, so we can better track its state. 2. We will not print logs before the nolog_fail_time, its value is cluster-node-timeout+5000. This may cause us to lose some logs, for example, if cluster-node-timeout is small, auth_timeout will be 2000, and auth_retry_time will be 4000. In this case, we will lose all the reasons during the election if the failover is timed out. So remove the nolog_fail_time logic, since we still do have the CLUSTER_CANT_FAILOVER_RELOG_PERIOD logic, we won't print too many logs. Signed-off-by: Binbin --- src/cluster_legacy.c | 11 ----------- src/cluster_legacy.h | 2 +- 2 files changed, 1 insertion(+), 12 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 035b9fc876..0fcf95a096 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -4191,9 +4191,6 @@ int clusterGetReplicaRank(void) { * 2) Also, the log is emitted again if the primary is still down and * the reason for not failing over is still the same, but more than * CLUSTER_CANT_FAILOVER_RELOG_PERIOD seconds elapsed. - * 3) Finally, the function only logs if the replica is down for more than - * five seconds + NODE_TIMEOUT. This way nothing is logged when a - * failover starts in a reasonable time. * * The function is called with the reason why the replica can't failover * which is one of the integer macros CLUSTER_CANT_FAILOVER_*. 
@@ -4202,7 +4199,6 @@ int clusterGetReplicaRank(void) { void clusterLogCantFailover(int reason) { char *msg; static time_t lastlog_time = 0; - mstime_t nolog_fail_time = server.cluster_node_timeout + 5000; /* Don't log if we have the same reason for some time. */ if (reason == server.cluster->cant_failover_reason && @@ -4211,13 +4207,6 @@ void clusterLogCantFailover(int reason) { server.cluster->cant_failover_reason = reason; - /* We also don't emit any log if the primary failed no long ago, the - * goal of this function is to log replicas in a stalled condition for - * a long time. */ - if (myself->replicaof && nodeFailed(myself->replicaof) && - (mstime() - myself->replicaof->fail_time) < nolog_fail_time) - return; - switch (reason) { case CLUSTER_CANT_FAILOVER_DATA_AGE: msg = "Disconnected from primary for longer than allowed. " diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h index 3c5696273b..96c783507d 100644 --- a/src/cluster_legacy.h +++ b/src/cluster_legacy.h @@ -17,7 +17,7 @@ #define CLUSTER_CANT_FAILOVER_WAITING_DELAY 2 #define CLUSTER_CANT_FAILOVER_EXPIRED 3 #define CLUSTER_CANT_FAILOVER_WAITING_VOTES 4 -#define CLUSTER_CANT_FAILOVER_RELOG_PERIOD (10) /* seconds. */ +#define CLUSTER_CANT_FAILOVER_RELOG_PERIOD 5 /* seconds. */ /* clusterState todo_before_sleep flags. 
*/ #define CLUSTER_TODO_HANDLE_FAILOVER (1 << 0) From 9e050f841af939ff2c1a121f368bfa9529afce31 Mon Sep 17 00:00:00 2001 From: Binbin Date: Tue, 16 Jul 2024 17:31:55 +0800 Subject: [PATCH 2/2] change CLUSTER_CANT_FAILOVER_RELOG_PERIOD to 1s Signed-off-by: Binbin --- src/cluster_legacy.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h index 96c783507d..1c77f613a5 100644 --- a/src/cluster_legacy.h +++ b/src/cluster_legacy.h @@ -17,7 +17,7 @@ #define CLUSTER_CANT_FAILOVER_WAITING_DELAY 2 #define CLUSTER_CANT_FAILOVER_EXPIRED 3 #define CLUSTER_CANT_FAILOVER_WAITING_VOTES 4 -#define CLUSTER_CANT_FAILOVER_RELOG_PERIOD 5 /* seconds. */ +#define CLUSTER_CANT_FAILOVER_RELOG_PERIOD 1 /* seconds. */ /* clusterState todo_before_sleep flags. */ #define CLUSTER_TODO_HANDLE_FAILOVER (1 << 0)