From 6c3e00ad1f851891244b61220b30a034076d0efd Mon Sep 17 00:00:00 2001
From: Binbin <binloveplay1314@qq.com>
Date: Sat, 13 Jul 2024 17:52:37 +0800
Subject: [PATCH 1/2] Improve cluster cant failover log conditions

This PR adjusts the logging conditions of clusterLogCantFailover
in the following two ways.

1. For the same cant_failover_reason, we print the log at most once
per CLUSTER_CANT_FAILOVER_RELOG_PERIOD. Its value is 10s, which is
a bit long, so shorten it to 5s so we can better track its state.

2. We do not print any log until nolog_fail_time has elapsed since
the primary failed; its value is cluster-node-timeout+5000. This may
cause us to lose some logs. For example, if cluster-node-timeout is
small, auth_timeout will be 2000 and auth_retry_time will be 4000.
In this case, we will lose all the reasons during the election if
the failover times out. So remove the nolog_fail_time logic; since
we still have the CLUSTER_CANT_FAILOVER_RELOG_PERIOD logic, we won't
print too many logs.

Signed-off-by: Binbin <binloveplay1314@qq.com>
---
 src/cluster_legacy.c | 11 -----------
 src/cluster_legacy.h |  2 +-
 2 files changed, 1 insertion(+), 12 deletions(-)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index 035b9fc876..0fcf95a096 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -4191,9 +4191,6 @@ int clusterGetReplicaRank(void) {
  * 2) Also, the log is emitted again if the primary is still down and
  *    the reason for not failing over is still the same, but more than
  *    CLUSTER_CANT_FAILOVER_RELOG_PERIOD seconds elapsed.
- * 3) Finally, the function only logs if the replica is down for more than
- *    five seconds + NODE_TIMEOUT. This way nothing is logged when a
- *    failover starts in a reasonable time.
  *
  * The function is called with the reason why the replica can't failover
  * which is one of the integer macros CLUSTER_CANT_FAILOVER_*.
@@ -4202,7 +4199,6 @@ int clusterGetReplicaRank(void) {
 void clusterLogCantFailover(int reason) {
     char *msg;
     static time_t lastlog_time = 0;
-    mstime_t nolog_fail_time = server.cluster_node_timeout + 5000;
 
     /* Don't log if we have the same reason for some time. */
     if (reason == server.cluster->cant_failover_reason &&
@@ -4211,13 +4207,6 @@ void clusterLogCantFailover(int reason) {
 
     server.cluster->cant_failover_reason = reason;
 
-    /* We also don't emit any log if the primary failed no long ago, the
-     * goal of this function is to log replicas in a stalled condition for
-     * a long time. */
-    if (myself->replicaof && nodeFailed(myself->replicaof) &&
-        (mstime() - myself->replicaof->fail_time) < nolog_fail_time)
-        return;
-
     switch (reason) {
     case CLUSTER_CANT_FAILOVER_DATA_AGE:
         msg = "Disconnected from primary for longer than allowed. "
diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h
index 3c5696273b..96c783507d 100644
--- a/src/cluster_legacy.h
+++ b/src/cluster_legacy.h
@@ -17,7 +17,7 @@
 #define CLUSTER_CANT_FAILOVER_WAITING_DELAY 2
 #define CLUSTER_CANT_FAILOVER_EXPIRED 3
 #define CLUSTER_CANT_FAILOVER_WAITING_VOTES 4
-#define CLUSTER_CANT_FAILOVER_RELOG_PERIOD (10) /* seconds. */
+#define CLUSTER_CANT_FAILOVER_RELOG_PERIOD 5 /* seconds. */
 
 /* clusterState todo_before_sleep flags. */
 #define CLUSTER_TODO_HANDLE_FAILOVER (1 << 0)

From 9e050f841af939ff2c1a121f368bfa9529afce31 Mon Sep 17 00:00:00 2001
From: Binbin <binloveplay1314@qq.com>
Date: Tue, 16 Jul 2024 17:31:55 +0800
Subject: [PATCH 2/2] change CLUSTER_CANT_FAILOVER_RELOG_PERIOD to 1s

Signed-off-by: Binbin <binloveplay1314@qq.com>
---
 src/cluster_legacy.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h
index 96c783507d..1c77f613a5 100644
--- a/src/cluster_legacy.h
+++ b/src/cluster_legacy.h
@@ -17,7 +17,7 @@
 #define CLUSTER_CANT_FAILOVER_WAITING_DELAY 2
 #define CLUSTER_CANT_FAILOVER_EXPIRED 3
 #define CLUSTER_CANT_FAILOVER_WAITING_VOTES 4
-#define CLUSTER_CANT_FAILOVER_RELOG_PERIOD 5 /* seconds. */
+#define CLUSTER_CANT_FAILOVER_RELOG_PERIOD 1 /* seconds. */
 
 /* clusterState todo_before_sleep flags. */
 #define CLUSTER_TODO_HANDLE_FAILOVER (1 << 0)