From 8392a70aab15569c52f728f6d7e4a52ac030cbfb Mon Sep 17 00:00:00 2001 From: Liviu Chircu Date: Tue, 22 Oct 2024 17:46:55 +0300 Subject: [PATCH] clusterer: Prevent startup delays from causing aborted syncs This patch makes it so the 'seed_fallback_interval' modparam time-counting is done in a way that *ignores* any random startup delays (e.g. mod init stuff which may take time such as loading drouting, dispatcher, rtpproxy, etc.). Previously, the effect of this issue was that some modules (e.g. usrloc) would randomly *not* sync data on startup, depending on whether the startup delay happened to be less than or greater than the 'seed_fallback_interval'. (cherry picked from commit 928727410a63fa137f48aa8a77492c3c7b14b460) --- globals.c | 8 ++++++-- globals.h | 2 +- modules/clusterer/clusterer.c | 17 ++++++++++++----- modules/clusterer/clusterer.h | 9 +++++---- modules/clusterer/sync.c | 4 ++++ sr_module.c | 7 ++++++- status_report.c | 5 +++++ 7 files changed, 39 insertions(+), 13 deletions(-) diff --git a/globals.c b/globals.c index f418205c638..640071a329d 100644 --- a/globals.c +++ b/globals.c @@ -155,8 +155,12 @@ int process_no = 0; /* cfg parsing */ int cfg_errors=0; -/* start-up time */ -time_t startup_time = 0; +/** + * @startup_time - near to OpenSIPS launch time, see init_mi_uptime() + * @ready_time - per process, denotes completion of mod_init + child_init + * @ready_delay - difference between above two UNIX timestamps + */ +time_t startup_time, ready_time, ready_delay; /* shared memory (in MB) */ unsigned long shm_mem_size=SHM_MEM_SIZE * 1024 * 1024; diff --git a/globals.h b/globals.h index f3b0008ce26..51b796ab476 100644 --- a/globals.h +++ b/globals.h @@ -144,7 +144,7 @@ extern int max_while_loops; extern int sl_fwd_disabled; -extern time_t startup_time; +extern time_t startup_time, ready_time, ready_delay; extern char *db_version_table; extern char *db_default_url; diff --git a/modules/clusterer/clusterer.c b/modules/clusterer/clusterer.c index 
1f0f2c73156..f96ad4bd0df 100644 --- a/modules/clusterer/clusterer.c +++ b/modules/clusterer/clusterer.c @@ -73,6 +73,12 @@ void sync_check_timer(utime_t ticks, void *param) struct local_cap *cap; struct timeval now; + if (sr_get_core_status() != STATE_RUNNING) { + LM_DBG("opensips is not operational (state: %d), nothing " + "to check for now\n", sr_get_core_status()); + return; + } + gettimeofday(&now, NULL); lock_start_read(cl_list_lock); @@ -93,10 +99,11 @@ void sync_check_timer(utime_t ticks, void *param) if ((cap->flags & CAP_SYNC_PENDING) && (cl->current_node->flags & NODE_IS_SEED) && (TIME_DIFF(cap->sync_req_time, now) >= - seed_fb_interval*1000000)) { + ((cap->flags&CAP_SYNC_STARTUP ? ready_delay:0) + + seed_fb_interval) * 1000000)) { cap->flags |= CAP_STATE_OK; - cap->flags &= ~CAP_SYNC_PENDING; + cap->flags &= ~(CAP_SYNC_PENDING|CAP_SYNC_STARTUP); sr_set_status(cl_srg, STR2CI(cap->reg.sr_id), CAP_SR_SYNCED, STR2CI(CAP_SR_STATUS_STR(CAP_SR_SYNCED)), 0); sr_add_report_fmt(cl_srg, STR2CI(cap->reg.sr_id), 0, @@ -866,7 +873,7 @@ static void handle_cap_update(bin_packet_t *packet, node_info_t *source) node_id); if (rc == CLUSTERER_SEND_SUCCESS) { lock_get(source->cluster->lock); - lcap->flags &= ~CAP_SYNC_PENDING; + lcap->flags &= ~(CAP_SYNC_PENDING|CAP_SYNC_STARTUP); lock_release(source->cluster->lock); } else if (rc == CLUSTERER_SEND_ERR) LM_ERR("Failed to send sync request to node: %d\n", @@ -1640,7 +1647,7 @@ void do_actions_node_ev(cluster_info_t *clusters, int *select_cluster, /* check pending sync replies */ for (n_cap = node->capabilities; n_cap; n_cap = n_cap->next) { if (n_cap->flags & CAP_SYNC_PENDING) { - n_cap->flags &= ~CAP_SYNC_PENDING; + n_cap->flags &= ~(CAP_SYNC_PENDING|CAP_SYNC_STARTUP); lock_release(node->lock); /* reply now that the node is up */ if (ipc_dispatch_sync_reply(cl, node->node_id, @@ -1693,7 +1700,7 @@ void do_actions_node_ev(cluster_info_t *clusters, int *select_cluster, * a module tries to sync on node UP event */ if 
(rst_sync_pending) { lock_get(cl->lock); - cap_it->flags &= ~CAP_SYNC_PENDING; + cap_it->flags &= ~(CAP_SYNC_PENDING|CAP_SYNC_STARTUP); lock_release(cl->lock); } } diff --git a/modules/clusterer/clusterer.h b/modules/clusterer/clusterer.h index 29ff9c8e422..18acf1b85eb 100644 --- a/modules/clusterer/clusterer.h +++ b/modules/clusterer/clusterer.h @@ -54,10 +54,11 @@ #define NODE_IS_SEED (1<<3) /* capability flags */ -#define CAP_STATE_OK (1<<0) -#define CAP_SYNC_PENDING (1<<1) -#define CAP_SYNC_IN_PROGRESS (1<<2) -#define CAP_STATE_ENABLED (1<<3) +#define CAP_STATE_OK (1<<0) +#define CAP_SYNC_STARTUP (1<<1) +#define CAP_SYNC_PENDING (1<<2) +#define CAP_SYNC_IN_PROGRESS (1<<3) +#define CAP_STATE_ENABLED (1<<4) #define CAP_DISABLED 0 #define CAP_ENABLED 1 diff --git a/modules/clusterer/sync.c b/modules/clusterer/sync.c index 09cf6f31860..37d9602f8a3 100644 --- a/modules/clusterer/sync.c +++ b/modules/clusterer/sync.c @@ -95,6 +95,10 @@ int queue_sync_request(cluster_info_t *cluster, struct local_cap *lcap) { lock_get(cluster->lock); lcap->flags |= CAP_SYNC_PENDING; + if (sr_get_core_status() == STATE_INITIALIZING) + lcap->flags |= CAP_SYNC_STARTUP; + else + lcap->flags &= ~CAP_SYNC_STARTUP; if (cluster->current_node->flags & NODE_IS_SEED) gettimeofday(&lcap->sync_req_time, NULL); diff --git a/sr_module.c b/sr_module.c index efded7d60b5..f65743e7575 100644 --- a/sr_module.c +++ b/sr_module.c @@ -643,6 +643,7 @@ static int init_mod_child( struct sr_module* m, int rank, char *type, int init_child(int rank) { char* type; + int rc; type = 0; @@ -660,7 +661,11 @@ int init_child(int rank) type = "UNKNOWN"; } - return init_mod_child(modules, rank, type, 0); + rc = init_mod_child(modules, rank, type, 0); + ready_time = time(NULL); + ready_delay = ready_time - startup_time; + + return rc; } diff --git a/status_report.c b/status_report.c index 704ffef6594..0c110893324 100644 --- a/status_report.c +++ b/status_report.c @@ -697,6 +697,11 @@ static int _check_status(sr_group *srg, 
str *identifier, mi_item_t *id_item) int sr_set_core_status(int status, char *txt_s, int txt_len) { + if (status == STATE_RUNNING) { + ready_time = time(NULL); + ready_delay = ready_time - startup_time; + } + return sr_set_status( srg_core, CHAR_INT_NULL /*main*/, status, txt_s, txt_len, 0); }