Skip to content

Commit

Permalink
clusterer: Prevent startup delays from causing aborted syncs
Browse files Browse the repository at this point in the history
This patch makes the time-counting for the 'seed_fallback_interval'
modparam *ignore* any random startup delays (e.g. module initialization
work which may take a long time, such as loading drouting, dispatcher
or rtpproxy data).

Previously, the effect of this issue was that some modules (e.g. usrloc)
would randomly *not* sync data on startup, depending on whether the startup
delay happened to be shorter or longer than the 'seed_fallback_interval'.
  • Loading branch information
liviuchircu committed Oct 22, 2024
1 parent 2bf1bdc commit 9287274
Show file tree
Hide file tree
Showing 7 changed files with 39 additions and 13 deletions.
8 changes: 6 additions & 2 deletions globals.c
Original file line number Diff line number Diff line change
Expand Up @@ -155,8 +155,12 @@ int process_no = 0;
/* cfg parsing */
int cfg_errors=0;

/* start-up time */
time_t startup_time = 0;
/**
* @startup_time - near to OpenSIPS launch time, see init_mi_uptime()
* @ready_time - per process, denotes completion of mod_init + child_init
* @ready_delay - difference between the above two UNIX timestamps
*/
time_t startup_time, ready_time, ready_delay;

/* shared memory (in MB) */
unsigned long shm_mem_size=SHM_MEM_SIZE * 1024 * 1024;
Expand Down
2 changes: 1 addition & 1 deletion globals.h
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ extern int max_while_loops;

extern int sl_fwd_disabled;

extern time_t startup_time;
extern time_t startup_time, ready_time, ready_delay;

extern char *db_version_table;
extern char *db_default_url;
Expand Down
17 changes: 12 additions & 5 deletions modules/clusterer/clusterer.c
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,12 @@ void sync_check_timer(utime_t ticks, void *param)
struct local_cap *cap;
struct timeval now;

if (sr_get_core_status() != STATE_RUNNING) {
LM_DBG("opensips is not operational (state: %d), nothing "
"to check for now\n", sr_get_core_status());
return;
}

gettimeofday(&now, NULL);

lock_start_read(cl_list_lock);
Expand All @@ -94,10 +100,11 @@ void sync_check_timer(utime_t ticks, void *param)
if ((cap->flags & CAP_SYNC_PENDING) &&
(cl->current_node->flags & NODE_IS_SEED) &&
(TIME_DIFF(cap->sync_req_time, now) >=
seed_fb_interval*1000000)) {
((cap->flags&CAP_SYNC_STARTUP ? ready_delay:0)
+ seed_fb_interval) * 1000000)) {

cap->flags |= CAP_STATE_OK;
cap->flags &= ~CAP_SYNC_PENDING;
cap->flags &= ~(CAP_SYNC_PENDING|CAP_SYNC_STARTUP);
sr_set_status(cl_srg, STR2CI(cap->reg.sr_id), CAP_SR_SYNCED,
STR2CI(CAP_SR_STATUS_STR(CAP_SR_SYNCED)), 0);
sr_add_report_fmt(cl_srg, STR2CI(cap->reg.sr_id), 0,
Expand Down Expand Up @@ -867,7 +874,7 @@ static void handle_cap_update(bin_packet_t *packet, node_info_t *source)
node_id);
if (rc == CLUSTERER_SEND_SUCCESS) {
lock_get(source->cluster->lock);
lcap->flags &= ~CAP_SYNC_PENDING;
lcap->flags &= ~(CAP_SYNC_PENDING|CAP_SYNC_STARTUP);
lock_release(source->cluster->lock);
} else if (rc == CLUSTERER_SEND_ERR)
LM_ERR("Failed to send sync request to node: %d\n",
Expand Down Expand Up @@ -1645,7 +1652,7 @@ void do_actions_node_ev(cluster_info_t *clusters, int *select_cluster,
/* check pending sync replies */
for (n_cap = node->capabilities; n_cap; n_cap = n_cap->next) {
if (n_cap->flags & CAP_SYNC_PENDING) {
n_cap->flags &= ~CAP_SYNC_PENDING;
n_cap->flags &= ~(CAP_SYNC_PENDING|CAP_SYNC_STARTUP);
lock_release(node->lock);
/* reply now that the node is up */
if (ipc_dispatch_sync_reply(cl, node->node_id,
Expand Down Expand Up @@ -1698,7 +1705,7 @@ void do_actions_node_ev(cluster_info_t *clusters, int *select_cluster,
* a module tries to sync on node UP event */
if (rst_sync_pending) {
lock_get(cl->lock);
cap_it->flags &= ~CAP_SYNC_PENDING;
cap_it->flags &= ~(CAP_SYNC_PENDING|CAP_SYNC_STARTUP);
lock_release(cl->lock);
}
}
Expand Down
9 changes: 5 additions & 4 deletions modules/clusterer/clusterer.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,10 +54,11 @@
#define NODE_IS_SEED (1<<3)

/* capability flags */
#define CAP_STATE_OK (1<<0)
#define CAP_SYNC_PENDING (1<<1)
#define CAP_SYNC_IN_PROGRESS (1<<2)
#define CAP_STATE_ENABLED (1<<3)
#define CAP_STATE_OK (1<<0)
#define CAP_SYNC_STARTUP (1<<1)
#define CAP_SYNC_PENDING (1<<2)
#define CAP_SYNC_IN_PROGRESS (1<<3)
#define CAP_STATE_ENABLED (1<<4)

#define CAP_DISABLED 0
#define CAP_ENABLED 1
Expand Down
4 changes: 4 additions & 0 deletions modules/clusterer/sync.c
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,10 @@ int queue_sync_request(cluster_info_t *cluster, struct local_cap *lcap)
{
lock_get(cluster->lock);
lcap->flags |= CAP_SYNC_PENDING;
if (sr_get_core_status() == STATE_INITIALIZING)
lcap->flags |= CAP_SYNC_STARTUP;
else
lcap->flags &= ~CAP_SYNC_STARTUP;

if (cluster->current_node->flags & NODE_IS_SEED)
gettimeofday(&lcap->sync_req_time, NULL);
Expand Down
7 changes: 6 additions & 1 deletion sr_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -704,6 +704,7 @@ static int init_mod_child( struct sr_module* m, int rank, char *type,
int init_child(int rank)
{
char* type;
int rc;

type = 0;

Expand All @@ -721,7 +722,11 @@ int init_child(int rank)
type = "UNKNOWN";
}

return init_mod_child(modules, rank, type, 0);
rc = init_mod_child(modules, rank, type, 0);
ready_time = time(NULL);
ready_delay = ready_time - startup_time;

return rc;
}


Expand Down
5 changes: 5 additions & 0 deletions status_report.c
Original file line number Diff line number Diff line change
Expand Up @@ -697,6 +697,11 @@ static int _check_status(sr_group *srg, str *identifier, mi_item_t *id_item)

/**
 * Update the status of the core ("main") status-report identifier.
 *
 * When the requested state is STATE_RUNNING, the globals are refreshed
 * first: ready_time takes the current UNIX time and ready_delay takes
 * the distance between that moment and startup_time.
 *
 * @status:  new core state
 * @txt_s:   status text, handed over to sr_set_status() untouched
 * @txt_len: length of @txt_s, handed over to sr_set_status() untouched
 *
 * Returns the result of sr_set_status().
 */
int sr_set_core_status(enum sr_core_states status, char *txt_s, int txt_len)
{
	if (status == STATE_RUNNING) {
		/* sample the clock once, then derive both globals from it */
		time_t now = time(NULL);

		ready_time = now;
		ready_delay = now - startup_time;
	}

	return sr_set_status(srg_core, CHAR_INT_NULL /*main*/, status,
			txt_s, txt_len, 0);
}
Expand Down

0 comments on commit 9287274

Please sign in to comment.