Skip to content

Commit

Permalink
Adjusting schema as per latest discussions
Browse files Browse the repository at this point in the history
Signed-off-by: Rodny Molina <rmolina@linkedin.com>
  • Loading branch information
Rodny Molina committed Sep 6, 2018
1 parent 1f7a0cc commit f9be17a
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 44 deletions.
36 changes: 27 additions & 9 deletions doc/swss-schema.md
Original file line number Diff line number Diff line change
Expand Up @@ -647,6 +647,15 @@ Equivalent RedisDB entry:
; and push the delta to appDB
; Valid value is 1-9999. 0 is invalid.

bgp_timer = 1*4DIGIT ; bgp_timer holds the time interval utilized by fpmsyncd during warm-restart episodes.
; During this interval fpmsyncd will recover all the routing state previously pushed to
; AppDB, as well as all the new state coming from zebra/bgpd. Upon expiration of this
; timer, fpmsyncd will execute the reconciliation logic to eliminate all the stale
; state from AppDB. This timer should match the BGP-GR restart-timer configured within
; the elected routing-stack.
; Supported range: 1-9999.


### VXLAN\_TUNNEL
Stores vxlan tunnels configuration
Status: ready
Expand Down Expand Up @@ -674,15 +683,25 @@ Status: ready
;Status: work in progress

key = WARM_RESTART_TABLE:process_name ; process_name is a unique process identifier.
restart_count = 1*10DIGIT ; a number between 0 and 2147483647,
; count of warm start times.

state = "init" / "restored" / "reconciled" ; init: process init with warm start enabled.
; restored: process restored to the previous
; state using saved data.
; reconciled: process reconciled with up to date
; dynamic data like port state, neighbor, routes
; and so on.
restore_count = 1*10DIGIT ; a value between 0 and 2147483647 to keep track
; of the number of times that an application has
; 'restored' its state from its associated redis
; data-store; which is equivalent to the number
; of times an application has iterated through
; a warm-restart cycle.

state = "initialized" / "restored" / "reconciled" ; initialized: default/initial state for processes
; with warm-restart capabilities turned on. This
; state will be applied permanently for processes
; with warm-restart feature being turned off.
;
; restored: process restored to the previous
; state using saved data.
;
; reconciled: process reconciled with updated
; dynamic data like port state, neighbor, routes
; and so on.

## Configuration files
What configuration files should we have? Do apps, orch agent each need separate files?
Expand All @@ -692,4 +711,3 @@ What configuration files should we have? Do apps, orch agent each need separate
portsyncd reads from port_config.ini and updates PORT_TABLE in APP_DB

All other apps (intfsyncd) read from PORT_TABLE in APP_DB

40 changes: 20 additions & 20 deletions tests/test_warm_reboot.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,23 +4,23 @@
import time
import json

# Get restart count of all processes supporting warm restart
def swss_get_RestartCount(state_db):
restart_count = {}
# Get restore count of all processes supporting warm restart
def swss_get_RestoreCount(state_db):
restore_count = {}
warmtbl = swsscommon.Table(state_db, swsscommon.STATE_WARM_RESTART_TABLE_NAME)
keys = warmtbl.getKeys()
assert len(keys) != 0
for key in keys:
(status, fvs) = warmtbl.get(key)
assert status == True
for fv in fvs:
if fv[0] == "restart_count":
restart_count[key] = int(fv[1])
print(restart_count)
return restart_count
if fv[0] == "restore_count":
restore_count[key] = int(fv[1])
print(restore_count)
return restore_count

# function to check the restart count incremented by 1 for all processes supporting warm restart
def swss_check_RestartCount(state_db, restart_count):
# function to check the restore count incremented by 1 for all processes supporting warm restart
def swss_check_RestoreCount(state_db, restore_count):
warmtbl = swsscommon.Table(state_db, swsscommon.STATE_WARM_RESTART_TABLE_NAME)
keys = warmtbl.getKeys()
print(keys)
Expand All @@ -29,8 +29,8 @@ def swss_check_RestartCount(state_db, restart_count):
(status, fvs) = warmtbl.get(key)
assert status == True
for fv in fvs:
if fv[0] == "restart_count":
assert int(fv[1]) == restart_count[key] + 1
if fv[0] == "restore_count":
assert int(fv[1]) == restore_count[key] + 1
elif fv[0] == "state":
assert fv[1] == "reconciled"

Expand All @@ -46,21 +46,21 @@ def check_port_oper_status(appl_db, port_name, state):
break
assert oper_status == state

# function to check the restart count incremented by 1 for a single process
def swss_app_check_RestartCount_single(state_db, restart_count, name):
# function to check the restore count incremented by 1 for a single process
def swss_app_check_RestoreCount_single(state_db, restore_count, name):
warmtbl = swsscommon.Table(state_db, swsscommon.STATE_WARM_RESTART_TABLE_NAME)
keys = warmtbl.getKeys()
print(keys)
print(restart_count)
print(restore_count)
assert len(keys) > 0
for key in keys:
if key != name:
continue
(status, fvs) = warmtbl.get(key)
assert status == True
for fv in fvs:
if fv[0] == "restart_count":
assert int(fv[1]) == restart_count[key] + 1
if fv[0] == "restore_count":
assert int(fv[1]) == restore_count[key] + 1
elif fv[0] == "state":
assert fv[1] == "reconciled"
def create_entry(tbl, key, pairs):
Expand Down Expand Up @@ -146,7 +146,7 @@ def test_PortSyncdWarmRestart(dvs):
(status, fvs) = neighTbl.get("Ethernet20:11.0.0.10")
assert status == True

restart_count = swss_get_RestartCount(state_db)
restore_count = swss_get_RestoreCount(state_db)

# restart portsyncd
dvs.runcmd(['sh', '-c', 'pkill -x portsyncd; cp /var/log/swss/sairedis.rec /var/log/swss/sairedis.rec.b; echo > /var/log/swss/sairedis.rec'])
Expand Down Expand Up @@ -175,7 +175,7 @@ def test_PortSyncdWarmRestart(dvs):
check_port_oper_status(appl_db, "Ethernet24", "up")


swss_app_check_RestartCount_single(state_db, restart_count, "portsyncd")
swss_app_check_RestoreCount_single(state_db, restore_count, "portsyncd")


def test_VlanMgrdWarmRestart(dvs):
Expand Down Expand Up @@ -263,7 +263,7 @@ def test_VlanMgrdWarmRestart(dvs):
(exitcode, bv_before) = dvs.runcmd("bridge vlan")
print(bv_before)

restart_count = swss_get_RestartCount(state_db)
restore_count = swss_get_RestoreCount(state_db)

dvs.runcmd(['sh', '-c', 'pkill -x vlanmgrd; cp /var/log/swss/sairedis.rec /var/log/swss/sairedis.rec.b; echo > /var/log/swss/sairedis.rec'])
dvs.runcmd(['sh', '-c', 'supervisorctl start vlanmgrd'])
Expand All @@ -284,4 +284,4 @@ def test_VlanMgrdWarmRestart(dvs):
(status, fvs) = tbl.get("Vlan20:11.0.0.11")
assert status == True

swss_app_check_RestartCount_single(state_db, restart_count, "vlanmgrd")
swss_app_check_RestoreCount_single(state_db, restore_count, "vlanmgrd")
31 changes: 16 additions & 15 deletions warmrestart/warm_restart.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,13 +55,14 @@ void WarmStart::initialize(const std::string &app_name,
* No need to check docker level knobs in this case since the whole system is being rebooted .
* <2> Upon docker service start, first to check system knob.
* if enabled, docker warm start should be performed, otherwise system warm reboot will be ruined.
* If system knob disabled, while docker knob enabled, this is likely an individual docker warm restart request.
* if enabled, docker warm-start should be performed, otherwise system warm-reboot will be ruined.
* If system knob disabled, while docker knob enabled, this is likely an individual docker
* warm-restart request.
* Within each application which should take care warm start case,
* when the system knob or docker knob enabled, we do further check on the
* actual warm start state ( restart_count), if no warm start state data available,
* the database has been flushed, do cold start. Otherwise warm start.
* actual warm-start state (restore_count). If no warm-start state data is available,
* the database has been flushed, so do a cold start. Otherwise, do a warm-start.
*/

/*
Expand Down Expand Up @@ -95,31 +96,31 @@ bool WarmStart::checkWarmStart(const std::string &app_name,
// Create the entry for this app here.
if (!warmStart.m_enabled)
{
warmStart.m_stateWarmRestartTable->hset(app_name, "restart_count", "0");
warmStart.m_stateWarmRestartTable->hset(app_name, "restore_count", "0");
return false;
}

uint32_t restart_count = 0;
warmStart.m_stateWarmRestartTable->hget(app_name, "restart_count", value);
uint32_t restore_count = 0;
warmStart.m_stateWarmRestartTable->hget(app_name, "restore_count", value);
if (value == "")
{
SWSS_LOG_WARN("%s doing warm start, but restart_count not found in stateDB %s table, fall back to cold start",
SWSS_LOG_WARN("%s doing warm start, but restore_count not found in stateDB %s table, fall back to cold start",
app_name.c_str(), STATE_WARM_RESTART_TABLE_NAME);
warmStart.m_enabled = false;
warmStart.m_stateWarmRestartTable->hset(app_name, "restart_count", "0");
warmStart.m_stateWarmRestartTable->hset(app_name, "restore_count", "0");
return false;
}
else
{
restart_count = (uint32_t)stoul(value);
restore_count = (uint32_t)stoul(value);
}

restart_count++;
warmStart.m_stateWarmRestartTable->hset(app_name, "restart_count",
std::to_string(restart_count));
restore_count++;
warmStart.m_stateWarmRestartTable->hset(app_name, "restore_count",
std::to_string(restore_count));

SWSS_LOG_NOTICE("%s doing warm start, restart count %d", app_name.c_str(),
restart_count);
SWSS_LOG_NOTICE("%s doing warm start, restore count %d", app_name.c_str(),
restore_count);

return true;
}
Expand Down

0 comments on commit f9be17a

Please sign in to comment.