Skip to content

Commit

Permalink
[buffermgr/bufferorch] Support dynamic buffer calculation
Browse files Browse the repository at this point in the history
1. Extend the CLI options for buffermgrd:
   -a: asic_table provided,
   -p: peripheral_table provided
   With -a provided, the buffermgrd will start in dynamic headroom calculation mode;
   otherwise it will start in the legacy mode (pg_headroom_profile lookup)
2. A new class is provided for dynamic buffer calculation while the old one remains.
   The daemon will instantiate the corresponding class according to the CLI option when it starts.
3. In both modes, the buffermgrd will copy BUFFER_XXX tables from CONFIG_DB to APPL_DB
   and the bufferorch will consume BUFFER_XXX tables from APPL_DB
The following points are for dynamic buffer calculation mode
4. In the dynamic buffer calculation mode, 3 lua plugins are provided for vendor-specific operations:
   - buffer_headroom_<vendor>.lua, for calculating headroom size.
   - buffer_pool_<vendor>.lua, for calculating buffer pool size.
   - buffer_check_headroom_<vendor>.lua, for checking whether headroom exceeds the limit
5. During initialization, The daemon will:
   - load asic_table and peripheral_table from the given json file, parse them
     and push them into STATE_DB.ASIC_TABLE and STATE_DB.PERIPHERAL_TABLE respectively
   - load all plugins
   - try to load the STATE_DB.BUFFER_MAX_PARAM.mmu_size which is used for updating buffer pool size
   - a timer will be started for periodic buffer pool size audit
6. The daemon will listen to and handle the following tables from CONFIG_DB
   The tables will be cached internally in the daemon for the purpose of saving access time
   - BUFFER_POOL:
     - if size is provided: insert the entry to APPL_DB
     - otherwise: cache them and push to APPL_DB after the size is calculated by lua plugin
   - BUFFER_PROFILE and BUFFER_PG:
     - items for ingress lossless headroom need to be cached and handled (according to the design)
     - other items will be inserted to the APPL_DB directly
   - PORT_TABLE, for ports' speed update
   - CABLE_LENGTH, for ports' cable length
7. Other tables will be copied to APPL_DB directly:
   - BUFFER_QUEUE
   - BUFFER_PORT_INGRESS_PROFILE_LIST
   - BUFFER_PORT_EGRESS_PROFILE_LIST
8. BufferOrch modified accordingly:
   - Consume buffer relevant tables from APPL_DB instead of CONFIG_DB
   - For BUFFER_POOL, don't set ingress/egress and static/dynamic to sai if the pool has already existed
     because that will fail the sai
   - For BUFFER_PROFILE, don't set pool for the same reason
9. Warm reboot:
   - db_migrator is responsible for copying the data from CONFIG_DB to APPL_DB if switch is warm-rebooted
   from an old image to the new image for the first time
   - no specific handling in the daemon side
10.Provide vstest script

Signed-off-by: Stephen Sun <stephens@mellanox.com>
  • Loading branch information
Stephen Sun committed Jul 1, 2020
1 parent c05601c commit d55a7e9
Show file tree
Hide file tree
Showing 17 changed files with 3,052 additions and 133 deletions.
9 changes: 8 additions & 1 deletion cfgmgr/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,13 @@ LIBNL_LIBS = -lnl-genl-3 -lnl-route-3 -lnl-3

bin_PROGRAMS = vlanmgrd teammgrd portmgrd intfmgrd buffermgrd vrfmgrd nbrmgrd vxlanmgrd sflowmgrd natmgrd

cfgmgrdir = $(datadir)/swss

dist_cfgmgr_DATA = \
buffer_check_headroom_mellanox.lua \
buffer_headroom_mellanox.lua \
buffer_pool_mellanox.lua

if DEBUG
DBGFLAGS = -ggdb -DDEBUG
else
Expand All @@ -31,7 +38,7 @@ intfmgrd_CFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI)
intfmgrd_CPPFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI)
intfmgrd_LDADD = -lswsscommon

buffermgrd_SOURCES = buffermgrd.cpp buffermgr.cpp $(top_srcdir)/orchagent/orch.cpp $(top_srcdir)/orchagent/request_parser.cpp shellcmd.h
buffermgrd_SOURCES = buffermgrd.cpp buffermgr.cpp buffermgrdyn.cpp $(top_srcdir)/orchagent/orch.cpp $(top_srcdir)/orchagent/request_parser.cpp shellcmd.h
buffermgrd_CFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI)
buffermgrd_CPPFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI)
buffermgrd_LDADD = -lswsscommon
Expand Down
80 changes: 80 additions & 0 deletions cfgmgr/buffer_check_headroom_mellanox.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
-- KEYS - port name
--
-- Checks whether the headroom accumulated over the port's PGs stays below
-- the max_headroom_size stored in STATE_DB.
-- Returns a list of strings. The first entry is "result:true" or
-- "result:false"; when the check is actually performed, "max headroom:<n>"
-- and "accumulative headroom:<n>" follow as diagnostics.

local port = KEYS[1]
local profile
local lossless_profile
local lossless_headroom_size
local lossless_profile_name
local accumulative_size = 0

local appl_db = "0"
local state_db = "6"

local ret = {}

-- Fetch the threshold from STATE_DB
redis.call('SELECT', state_db)

-- Without a BUFFER_MAX_PARAM entry there is no limit to check against,
-- so the check passes.
local asic_keys = redis.call('KEYS', 'BUFFER_MAX_PARAM*')
if #asic_keys == 0 then
    table.insert(ret, "result:true")
    return ret
end

-- Same when the entry exists but carries no numeric max_headroom_size
-- (tonumber yields nil for a missing or non-numeric field).
local max_headroom_size = tonumber(redis.call('HGET', asic_keys[1], 'max_headroom_size'))
if max_headroom_size == nil then
    table.insert(ret, "result:true")
    return ret
end

-- Account for the pipeline latency. It is skipped when ASIC_TABLE or its
-- pipeline_latency field is absent, instead of aborting the script with an
-- arithmetic-on-nil error.
-- NOTE(review): the * 1024 suggests pipeline_latency is stored in KB -- confirm.
asic_keys = redis.call('KEYS', 'ASIC_TABLE*')
if #asic_keys > 0 then
    local pipeline_delay = tonumber(redis.call('HGET', asic_keys[1], 'pipeline_latency'))
    if pipeline_delay ~= nil then
        accumulative_size = accumulative_size + 2 * pipeline_delay * 1024
    end
end

-- Fetch all keys in BUFFER_PG according to the port
redis.call('SELECT', appl_db)

--- Count the priority groups addressed by a BUFFER_PG key.
-- The PG part is the text after the last ':' of the key and is either a
-- single id ("3") or an inclusive range ("3-4").
-- @param keyname key such as "BUFFER_PG:Ethernet0:3-4"
-- @return number of PGs covered by the key; 0 when the key carries no
--         recognizable PG id or range
local function get_number_of_pgs(keyname)
    -- Inclusive range "<first>-<last>"; parsed as whole numbers so that
    -- multi-digit ids are handled as well
    local first, last = string.match(keyname, ":(%d+)%-(%d+)$")
    if first ~= nil then
        return 1 + tonumber(last) - tonumber(first)
    end
    -- Single PG id
    if string.match(keyname, ":%d+$") ~= nil then
        return 1
    end
    -- Nothing recognizable: contribute no PGs rather than raising an error
    return 0
end

-- Fetch all the PGs, accumulate the sizes
-- Assume there is only one lossless profile configured among all PGs on each port
-- The ':' after the port name keeps a port such as Ethernet1 from also
-- picking up the PGs of Ethernet10, Ethernet11, ...
local pg_keys = redis.call('KEYS', 'BUFFER_PG:' .. port .. ':*')
for i = 1, #pg_keys do
    local profile_ref = redis.call('HGET', pg_keys[i], 'profile')
    -- A missing field is not a string here; such PGs are skipped
    if type(profile_ref) == 'string' then
        -- Drop the first and last character of the reference
        -- (presumably the enclosing brackets -- confirm against the writer)
        profile = string.sub(profile_ref, 2, -2)
        if lossless_profile_name ~= nil then
            -- The lossless profile is already known: reuse its cached size
            if profile == lossless_profile_name then
                accumulative_size = accumulative_size + lossless_headroom_size * get_number_of_pgs(pg_keys[i])
            end
        else
            lossless_profile = redis.call('HGETALL', profile)
            local has_xoff = false
            local profile_size
            for j = 1, #lossless_profile, 2 do
                if lossless_profile[j] == 'xoff' then
                    has_xoff = true
                elseif lossless_profile[j] == 'size' then
                    profile_size = tonumber(lossless_profile[j+1])
                end
            end
            -- Only a profile carrying an 'xoff' field is treated as lossless.
            -- Its size is added to (not substituted for) what has been
            -- accumulated so far, so the pipeline latency share is kept.
            if has_xoff and profile_size ~= nil then
                lossless_profile_name = profile
                lossless_headroom_size = profile_size
                accumulative_size = accumulative_size + profile_size * get_number_of_pgs(pg_keys[i])
            end
        end
    end
end

-- The check passes only while the accumulated headroom is strictly below the limit
if max_headroom_size > accumulative_size then
    table.insert(ret, "result:true")
else
    table.insert(ret, "result:false")
end

table.insert(ret, "max headroom:" .. max_headroom_size)
table.insert(ret, "accumulative headroom:" .. accumulative_size)

return ret
1 change: 1 addition & 0 deletions cfgmgr/buffer_check_headroom_vs.lua
131 changes: 131 additions & 0 deletions cfgmgr/buffer_headroom_mellanox.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
-- KEYS - profile name
-- ARGV[1] - port speed
-- ARGV[2] - cable length
-- ARGV[3] - gearbox delay (0 is used when it is absent or not a number)

-- Parameters retrieved from the databases:
-- From CONFIG_DB.LOSSLESS_TRAFFIC_PATTERN
--   small_packet_percentage: the parameter which is used to control worst case regarding the cell utilization
--   mtu: the mtu of lossless packet
-- From STATE_DB.ASIC_TABLE:
--   cell_size: cell size of the ASIC
--   pipeline_latency
--   mac_phy_delay
--   peer_response_time

-- Filled in from CONFIG_DB.LOSSLESS_TRAFFIC_PATTERN
local mtu, small_packet_percentage
-- Filled in from STATE_DB.ASIC_TABLE
local cell_size, pipeline_latency, mac_phy_delay, peer_response_time

-- Result list handed back to the caller
local ret = {}

-- Database indexes passed to SELECT
local appl_db, config_db, state_db = "0", "4", "6"

local port_speed = tonumber(ARGV[1])
-- The last character of the cable length is dropped before the conversion
-- (presumably a unit suffix -- confirm against the caller)
local cable_length = tonumber(string.sub(ARGV[2], 1, -2))
local gearbox_delay = tonumber(ARGV[3]) or 0

-- Fetch ASIC info from ASIC table in STATE_DB
redis.call('SELECT', state_db)
local asic_table_keys = redis.call('KEYS', 'ASIC_TABLE*')

-- Only one key should exist, so the first one returned is read.
-- HGETALL yields a flat list: field, value, field, value, ...
local asic_fields = redis.call('HGETALL', asic_table_keys[1])
for idx = 1, #asic_fields, 2 do
    local field, value = asic_fields[idx], asic_fields[idx + 1]
    if field == "cell_size" then
        cell_size = tonumber(value)
    elseif field == "pipeline_latency" then
        -- The three latency-like fields are scaled by 1024 when read
        pipeline_latency = tonumber(value) * 1024
    elseif field == "mac_phy_delay" then
        mac_phy_delay = tonumber(value) * 1024
    elseif field == "peer_response_time" then
        peer_response_time = tonumber(value) * 1024
    end
end

-- Fetch lossless traffic info from CONFIG_DB
redis.call('SELECT', config_db)
local traffic_pattern_keys = redis.call('KEYS', 'LOSSLESS_TRAFFIC_PATTERN*')

-- Only one key should exist, so the first one returned is read
local traffic_pattern = redis.call('HGETALL', traffic_pattern_keys[1])
for idx = 1, #traffic_pattern, 2 do
    local field = traffic_pattern[idx]
    if field == "mtu" then
        mtu = tonumber(traffic_pattern[idx + 1])
    elseif field == "small_packet_percentage" then
        small_packet_percentage = tonumber(traffic_pattern[idx + 1])
    end
end

-- Fetch DEFAULT_LOSSLESS_BUFFER_PARAMETER from CONFIG_DB
local default_param_keys = redis.call('KEYS', 'DEFAULT_LOSSLESS_BUFFER_PARAMETER*')

-- Only one key should exist; its default_dynamic_th is reported as the threshold
local default_threshold = redis.call('HGET', default_param_keys[1], 'default_dynamic_th')

-- Calculate the headroom information
local speed_of_light = 198000000
local minimal_packet_size = 64

-- Round a value up to the next multiple of 1024
local function round_up_1024(value)
    return math.ceil(value / 1024) * 1024
end

-- Adjustment for 400G: the pipeline latency is doubled and one mtu is
-- added on top of the headroom
local speed_overhead = 0
if port_speed == 400000 then
    pipeline_latency = pipeline_latency * 2
    speed_overhead = mtu
end

local worst_case_factor
if cell_size > 2 * minimal_packet_size then
    worst_case_factor = cell_size / minimal_packet_size
else
    worst_case_factor = (2 * cell_size) / (1 + cell_size)
end

-- Blend of normal packets (factor 1) and small packets (worst case factor),
-- weighted by the small packet percentage
local cell_occupancy = (100 - small_packet_percentage + small_packet_percentage * worst_case_factor) / 100

local bytes_on_gearbox = 0
if gearbox_delay ~= 0 then
    bytes_on_gearbox = port_speed * gearbox_delay / (8 * 1000)
end

local bytes_on_cable = 2 * cable_length * port_speed * 1000000 / speed_of_light / 8
local propagation_delay = mtu + 2 * (bytes_on_cable + bytes_on_gearbox) + mac_phy_delay + peer_response_time

-- Calculate the xoff and xon and then round up at 1024 bytes
local xoff_value = round_up_1024(mtu + propagation_delay * cell_occupancy)
local xon_value = round_up_1024(pipeline_latency)
local headroom_size = round_up_1024(xoff_value + xon_value + speed_overhead)

-- Each result entry is a "<name>:<value>" string
ret[#ret + 1] = "xon" .. ":" .. math.ceil(xon_value)
ret[#ret + 1] = "xoff" .. ":" .. math.ceil(xoff_value)
ret[#ret + 1] = "size" .. ":" .. math.ceil(headroom_size)
ret[#ret + 1] = "threshold" .. ":" .. default_threshold

return ret
1 change: 1 addition & 0 deletions cfgmgr/buffer_headroom_vs.lua
Loading

0 comments on commit d55a7e9

Please sign in to comment.