-
Notifications
You must be signed in to change notification settings - Fork 532
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[buffermgr/bufferorch] Support dynamic buffer calculation
1. Extend the CLI options for buffermgrd: -a: asic_table provided, -p: peripheral_table provided. The buffermgrd will start in dynamic headroom calculation mode with -a provided; otherwise it will start in the legacy mode (pg_headroom_profile looking up) 2. A new class is provided for dynamic buffer calculation while the old one remains. The daemon will instantiate the corresponding class according to the CLI option when it starts. 3. In both modes, the buffermgrd will copy BUFFER_XXX tables from CONFIG_DB to APPL_DB and the bufferorch will consume BUFFER_XXX tables from APPL_DB. The following points are for dynamic buffer calculation mode 4. In the dynamic buffer calculation mode, there are 3 lua plugins provided for vendor-specific operations: - buffer_headroom_<vendor>.lua, for calculating headroom size. - buffer_pool_<vendor>.lua, for calculating buffer pool size. - buffer_check_headroom_<vendor>.lua, for checking whether headroom exceeds the limit 5. During initialization, the daemon will: - load asic_table and peripheral_table from the given json file, parse them and push them into STATE_DB.ASIC_TABLE and STATE_DB.PERIPHERAL_TABLE respectively - load all plugins - try to load the STATE_DB.BUFFER_MAX_PARAM.mmu_size which is used for updating buffer pool size - a timer will be started for periodic buffer pool size audit 6. The daemon will listen to and handle the following tables from CONFIG_DB. The tables will be cached internally in the daemon for the purpose of saving access time - BUFFER_POOL: - if size is provided: insert the entry to APPL_DB - otherwise: cache them and push to APPL_DB after the size is calculated by lua plugin - BUFFER_PROFILE and BUFFER_PG: - items for ingress lossless headroom need to be cached and handled (according to the design) - other items will be inserted to the APPL_DB directly - PORT_TABLE, for ports' speed update - CABLE_LENGTH, for ports' cable length 7. 
Other tables will be copied to APPL_DB directly: - BUFFER_QUEUE - BUFFER_PORT_INGRESS_PROFILE_LIST - BUFFER_PORT_EGRESS_PROFILE_LIST 8. BufferOrch modified accordingly: - Consume buffer relevant tables from APPL_DB instead of CONFIG_DB - For BUFFER_POOL, don't set ingress/egress and static/dynamic to sai if the pool already exists because that will fail the sai - For BUFFER_PROFILE, don't set pool for the same reason 9. Warm reboot: - db_migrator is responsible for copying the data from CONFIG_DB to APPL_DB if switch is warm-rebooted from an old image to the new image for the first time - no specific handling in the daemon side 10. Provide vstest script Signed-off-by: Stephen Sun <stephens@mellanox.com>
- Loading branch information
Stephen Sun
committed
Jul 1, 2020
1 parent
c05601c
commit d55a7e9
Showing
17 changed files
with
3,052 additions
and
133 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
-- Check whether the accumulative lossless headroom configured on a port
-- exceeds the per-port maximum headroom the ASIC supports.
-- KEYS[1] - port name
-- Returns a list of strings:
--   "result:true"  - headroom fits (or the limit is unknown, so the check
--                    is skipped and treated as passed)
--   "result:false" - headroom exceeds the limit
-- followed by "max headroom:<n>" and "accumulative headroom:<n>" debug info.

local port = KEYS[1]
local profile
local lossless_profile
local lossless_headroom_size
local lossless_profile_name
local accumulative_size = 0

-- Redis logical database numbers used by SONiC
local appl_db = "0"
local state_db = "6"

local ret = {}

-- Fetch the per-ASIC headroom limit from STATE_DB.
-- If BUFFER_MAX_PARAM or its max_headroom_size field is absent, there is
-- nothing to check against, so report success.
redis.call('SELECT', state_db)

local asic_keys = redis.call('KEYS', 'BUFFER_MAX_PARAM*')
if #asic_keys == 0 then
    table.insert(ret, "result:true")
    return ret
end

local max_headroom_size = tonumber(redis.call('HGET', asic_keys[1], 'max_headroom_size'))
if max_headroom_size == nil then
    table.insert(ret, "result:true")
    return ret
end

-- Reserve space for the pipeline latency (stored in KB, hence * 1024),
-- doubled per the headroom design.
asic_keys = redis.call('KEYS', 'ASIC_TABLE*')
local pipeline_delay = tonumber(redis.call('HGET', asic_keys[1], 'pipeline_latency'))
accumulative_size = accumulative_size + 2 * pipeline_delay * 1024

-- BUFFER_PG entries live in APPL_DB
redis.call('SELECT', appl_db)

-- Number of PGs covered by a BUFFER_PG key suffix:
-- a single digit like "...:3" covers 1 PG, a range like "...:3-4" covers 2.
local function get_number_of_pgs(keyname)
    local range = string.match(keyname, "Ethernet%d+:([^%s]+)$")
    local size
    if string.len(range) == 1 then
        size = 1
    else
        size = 1 + tonumber(string.sub(range, -1)) - tonumber(string.sub(range, 1, 1))
    end
    return size
end

-- Fetch all the PGs, accumulate the sizes
-- Assume there is only one lossless profile configured among all PGs on each port
local pg_keys = redis.call('KEYS', 'BUFFER_PG:' .. port .. '*')
for i = 1, #pg_keys do
    -- Strip the surrounding reference brackets "[...]" from the profile name
    profile = string.sub(redis.call('HGET', pg_keys[i], 'profile'), 2, -2)
    if lossless_profile_name ~= nil then
        if profile == lossless_profile_name then
            accumulative_size = accumulative_size + lossless_headroom_size * get_number_of_pgs(pg_keys[i])
        end
    else
        lossless_profile = redis.call('HGETALL', profile)
        for j = 1, #lossless_profile, 2 do
            if lossless_profile[j] == 'xoff' then
                -- only lossless profiles carry an 'xoff' field
                lossless_profile_name = profile
            end
            if lossless_profile[j] == 'size' then
                lossless_headroom_size = tonumber(lossless_profile[j+1])
                -- BUGFIX: add to the accumulator instead of overwriting it;
                -- the original assignment discarded the pipeline-latency
                -- reservation (and any previously accumulated PG headroom).
                accumulative_size = accumulative_size + lossless_headroom_size * get_number_of_pgs(pg_keys[i])
            end
        end
    end
end

if max_headroom_size > accumulative_size then
    table.insert(ret, "result:true")
else
    table.insert(ret, "result:false")
end

table.insert(ret, "max headroom:" .. max_headroom_size)
table.insert(ret, "accumulative headroom:" .. accumulative_size)

return ret
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
buffer_check_headroom_mellanox.lua |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,131 @@ | ||
-- Calculate the lossless headroom (xon, xoff, size, threshold) for a
-- buffer profile from the port attributes and the ASIC constants.
-- KEYS - profile name
-- ARGV[1] - port speed
-- ARGV[2] - cable length
-- ARGV[3] - gearbox delay

-- parameters retrieved from databases:
-- From CONFIG_DB.LOSSLESS_TRAFFIC_PATTERN
--   small packet percentage: the parameter which is used to control worst case regarding the cell utilization
--   mtu: the mtu of lossless packet
-- From STATE_DB.ASIC_TABLE:
--   cell size: cell_size of the ASIC
--   pipeline_latency: the latency
--   mac_phy_delay:
--   peer_response_time:

local mtu
local small_packet_percentage
local cell_size
local pipeline_latency
local mac_phy_delay
local peer_response_time

local port_speed = tonumber(ARGV[1])
-- cable length arrives with a unit suffix (e.g. "40m"); strip the last char
-- NOTE(review): assumes a single-character unit -- confirm against callers
local cable_length = tonumber(string.sub(ARGV[2], 1, -2))
local gearbox_delay = tonumber(ARGV[3])

-- Redis logical database numbers used by SONiC
local appl_db = "0"
local config_db = "4"
local state_db = "6"

local ret = {}

-- No gearbox on the port: treat its delay as zero
if gearbox_delay == nil then
    gearbox_delay = 0
end

-- Fetch ASIC info from ASIC table in STATE_DB
redis.call('SELECT', state_db)
local asic_keys = redis.call('KEYS', 'ASIC_TABLE*')

-- Only one key should exist
local asic_table_content = redis.call('HGETALL', asic_keys[1])
-- HGETALL returns a flat field/value list, hence the step of 2;
-- delay/latency fields are stored in KB and converted to bytes here
for i = 1, #asic_table_content, 2 do
    if asic_table_content[i] == "cell_size" then
        cell_size = tonumber(asic_table_content[i+1])
    end
    if asic_table_content[i] == "pipeline_latency" then
        pipeline_latency = tonumber(asic_table_content[i+1]) * 1024
    end
    if asic_table_content[i] == "mac_phy_delay" then
        mac_phy_delay = tonumber(asic_table_content[i+1]) * 1024
    end
    if asic_table_content[i] == "peer_response_time" then
        peer_response_time = tonumber(asic_table_content[i+1]) * 1024
    end
end

-- Fetch lossless traffic info from CONFIG_DB
redis.call('SELECT', config_db)
local lossless_traffic_keys = redis.call('KEYS', 'LOSSLESS_TRAFFIC_PATTERN*')

-- Only one key should exist
local lossless_traffic_table_content = redis.call('HGETALL', lossless_traffic_keys[1])
for i = 1, #lossless_traffic_table_content, 2 do
    if lossless_traffic_table_content[i] == "mtu" then
        mtu = tonumber(lossless_traffic_table_content[i+1])
    end
    if lossless_traffic_table_content[i] == "small_packet_percentage" then
        small_packet_percentage = tonumber(lossless_traffic_table_content[i+1])
    end
end

-- Fetch DEFAULT_LOSSLESS_BUFFER_PARAMETER from CONFIG_DB
-- (re-uses the lossless_traffic_keys variable for the new key list)
local lossless_traffic_keys = redis.call('KEYS', 'DEFAULT_LOSSLESS_BUFFER_PARAMETER*')

-- Only one key should exist
-- NOTE(review): if the key or field is absent this stays nil and the final
-- concatenation below would raise -- confirm the table is always populated
local default_threshold = redis.call('HGET', lossless_traffic_keys[1], 'default_dynamic_th')

-- Calculate the headroom information
-- speed of light in a cable, used to derive propagation delay (m/s scale
-- chosen so that cable_length[m] * port_speed[Mb/s] works out to bits)
local speed_of_light = 198000000
local minimal_packet_size = 64
local cell_occupancy
local worst_case_factor
local propagation_delay
local bytes_on_cable
local bytes_on_gearbox
local xoff_value
local xon_value
local headroom_size
local speed_overhead

-- Adjustment for 400G: double the pipeline latency and add one MTU of
-- overhead (vendor-specific requirement)
if port_speed == 400000 then
    pipeline_latency = pipeline_latency * 2
    speed_overhead = mtu
else
    speed_overhead = 0
end

-- Worst-case cell utilization factor for small packets
-- (vendor-provided formula; branches on cell size vs. minimal packet size)
if cell_size > 2 * minimal_packet_size then
    worst_case_factor = cell_size / minimal_packet_size
else
    worst_case_factor = (2 * cell_size) / (1 + cell_size)
end

-- Blend the worst-case factor in proportionally to the small-packet share
cell_occupancy = (100 - small_packet_percentage + small_packet_percentage * worst_case_factor) / 100

-- Bytes in flight inside the gearbox: speed[Mb/s] * delay / (8 * 1000)
if (gearbox_delay == 0) then
    bytes_on_gearbox = 0
else
    bytes_on_gearbox = port_speed * gearbox_delay / (8 * 1000)
end

-- Bytes in flight on the cable, both directions (hence the factor 2)
bytes_on_cable = 2 * cable_length * port_speed * 1000000 / speed_of_light / 8
-- Total data that may still arrive after XOFF is signalled
propagation_delay = mtu + 2 * (bytes_on_cable + bytes_on_gearbox) + mac_phy_delay + peer_response_time

-- Calculate the xoff and xon and then round up at 1024 bytes
xoff_value = mtu + propagation_delay * cell_occupancy
xoff_value = math.ceil(xoff_value / 1024) * 1024
xon_value = pipeline_latency
xon_value = math.ceil(xon_value / 1024) * 1024

-- Total headroom = xoff + xon (+ 400G overhead), rounded up to 1 KB
headroom_size = xoff_value + xon_value + speed_overhead
headroom_size = math.ceil(headroom_size / 1024) * 1024

-- Emit "field:value" strings for the caller to apply to the profile
table.insert(ret, "xon" .. ":" .. math.ceil(xon_value))
table.insert(ret, "xoff" .. ":" .. math.ceil(xoff_value))
table.insert(ret, "size" .. ":" .. math.ceil(headroom_size))
table.insert(ret, "threshold" .. ":" .. default_threshold)

return ret
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
buffer_headroom_mellanox.lua |
Oops, something went wrong.