Remove PGs from an administratively down port. (sonic-net#1677)
- What I did
Bug fixes: Remove PGs from an administratively down port.

- Why I did it
To fix bugs in the handling of buffer PGs on administratively down ports.

- How I verified it
Ran regression and vs tests.

- Details if related
Remove PGs from an administratively down port.
Introduce a new state, PORT_ADMIN_DOWN, which represents that the port is administratively down.
Remove all PGs when the port is shut down and re-add all configured PGs when the port is started up.
Only record the new value, without touching BUFFER_PG_TABLE, if any of the following events arrives while a port is administratively down: the port's MTU, speed, or cable length is updated; a new PG is added to the port; an existing PG is removed from the port (see the sketch below).
Optimize the port event handling flow, since refreshPriorityGroupsForPort should be called only once even when more than one field is updated.
Adjust the Lua plugin that calculates the buffer pool size accordingly.
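
A tiny illustrative sketch of the apply-vs-record behavior described above. The real logic lives in the C++ buffer manager (buffermgrd), not in Lua; every name in the sketch (pg_configured, pg_applied, set_pg, set_admin_status) is invented for illustration only.

local pg_configured = {}   -- what the configuration asks for, keyed by "port|pg"
local pg_applied = {}      -- what has actually been written to BUFFER_PG_TABLE

local function set_pg(port, pg, port_admin_up)
    pg_configured[port .. "|" .. pg] = true
    if port_admin_up then
        pg_applied[port .. "|" .. pg] = true   -- port is up: program BUFFER_PG_TABLE
    end
    -- port is admin down: only record the new value, leave BUFFER_PG_TABLE alone
end

local function set_admin_status(port, port_admin_up)
    for key in pairs(pg_configured) do
        if key:find(port .. "|", 1, true) == 1 then
            -- startup: re-add all configured PGs; shutdown: remove them all
            pg_applied[key] = port_admin_up or nil
        end
    end
end

-- Example: a PG configured while Ethernet0 is down is applied only at startup
set_pg("Ethernet0", "3-4", false)
print(pg_applied["Ethernet0|3-4"])   -- nil: recorded, not applied
set_admin_status("Ethernet0", true)
print(pg_applied["Ethernet0|3-4"])   -- true: re-added once the port came up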

Signed-off-by: Stephen Sun <stephens@nvidia.com>
stephenxs authored Mar 29, 2021
1 parent c7a9c05 commit 8eb2326
Showing 4 changed files with 328 additions and 116 deletions.
73 changes: 34 additions & 39 deletions cfgmgr/buffer_pool_mellanox.lua
@@ -12,7 +12,7 @@ local lossypg_400g = 0
local result = {}
local profiles = {}

local count_up_port = 0
local total_port = 0

local mgmt_pool_size = 256 * 1024
local egress_mirror_headroom = 10 * 1024
@@ -30,56 +30,46 @@ end

local function iterate_all_items(all_items)
table.sort(all_items)
local prev_port = "None"
local port
local is_up
local fvpairs
local status
local admin_down_ports = 0
for i = 1, #all_items, 1 do
-- Check whether the port on which pg or tc hosts is admin down
-- Count the number of priorities or queues in each BUFFER_PG or BUFFER_QUEUE item
-- For example, there are:
-- 3 queues in 'BUFFER_QUEUE_TABLE:Ethernet0:0-2'
-- 2 priorities in 'BUFFER_PG_TABLE:Ethernet0:3-4'
port = string.match(all_items[i], "Ethernet%d+")
if port ~= nil then
if prev_port ~= port then
status = redis.call('HGET', 'PORT_TABLE:'..port, 'admin_status')
prev_port = port
if status == "down" then
is_up = false
else
is_up = true
end
local range = string.match(all_items[i], "Ethernet%d+:([^%s]+)$")
local profile = redis.call('HGET', all_items[i], 'profile')
local index = find_profile(profile)
if index == 0 then
-- Indicate an error in case the referenced profile hasn't been inserted or has been removed
-- It's possible when the orchagent is busy
-- The buffermgrd will take care of it and retry later
return 1
end
if is_up == true then
local range = string.match(all_items[i], "Ethernet%d+:([^%s]+)$")
local profile = redis.call('HGET', all_items[i], 'profile')
local index = find_profile(profile)
local size
if string.len(range) == 1 then
size = 1
else
size = 1 + tonumber(string.sub(range, -1)) - tonumber(string.sub(range, 1, 1))
end
profiles[index][2] = profiles[index][2] + size
local speed = redis.call('HGET', 'PORT_TABLE:'..port, 'speed')
if speed == '400000' and profile == '[BUFFER_PROFILE_TABLE:ingress_lossy_profile]' then
lossypg_400g = lossypg_400g + size
end
local size
if string.len(range) == 1 then
size = 1
else
size = 1 + tonumber(string.sub(range, -1)) - tonumber(string.sub(range, 1, 1))
end
profiles[index][2] = profiles[index][2] + size
local speed = redis.call('HGET', 'PORT_TABLE:'..port, 'speed')
if speed == '400000' and profile == '[BUFFER_PROFILE_TABLE:ingress_lossy_profile]' then
lossypg_400g = lossypg_400g + size
end
end
end
return 0
end

-- Connect to CONFIG_DB
redis.call('SELECT', config_db)

local ports_table = redis.call('KEYS', 'PORT|*')

for i = 1, #ports_table do
local status = redis.call('HGET', ports_table[i], 'admin_status')
if status == "up" then
count_up_port = count_up_port + 1
end
end
total_port = #ports_table

local egress_lossless_pool_size = redis.call('HGET', 'BUFFER_POOL|egress_lossless_pool', 'size')

@@ -114,8 +104,12 @@ end
local all_pgs = redis.call('KEYS', 'BUFFER_PG*')
local all_tcs = redis.call('KEYS', 'BUFFER_QUEUE*')

iterate_all_items(all_pgs)
iterate_all_items(all_tcs)
local fail_count = 0
fail_count = fail_count + iterate_all_items(all_pgs)
fail_count = fail_count + iterate_all_items(all_tcs)
if fail_count > 0 then
return {}
end

local statistics = {}

@@ -130,7 +124,7 @@ for i = 1, #profiles, 1 do
size = size + lossypg_reserved
end
if profiles[i][1] == "BUFFER_PROFILE_TABLE:egress_lossy_profile" then
profiles[i][2] = count_up_port
profiles[i][2] = total_port
end
if size ~= 0 then
if shp_enabled and shp_size == 0 then
@@ -152,7 +146,7 @@ local lossypg_extra_for_400g = (lossypg_reserved_400g - lossypg_reserved) * loss
accumulative_occupied_buffer = accumulative_occupied_buffer + lossypg_extra_for_400g

-- Accumulate sizes for egress mirror and management pool
local accumulative_egress_mirror_overhead = count_up_port * egress_mirror_headroom
local accumulative_egress_mirror_overhead = total_port * egress_mirror_headroom
accumulative_occupied_buffer = accumulative_occupied_buffer + accumulative_egress_mirror_overhead + mgmt_pool_size

-- Fetch mmu_size
@@ -240,5 +234,6 @@ table.insert(result, "debug:egress_mirror:" .. accumulative_egress_mirror_overhe
table.insert(result, "debug:shp_enabled:" .. tostring(shp_enabled))
table.insert(result, "debug:shp_size:" .. shp_size)
table.insert(result, "debug:accumulative xoff:" .. accumulative_xoff)
table.insert(result, "debug:total port:" .. total_port)

return result
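
Since the plugin now reserves egress-mirror headroom for every configured port (not only admin-up ones), the per-port overhead accounting is easy to reproduce on its own. A minimal standalone sketch using the constants visible in the diff above; the port count is purely illustrative.

-- Standalone sketch of the per-port overhead accounting in buffer_pool_mellanox.lua.
-- Constants are taken from the diff above; the port count is hypothetical.
local egress_mirror_headroom = 10 * 1024   -- headroom reserved per port for egress mirroring
local mgmt_pool_size = 256 * 1024          -- management pool carve-out
local total_port = 32                      -- hypothetical 32-port switch

local accumulative_egress_mirror_overhead = total_port * egress_mirror_headroom
local overhead = accumulative_egress_mirror_overhead + mgmt_pool_size
print(overhead)   -- 589824 bytes added to the occupied-buffer total before pool sizes are derived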