[Dynamic Buffer Calc][Mellanox] Bug fixes and enhancements for the lua plugins for buffer pool calculation and headroom checking (#1781)

What I did
Bug fixes for buffer pool calculation and headroom checking on Mellanox platforms.

- Test the number of lanes instead of the speed when determining whether special handling is required for a port. For speeds other than 400G, e.g. 100G, it is possible that some 100G ports have 8 lanes while others have 4 lanes, which means they cannot share the same buffer profile. A suffix _8lane is introduced to distinguish such profiles, e.g. pg_lossless_100000_5m_8lane_profile.
- Take the private headroom into account when calculating the buffer pool size.
- Take deviation into account when checking the headroom against the per-port limit, to avoid an inaccurate result in a rare case.
- Use a hash table to record the reference count of a profile in the lua plugin (see the sketch after this list).
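
A minimal sketch of that reference-counting change (the helper names register_profile and add_references are illustrative only, not functions from the plugin): profiles are now kept in a table keyed by profile name, so updating a count is a direct lookup instead of a scan over a list of {name, count} pairs.

    local profiles = {}

    -- Register a known profile with a zero reference count.
    local function register_profile(name)
        profiles[name] = 0
    end

    -- Add 'count' references to a profile; return false when the profile is
    -- unknown, which the plugin treats as a retryable error.
    local function add_references(name, count)
        if profiles[name] == nil then
            return false
        end
        profiles[name] = profiles[name] + count
        return true
    end

    register_profile("BUFFER_PROFILE_TABLE:pg_lossless_100000_5m_profile")
    add_references("BUFFER_PROFILE_TABLE:pg_lossless_100000_5m_profile", 2)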

Signed-off-by: Stephen Sun stephens@nvidia.com

How I verified it
Ran the regression tests and performed manual testing.

Details if related

Test the number of lanes instead of the speed when determining whether special handling (double headroom size) is required for a port.
Originally, this was determined by testing whether the port's speed is 400G, but that is not accurate: a user can configure a port with 8 lanes to 100G, and such a port still requires the special handling even though its speed is not 400G. The check is therefore based on the number of lanes instead (a small standalone sketch of the lane-count check follows below).
The variable names are also updated accordingly: xxx_400g => xxx_8lanes
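
The lane count is derived from the comma-separated lanes field of the PORT table. The snippet below mirrors the is_port_with_8lanes helper added in buffer_check_headroom_mellanox.lua, with two illustrative calls appended:

    -- string.gsub returns the number of replacements, i.e. the number of
    -- commas; the lane count is that plus one.
    local function is_port_with_8lanes(lanes)
        local number_of_lanes = 0
        if lanes then
            local _
            _, number_of_lanes = string.gsub(lanes, ",", ",")
            number_of_lanes = number_of_lanes + 1
        end
        return number_of_lanes == 8
    end

    print(is_port_with_8lanes("0,1,2,3,4,5,6,7"))  -- true: 8 lanes
    print(is_port_with_8lanes("0,1,2,3"))          -- false: 4 lanes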
Take deviation into account when checking the headroom against the per-port limit, to avoid an inaccurate result in a rare case.
Some deviations can make the accumulative headroom slightly larger than the quantity calculated by the buffer manager, so this is now taken into account when calculating the accumulative headroom, as illustrated below.
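
A rough sketch of the adjusted check (the 4096-byte slack and the pipeline-latency term follow buffer_check_headroom_mellanox.lua below; the 19 KB pipeline latency, the PG sizes, and the per-port limit are made-up example values):

    -- Start from a 4096-byte slack that absorbs small deviations, add the
    -- pipeline latency term and every PG headroom on the port, then compare
    -- against the per-port maximum.
    local function headroom_fits(pipeline_latency_kb, pg_sizes, max_headroom_size)
        local accumulative_size = 4096
        accumulative_size = accumulative_size + 2 * pipeline_latency_kb * 1024
        for _, pg_size in ipairs(pg_sizes) do
            accumulative_size = accumulative_size + pg_size
        end
        return max_headroom_size > accumulative_size
    end

    -- Two lossless PGs of 49152 bytes each against a 500 KB per-port limit.
    print(headroom_fits(19, {49152, 49152}, 500 * 1024))  -- true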
stephenxs committed Jun 28, 2021
1 parent e86b900 commit 6c88e47
Showing 6 changed files with 190 additions and 82 deletions.
35 changes: 29 additions & 6 deletions cfgmgr/buffer_check_headroom_mellanox.lua
@@ -7,10 +7,25 @@ local port = KEYS[1]
local input_profile_name = ARGV[1]
local input_profile_size = ARGV[2]
local new_pg = ARGV[3]
local accumulative_size = 0

local function is_port_with_8lanes(lanes)
-- On Spectrum 3, ports with 8 lanes have doubled pipeline latency
local number_of_lanes = 0
if lanes then
local _
_, number_of_lanes = string.gsub(lanes, ",", ",")
number_of_lanes = number_of_lanes + 1
end
return number_of_lanes == 8
end

-- Initialize the accumulative size with 4096
-- This is to absorb the possible deviation
local accumulative_size = 4096

local appl_db = "0"
local state_db = "6"
local config_db = "4"

local ret_true = {}
local ret = {}
@@ -20,7 +35,13 @@ table.insert(ret_true, "result:true")

default_ret = ret_true

local speed = redis.call('HGET', 'PORT|' .. port, 'speed')
-- Connect to CONFIG_DB
redis.call('SELECT', config_db)

local lanes

-- We need to know whether it's an 8-lane port because it has extra pipeline latency
lanes = redis.call('HGET', 'PORT|' .. port, 'lanes')

-- Fetch the threshold from STATE_DB
redis.call('SELECT', state_db)
@@ -31,11 +52,12 @@ if max_headroom_size == nil then
end

local asic_keys = redis.call('KEYS', 'ASIC_TABLE*')
local pipeline_delay = tonumber(redis.call('HGET', asic_keys[1], 'pipeline_latency'))
if speed == 400000 then
pipeline_delay = pipeline_delay * 2 - 1
local pipeline_latency = tonumber(redis.call('HGET', asic_keys[1], 'pipeline_latency'))
if is_port_with_8lanes(lanes) then
-- The pipeline latency should be adjusted accordingly for ports with 2 buffer units
pipeline_latency = pipeline_latency * 2 - 1
end
accumulative_size = accumulative_size + 2 * pipeline_delay * 1024
accumulative_size = accumulative_size + 2 * pipeline_latency * 1024

-- Fetch all keys in BUFFER_PG according to the port
redis.call('SELECT', appl_db)
@@ -95,6 +117,7 @@ end

if max_headroom_size > accumulative_size then
table.insert(ret, "result:true")
table.insert(ret, "debug:Accumulative headroom on port " .. accumulative_size .. ", the maximum available headroom " .. max_headroom_size)
else
table.insert(ret, "result:false")
table.insert(ret, "debug:Accumulative headroom on port " .. accumulative_size .. " exceeds the maximum available headroom which is " .. max_headroom_size)
8 changes: 5 additions & 3 deletions cfgmgr/buffer_headroom_mellanox.lua
@@ -3,6 +3,7 @@
-- ARGV[2] - cable length
-- ARGV[3] - port mtu
-- ARGV[4] - gearbox delay
-- ARGV[5] - lane count of the ports on which the profile will be applied

-- parameters retrieved from databases:
-- From CONFIG_DB.LOSSLESS_TRAFFIC_PATTERN
@@ -26,6 +27,7 @@ local port_speed = tonumber(ARGV[1])
local cable_length = tonumber(string.sub(ARGV[2], 1, -2))
local port_mtu = tonumber(ARGV[3])
local gearbox_delay = tonumber(ARGV[4])
local is_8lane = (ARGV[5] == "8")

local appl_db = "0"
local config_db = "4"
@@ -100,9 +102,9 @@ local xon_value
local headroom_size
local speed_overhead

-- Adjustment for 400G
if port_speed == 400000 then
pipeline_latency = 37 * 1024
-- Adjustment for 8-lane port
if is_8lane ~= nil and is_8lane then
pipeline_latency = pipeline_latency * 2 - 1024
speed_overhead = port_mtu
else
speed_overhead = 0
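
One way to see the effect of this change: assuming pipeline_latency here holds the typical 19 * 1024 bytes (consistent with the old lossypg_reserved = 19 * 1024 constant in buffer_pool_mellanox.lua), the new expression reproduces the value the removed 400G-specific branch hard-coded:

    pipeline_latency * 2 - 1024 = 19456 * 2 - 1024 = 37888 = 37 * 1024

so an 8-lane port ends up with the same 37 KB pipeline latency as before, while other ASIC_TABLE values now scale instead of being overridden.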
169 changes: 109 additions & 60 deletions cfgmgr/buffer_pool_mellanox.lua
@@ -5,34 +5,31 @@ local appl_db = "0"
local config_db = "4"
local state_db = "6"

local lossypg_reserved = 19 * 1024
local lossypg_reserved_400g = 37 * 1024
-- Number of 400G ports
local port_count_400g = 0
-- Number of lossy PG on 400G ports
local lossypg_400g = 0
-- Number of ports with 8 lanes (whose pipeline latency should be doubled)
local port_count_8lanes = 0
-- Number of lossy PG on ports with 8 lanes
local lossypg_8lanes = 0

-- Private headroom
local private_headroom = 10 * 1024

local result = {}
local profiles = {}
local lossless_profiles = {}

local total_port = 0

local mgmt_pool_size = 256 * 1024
local egress_mirror_headroom = 10 * 1024

local function find_profile(ref)
-- Remove the surrounding square brackets and then find it in the list
local name = string.sub(ref, 2, -2)
for i = 1, #profiles, 1 do
if profiles[i][1] == name then
return i
end
end
return 0
end
-- The set of ports with 8 lanes
local port_set_8lanes = {}
-- Number of ports with lossless profiles
local lossless_port_count = 0

local function iterate_all_items(all_items)
local function iterate_all_items(all_items, check_lossless)
table.sort(all_items)
local lossless_ports = {}
local port
local fvpairs
for i = 1, #all_items, 1 do
@@ -43,9 +40,13 @@ local function iterate_all_items(all_items)
port = string.match(all_items[i], "Ethernet%d+")
if port ~= nil then
local range = string.match(all_items[i], "Ethernet%d+:([^%s]+)$")
local profile = redis.call('HGET', all_items[i], 'profile')
local index = find_profile(profile)
if index == 0 then
local profile_name = redis.call('HGET', all_items[i], 'profile')
if not profile_name then
return 1
end
profile_name = string.sub(profile_name, 2, -2)
local profile_ref_count = profiles[profile_name]
if profile_ref_count == nil then
-- Indicate an error in case the referenced profile hasn't been inserted or has been removed
-- It's possible when the orchagent is busy
-- The buffermgrd will take care of it and retry later
@@ -57,13 +58,15 @@
else
size = 1 + tonumber(string.sub(range, -1)) - tonumber(string.sub(range, 1, 1))
end
profiles[index][2] = profiles[index][2] + size
local speed = redis.call('HGET', 'PORT_TABLE:'..port, 'speed')
if speed == '400000' then
if profile == '[BUFFER_PROFILE_TABLE:ingress_lossy_profile]' then
lossypg_400g = lossypg_400g + size
profiles[profile_name] = profile_ref_count + size
if port_set_8lanes[port] and profile_name == 'BUFFER_PROFILE_TABLE:ingress_lossy_profile' then
lossypg_8lanes = lossypg_8lanes + size
end
if check_lossless and lossless_profiles[profile_name] then
if lossless_ports[port] == nil then
lossless_port_count = lossless_port_count + 1
lossless_ports[port] = true
end
port_count_400g = port_count_400g + 1
end
end
end
Expand All @@ -77,6 +80,27 @@ local ports_table = redis.call('KEYS', 'PORT|*')

total_port = #ports_table

-- Initialize the port_set_8lanes set
local lanes
local number_of_lanes
local port
for i = 1, total_port, 1 do
-- Load lanes from PORT table
lanes = redis.call('HGET', ports_table[i], 'lanes')
if lanes then
local _
_, number_of_lanes = string.gsub(lanes, ",", ",")
number_of_lanes = number_of_lanes + 1
port = string.sub(ports_table[i], 6, -1)
if (number_of_lanes == 8) then
port_set_8lanes[port] = true
port_count_8lanes = port_count_8lanes + 1
else
port_set_8lanes[port] = false
end
end
end

local egress_lossless_pool_size = redis.call('HGET', 'BUFFER_POOL|egress_lossless_pool', 'size')

-- Whether shared headroom pool is enabled?
@@ -97,22 +121,45 @@ else
shp_size = 0
end

-- Fetch mmu_size
redis.call('SELECT', state_db)
local mmu_size = tonumber(redis.call('HGET', 'BUFFER_MAX_PARAM_TABLE|global', 'mmu_size'))
if mmu_size == nil then
mmu_size = tonumber(egress_lossless_pool_size)
end
local asic_keys = redis.call('KEYS', 'ASIC_TABLE*')
local cell_size = tonumber(redis.call('HGET', asic_keys[1], 'cell_size'))
local pipeline_latency = tonumber(redis.call('HGET', asic_keys[1], 'pipeline_latency'))

local lossypg_reserved = pipeline_latency * 1024
local lossypg_reserved_8lanes = (2 * pipeline_latency - 1) * 1024

-- Align mmu_size at cell size boundary, otherwise the sdk will complain and the syncd will fail
local number_of_cells = math.floor(mmu_size / cell_size)
local ceiling_mmu_size = number_of_cells * cell_size

-- Switch to APPL_DB
redis.call('SELECT', appl_db)

-- Fetch names of all profiles and insert them into the look up table
local all_profiles = redis.call('KEYS', 'BUFFER_PROFILE*')
for i = 1, #all_profiles, 1 do
table.insert(profiles, {all_profiles[i], 0})
if all_profiles[i] ~= "BUFFER_PROFILE_TABLE_KEY_SET" and all_profiles[i] ~= "BUFFER_PROFILE_TABLE_DEL_SET" then
local xoff = redis.call('HGET', all_profiles[i], 'xoff')
if xoff then
lossless_profiles[all_profiles[i]] = true
end
profiles[all_profiles[i]] = 0
end
end

-- Fetch all the PGs
local all_pgs = redis.call('KEYS', 'BUFFER_PG*')
local all_tcs = redis.call('KEYS', 'BUFFER_QUEUE*')

local fail_count = 0
fail_count = fail_count + iterate_all_items(all_pgs)
fail_count = fail_count + iterate_all_items(all_tcs)
fail_count = fail_count + iterate_all_items(all_pgs, true)
fail_count = fail_count + iterate_all_items(all_tcs, false)
if fail_count > 0 then
return {}
end
@@ -122,56 +169,55 @@ local statistics = {}
-- Fetch sizes of all of the profiles, accumulate them
local accumulative_occupied_buffer = 0
local accumulative_xoff = 0
for i = 1, #profiles, 1 do
if profiles[i][1] ~= "BUFFER_PROFILE_TABLE_KEY_SET" and profiles[i][1] ~= "BUFFER_PROFILE_TABLE_DEL_SET" then
local size = tonumber(redis.call('HGET', profiles[i][1], 'size'))

for name in pairs(profiles) do
if name ~= "BUFFER_PROFILE_TABLE_KEY_SET" and name ~= "BUFFER_PROFILE_TABLE_DEL_SET" then
local size = tonumber(redis.call('HGET', name, 'size'))
if size ~= nil then
if profiles[i][1] == "BUFFER_PROFILE_TABLE:ingress_lossy_profile" then
if name == "BUFFER_PROFILE_TABLE:ingress_lossy_profile" then
size = size + lossypg_reserved
end
if profiles[i][1] == "BUFFER_PROFILE_TABLE:egress_lossy_profile" then
profiles[i][2] = total_port
if name == "BUFFER_PROFILE_TABLE:egress_lossy_profile" then
profiles[name] = total_port
end
if size ~= 0 then
if shp_enabled and shp_size == 0 then
local xon = tonumber(redis.call('HGET', profiles[i][1], 'xon'))
local xoff = tonumber(redis.call('HGET', profiles[i][1], 'xoff'))
local xon = tonumber(redis.call('HGET', name, 'xon'))
local xoff = tonumber(redis.call('HGET', name, 'xoff'))
if xon ~= nil and xoff ~= nil and xon + xoff > size then
accumulative_xoff = accumulative_xoff + (xon + xoff - size) * profiles[i][2]
accumulative_xoff = accumulative_xoff + (xon + xoff - size) * profiles[name]
end
end
accumulative_occupied_buffer = accumulative_occupied_buffer + size * profiles[i][2]
accumulative_occupied_buffer = accumulative_occupied_buffer + size * profiles[name]
end
table.insert(statistics, {profiles[i][1], size, profiles[i][2]})
table.insert(statistics, {name, size, profiles[name]})
end
end
end

-- Extra lossy xon buffer for 400G port
local lossypg_extra_for_400g = (lossypg_reserved_400g - lossypg_reserved) * lossypg_400g
accumulative_occupied_buffer = accumulative_occupied_buffer + lossypg_extra_for_400g
-- Extra lossy xon buffer for ports with 8 lanes
local lossypg_extra_for_8lanes = (lossypg_reserved_8lanes - lossypg_reserved) * lossypg_8lanes
accumulative_occupied_buffer = accumulative_occupied_buffer + lossypg_extra_for_8lanes

-- Accumulate sizes for private headrooms
local accumulative_private_headroom = 0
if shp_enabled then
accumulative_private_headroom = lossless_port_count * private_headroom
accumulative_occupied_buffer = accumulative_occupied_buffer + accumulative_private_headroom
accumulative_xoff = accumulative_xoff - accumulative_private_headroom
if accumulative_xoff < 0 then
accumulative_xoff = 0
end
end

-- Accumulate sizes for management PGs
local accumulative_management_pg = (total_port - port_count_400g) * lossypg_reserved + port_count_400g * lossypg_reserved_400g
local accumulative_management_pg = (total_port - port_count_8lanes) * lossypg_reserved + port_count_8lanes * lossypg_reserved_8lanes
accumulative_occupied_buffer = accumulative_occupied_buffer + accumulative_management_pg

-- Accumulate sizes for egress mirror and management pool
local accumulative_egress_mirror_overhead = total_port * egress_mirror_headroom
accumulative_occupied_buffer = accumulative_occupied_buffer + accumulative_egress_mirror_overhead + mgmt_pool_size

-- Fetch mmu_size
redis.call('SELECT', state_db)
local mmu_size = tonumber(redis.call('HGET', 'BUFFER_MAX_PARAM_TABLE|global', 'mmu_size'))
if mmu_size == nil then
mmu_size = tonumber(egress_lossless_pool_size)
end
local asic_keys = redis.call('KEYS', 'ASIC_TABLE*')
local cell_size = tonumber(redis.call('HGET', asic_keys[1], 'cell_size'))

-- Align mmu_size at cell size boundary, otherwise the sdk will complain and the syncd will fail
local number_of_cells = math.floor(mmu_size / cell_size)
local ceiling_mmu_size = number_of_cells * cell_size

-- Switch to CONFIG_DB
redis.call('SELECT', config_db)

@@ -238,13 +284,16 @@ table.insert(result, "debug:accumulative size:" .. accumulative_occupied_buffer)
for i = 1, #statistics do
table.insert(result, "debug:" .. statistics[i][1] .. ":" .. statistics[i][2] .. ":" .. statistics[i][3])
end
table.insert(result, "debug:extra_400g:" .. (lossypg_reserved_400g - lossypg_reserved) .. ":" .. lossypg_400g .. ":" .. port_count_400g)
table.insert(result, "debug:extra_8lanes:" .. (lossypg_reserved_8lanes - lossypg_reserved) .. ":" .. lossypg_8lanes .. ":" .. port_count_8lanes)
table.insert(result, "debug:mgmt_pool:" .. mgmt_pool_size)
if shp_enabled then
table.insert(result, "debug:accumulative_private_headroom:" .. accumulative_private_headroom)
table.insert(result, "debug:accumulative xoff:" .. accumulative_xoff)
end
table.insert(result, "debug:accumulative_mgmt_pg:" .. accumulative_management_pg)
table.insert(result, "debug:egress_mirror:" .. accumulative_egress_mirror_overhead)
table.insert(result, "debug:shp_enabled:" .. tostring(shp_enabled))
table.insert(result, "debug:shp_size:" .. shp_size)
table.insert(result, "debug:accumulative xoff:" .. accumulative_xoff)
table.insert(result, "debug:total port:" .. total_port)
table.insert(result, "debug:total port:" .. total_port .. " ports with 8 lanes:" .. port_count_8lanes)

return result