Commit

Support shared headroom pool on top of dynamic buffer calculation (#1581)

* Support shared headroom pool on top of dynamic buffer calculation

 - The feature is enabled/disabled on the fly by configuring the over-subscribe ratio and/or the shared headroom pool size.
   If both are configured, the shared headroom pool size takes effect.
   When the feature is turned on or off, all the lossless profiles and the buffer pool sizes are recalculated.
 - Support calculating the shared headroom pool size while the ingress lossless pool is statically configured.
- Check the accumulative headroom before toggling the SHP state.
  Disabling SHP causes the size of each PG to increase,
  so we need to check whether the accumulative headroom exceeds the limit (see the sketch after this list).
- Split the function doUpdateStaticProfileTask into two functions.
  Originally it was called for static profiles only and consisted of two parts:
  - One handles dynamic_th updates: it iterates over all the buffer profiles
    dynamically generated according to the dynamic_th and updates them.
  - The other handles size updates: it iterates over each port referencing
    the profile and checks whether the accumulative headroom exceeds the limit.
  Now that it is also called for the shared headroom pool, split it into
  two functions to make the code clearer.
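
A minimal sketch of the idea behind the accumulative-headroom check mentioned above, using hypothetical port/PG structures; the actual check lives in the dynamic buffer manager code, which is part of this commit but not shown in the two Lua files below:

-- Hypothetical sketch: with SHP disabled, every PG again reserves xon + xoff instead
-- of xon only, so the per-port accumulative headroom must still fit within the limit.
-- 'ports', 'pgs' and 'max_headroom' are illustrative names, not the real data model.
local function can_disable_shp(ports)
    for _, port in ipairs(ports) do
        local accumulated = 0
        for _, pg in ipairs(port.pgs) do
            accumulated = accumulated + pg.xon + pg.xoff
        end
        if accumulated > port.max_headroom then
            return false, port.name
        end
    end
    return true
end

-- Example (made-up numbers): one port, one lossless PG, fits within the limit
print(can_disable_shp({{name = "Ethernet0", max_headroom = 131072,
                        pgs = {{xon = 19456, xoff = 96256}}}}))  -- true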

Signed-off-by: Stephen Sun <stephens@nvidia.com>

How I verified it
Run the vs (virtual switch) test and the regression test.
stephenxs authored Feb 7, 2021
1 parent 1438a70 commit 5fa2329
Showing 5 changed files with 465 additions and 37 deletions.
22 changes: 19 additions & 3 deletions cfgmgr/buffer_headroom_mellanox.lua
@@ -16,6 +16,7 @@

local lossless_mtu
local small_packet_percentage
local over_subscribe_ratio = 0
local cell_size
local pipeline_latency
local mac_phy_delay
@@ -72,8 +73,19 @@ for i = 1, #lossless_traffic_table_content, 2 do
end
end

-- Fetch DEFAULT_LOSSLESS_BUFFER_PARAMETER from CONFIG_DB
local lossless_traffic_keys = redis.call('KEYS', 'DEFAULT_LOSSLESS_BUFFER_PARAMETER*')
-- Fetch over subscribe ratio
local default_lossless_param_keys = redis.call('KEYS', 'DEFAULT_LOSSLESS_BUFFER_PARAMETER*')
local over_subscribe_ratio = tonumber(redis.call('HGET', default_lossless_param_keys[1], 'over_subscribe_ratio'))

-- Fetch the shared headroom pool size
local shp_size = tonumber(redis.call('HGET', 'BUFFER_POOL|ingress_lossless_pool', 'xoff'))

local shp_enabled
if shp_size ~= nil and shp_size ~= 0 or over_subscribe_ratio ~= nil and over_subscribe_ratio ~= 0 then
shp_enabled = true
else
shp_enabled = false
end

-- Calculate the headroom information
local speed_of_light = 198000000
@@ -119,7 +131,11 @@ xoff_value = math.ceil(xoff_value / 1024) * 1024
xon_value = pipeline_latency
xon_value = math.ceil(xon_value / 1024) * 1024

headroom_size = xoff_value + xon_value + speed_overhead
if shp_enabled then
headroom_size = xon_value
else
headroom_size = xoff_value + xon_value + speed_overhead
end
headroom_size = math.ceil(headroom_size / 1024) * 1024

table.insert(ret, "xon" .. ":" .. math.ceil(xon_value))
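
For clarity, a standalone sketch of the per-PG headroom sizing implemented by the hunk above: with SHP enabled only xon is reserved per PG (xoff is drawn from the shared pool); otherwise the full xoff + xon + speed overhead is reserved. The numbers in the example are illustrative only.

-- Sketch of the headroom sizing above; xon_value, xoff_value and speed_overhead are
-- assumed to have been computed earlier in the script (example values are made up).
local function headroom_size(shp_enabled, xon_value, xoff_value, speed_overhead)
    local size
    if shp_enabled then
        -- xoff is served from the shared headroom pool, so only xon is reserved per PG
        size = xon_value
    else
        size = xoff_value + xon_value + speed_overhead
    end
    return math.ceil(size / 1024) * 1024   -- align to 1 KB, as the script does
end

print(headroom_size(false, 19456, 96256, 0))  -- 115712
print(headroom_size(true,  19456, 96256, 0))  -- 19456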
57 changes: 54 additions & 3 deletions cfgmgr/buffer_pool_mellanox.lua
@@ -83,6 +83,24 @@ end

local egress_lossless_pool_size = redis.call('HGET', 'BUFFER_POOL|egress_lossless_pool', 'size')

-- Whether shared headroom pool is enabled?
local default_lossless_param_keys = redis.call('KEYS', 'DEFAULT_LOSSLESS_BUFFER_PARAMETER*')
local over_subscribe_ratio = tonumber(redis.call('HGET', default_lossless_param_keys[1], 'over_subscribe_ratio'))

-- Fetch the shared headroom pool size
local shp_size = tonumber(redis.call('HGET', 'BUFFER_POOL|ingress_lossless_pool', 'xoff'))

local shp_enabled = false
if over_subscribe_ratio ~= nil and over_subscribe_ratio ~= 0 then
shp_enabled = true
end

if shp_size ~= nil and shp_size ~= 0 then
shp_enabled = true
else
shp_size = 0
end

-- Switch to APPL_DB
redis.call('SELECT', appl_db)

@@ -103,6 +121,7 @@ local statistics = {}

-- Fetch sizes of all of the profiles, accumulate them
local accumulative_occupied_buffer = 0
local accumulative_xoff = 0
for i = 1, #profiles, 1 do
if profiles[i][1] ~= "BUFFER_PROFILE_TABLE_KEY_SET" and profiles[i][1] ~= "BUFFER_PROFILE_TABLE_DEL_SET" then
local size = tonumber(redis.call('HGET', profiles[i][1], 'size'))
@@ -114,6 +133,13 @@ for i = 1, #profiles, 1 do
profiles[i][2] = count_up_port
end
if size ~= 0 then
if shp_enabled and shp_size == 0 then
local xon = tonumber(redis.call('HGET', profiles[i][1], 'xon'))
local xoff = tonumber(redis.call('HGET', profiles[i][1], 'xoff'))
if xon ~= nil and xoff ~= nil and xon + xoff > size then
accumulative_xoff = accumulative_xoff + (xon + xoff - size) * profiles[i][2]
end
end
accumulative_occupied_buffer = accumulative_occupied_buffer + size * profiles[i][2]
end
table.insert(statistics, {profiles[i][1], size, profiles[i][2]})
@@ -138,7 +164,7 @@ end
local asic_keys = redis.call('KEYS', 'ASIC_TABLE*')
local cell_size = tonumber(redis.call('HGET', asic_keys[1], 'cell_size'))

-- Align mmu_size at cell size boundary, otherwith the sdk will complain and the syncd will faill
-- Align mmu_size at cell size boundary, otherwise the sdk will complain and the syncd will fail
local number_of_cells = math.floor(mmu_size / cell_size)
local ceiling_mmu_size = number_of_cells * cell_size

@@ -149,11 +175,16 @@ redis.call('SELECT', config_db)
local pools_need_update = {}
local ipools = redis.call('KEYS', 'BUFFER_POOL|ingress*')
local ingress_pool_count = 0
local ingress_lossless_pool_size = nil
for i = 1, #ipools, 1 do
local size = tonumber(redis.call('HGET', ipools[i], 'size'))
if not size then
table.insert(pools_need_update, ipools[i])
ingress_pool_count = ingress_pool_count + 1
else
if ipools[i] == 'BUFFER_POOL|ingress_lossless_pool' and shp_enabled and shp_size == 0 then
ingress_lossless_pool_size = size
end
end
end

@@ -165,7 +196,14 @@ for i = 1, #epools, 1 do
end
end

if shp_enabled and shp_size == 0 then
shp_size = math.ceil(accumulative_xoff / over_subscribe_ratio)
end

local pool_size
if shp_size then
accumulative_occupied_buffer = accumulative_occupied_buffer + shp_size
end
if ingress_pool_count == 1 then
pool_size = mmu_size - accumulative_occupied_buffer
else
@@ -176,18 +214,31 @@ if pool_size > ceiling_mmu_size then
pool_size = ceiling_mmu_size
end

local shp_deployed = false
for i = 1, #pools_need_update, 1 do
local pool_name = string.match(pools_need_update[i], "BUFFER_POOL|([^%s]+)$")
table.insert(result, pool_name .. ":" .. math.ceil(pool_size))
if shp_size ~= 0 and pool_name == "ingress_lossless_pool" then
table.insert(result, pool_name .. ":" .. math.ceil(pool_size) .. ":" .. math.ceil(shp_size))
shp_deployed = true
else
table.insert(result, pool_name .. ":" .. math.ceil(pool_size))
end
end

if not shp_deployed and shp_size ~= 0 and ingress_lossless_pool_size ~= nil then
table.insert(result, "ingress_lossless_pool:" .. math.ceil(ingress_lossless_pool_size) .. ":" .. math.ceil(shp_size))
end

table.insert(result, "debug:mmu_size:" .. mmu_size)
table.insert(result, "debug:accumulative:" .. accumulative_occupied_buffer)
table.insert(result, "debug:accumulative size:" .. accumulative_occupied_buffer)
for i = 1, #statistics do
table.insert(result, "debug:" .. statistics[i][1] .. ":" .. statistics[i][2] .. ":" .. statistics[i][3])
end
table.insert(result, "debug:extra_400g:" .. (lossypg_reserved_400g - lossypg_reserved) .. ":" .. lossypg_400g)
table.insert(result, "debug:mgmt_pool:" .. mgmt_pool_size)
table.insert(result, "debug:egress_mirror:" .. accumulative_egress_mirror_overhead)
table.insert(result, "debug:shp_enabled:" .. tostring(shp_enabled))
table.insert(result, "debug:shp_size:" .. shp_size)
table.insert(result, "debug:accumulative xoff:" .. accumulative_xoff)

return result
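
To summarize the arithmetic added above: a statically configured shared headroom pool size (the xoff field of BUFFER_POOL|ingress_lossless_pool) takes precedence; otherwise, when only the over-subscribe ratio is configured, the size is derived from the accumulated private xoff divided by that ratio. Either way the pool is carved out of the MMU before the ingress lossless pool size is computed. A condensed sketch, with variable names following the script and made-up numbers in the example:

-- Condensed sketch of the shared-headroom-pool arithmetic in buffer_pool_mellanox.lua.
-- static_xoff:          'xoff' configured on BUFFER_POOL|ingress_lossless_pool (or nil)
-- over_subscribe_ratio: value from DEFAULT_LOSSLESS_BUFFER_PARAMETER (or nil)
-- accumulative_xoff:    accumulated (xon + xoff - size) over the lossless PGs
local function ingress_lossless_entry(mmu_size, cell_size, occupied, static_xoff,
                                      over_subscribe_ratio, accumulative_xoff)
    local shp_size = static_xoff or 0
    if shp_size == 0 and over_subscribe_ratio ~= nil and over_subscribe_ratio ~= 0 then
        -- no static size configured: derive it from the over-subscribe ratio
        shp_size = math.ceil(accumulative_xoff / over_subscribe_ratio)
    end
    local pool_size = mmu_size - (occupied + shp_size)
    -- keep the pool within the cell-aligned MMU size
    local ceiling_mmu_size = math.floor(mmu_size / cell_size) * cell_size
    if pool_size > ceiling_mmu_size then
        pool_size = ceiling_mmu_size
    end
    if shp_size ~= 0 then
        -- matches the "<pool>:<size>:<xoff>" entry format the script returns
        return "ingress_lossless_pool:" .. math.ceil(pool_size) .. ":" .. math.ceil(shp_size)
    end
    return "ingress_lossless_pool:" .. math.ceil(pool_size)
end

print(ingress_lossless_entry(13619200, 128, 4100096, nil, 2, 196608))
-- ingress_lossless_pool:9420800:98304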