Skip to content

Commit

Permalink
net/mlx5e: Make tx_port_ts logic resilient to out-of-order CQEs
Browse files Browse the repository at this point in the history
Use a map structure for associating CQEs containing port timestamping
information with the appropriate skb. Track order of WQEs submitted using a
FIFO. Check if the corresponding port timestamping CQEs from the lookup
values in the FIFO are considered dropped due to time elapsed. Return the
lookup value to a freelist after consuming the skb. Reuse the freed lookup
in future WQE submission iterations.

The map structure uses an integer identifier for the key and returns an skb
corresponding to that identifier. Embed the integer identifier in the WQE
submitted to the WQ for the transmit path when the SQ is a PTP (port
timestamping) SQ. The embedded identifier can then be queried using a field
in the CQE of the corresponding port timestamping CQ. In the port
timestamping napi_poll context, the identifier is queried from the CQE
polled from CQ and used to lookup the corresponding skb from the WQE submit
path. The skb reference is removed from map and then embedded with the port
HW timestamp information from the CQE and eventually consumed.

The metadata freelist FIFO is an array containing integer identifiers that
can be pushed and popped in the FIFO. The purpose of this structure is to
keep track of which identifier values can safely be used in a subsequent WQE
submission. It must not contain identifiers that have not yet been reaped by
processing a corresponding CQE completion on the port timestamping CQ.

The ts_cqe_pending_list structure is a combination of an array and a linked
list. The array is pre-populated with the nodes that will be added to the
tail of, and later removed from, the linked list. Each node contains the unique
identifier value associated with the values submitted in the WQEs and
retrieved in the port timestamping CQEs. When a WQE is submitted, the node
in the array corresponding to the identifier popped from the metadata
freelist is added to the end of the CQE pending list and is marked as
"in-use". The node is removed from the linked list under two conditions.
The first condition is that the corresponding port timestamping CQE is
polled in the PTP napi_poll context. The second condition is that more than
a second has elapsed since the DMA timestamp value corresponding to the WQE
submission. When the first condition occurs, the "in-use" bit in the linked
list node is cleared, and the resources corresponding to the WQE submission
are then released. The second condition, however, indicates that the port
timestamping CQE will likely never be delivered. It is not impossible,
though highly improbable, for the device to post the CQE after an arbitrarily
long delay. In order to be resilient to this improbable case, resources
related to the corresponding WQE submission are still kept, the identifier
value is not returned to the freelist, and the "in-use" bit is cleared on
the node to indicate that it's no longer part of the linked list of "likely
to be delivered" port timestamping CQE identifiers. A count for the number
of port timestamping CQEs considered highly likely to never be delivered by
the device is maintained. This count gets decremented in the unlikely event
a port timestamping CQE considered unlikely to ever be delivered is polled
in the PTP napi_poll context.

Signed-off-by: Rahul Rameshbabu <rrameshbabu@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
  • Loading branch information
Binary-Eater authored and Saeed Mahameed committed Aug 14, 2023
1 parent b608dd6 commit 3178308
Show file tree
Hide file tree
Showing 7 changed files with 236 additions and 81 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -683,6 +683,12 @@ the software port.
time protocol.
- Error

* - `ptp_cq[i]_late_cqe`
- Number of times a CQE has been delivered on the PTP timestamping CQ after
it was no longer expected, i.e. after the amount of time beyond which the
device typically no longer posts the CQE had already elapsed.
- Error

.. [#ring_global] The corresponding ring and global counters do not share the
same name (i.e. do not follow the common naming scheme).
Expand Down
215 changes: 158 additions & 57 deletions drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
#include "en/txrx.h"
#include "en/params.h"
#include "en/fs_tt_redirect.h"
#include <linux/list.h>
#include <linux/spinlock.h>

struct mlx5e_ptp_fs {
struct mlx5_flow_handle *l2_rule;
Expand All @@ -19,6 +21,48 @@ struct mlx5e_ptp_params {
struct mlx5e_rq_param rq_param;
};

/* Per-metadata-id node used to track a pending port timestamping CQE. */
struct mlx5e_ptp_port_ts_cqe_tracker {
/* Index of this node within the owning list's nodes[] array */
u8 metadata_id;
/* Set when the node is linked onto the pending list, cleared when unlinked */
bool inuse : 1;
/* Linkage into mlx5e_ptp_port_ts_cqe_list::tracker_list_head */
struct list_head entry;
};

/* Pending port timestamping CQE list: a node array plus the list linking nodes from it. */
struct mlx5e_ptp_port_ts_cqe_list {
/* Array of tracker nodes, indexed by metadata id */
struct mlx5e_ptp_port_ts_cqe_tracker *nodes;
/* Head of the list of nodes currently marked in-use */
struct list_head tracker_list_head;
/* Sync list operations in xmit and napi_poll contexts */
spinlock_t tracker_list_lock;
};

/*
 * Mark the tracker node for @metadata as in-use and append it to the tail of
 * the pending CQE list.
 *
 * @list:     pending CQE list that owns the tracker nodes
 * @metadata: metadata id, used as the index into @list->nodes
 *
 * Warns once if the node is already marked in-use.
 */
static inline void
mlx5e_ptp_port_ts_cqe_list_add(struct mlx5e_ptp_port_ts_cqe_list *list, u8 metadata)
{
	struct mlx5e_ptp_port_ts_cqe_tracker *tracker = &list->nodes[metadata];

	WARN_ON_ONCE(tracker->inuse);
	tracker->inuse = true;
	/* The list is shared with the napi_poll context, which runs as a
	 * softirq. Use the _bh lock variant so a softirq cannot preempt the
	 * lock holder on this CPU and spin on the same lock.
	 */
	spin_lock_bh(&list->tracker_list_lock);
	list_add_tail(&tracker->entry, &list->tracker_list_head);
	spin_unlock_bh(&list->tracker_list_lock);
}

/*
 * Clear the in-use mark on the tracker node for @metadata and unlink it from
 * the pending CQE list.
 *
 * @list:     pending CQE list that owns the tracker nodes
 * @metadata: metadata id, used as the index into @list->nodes
 *
 * Warns once if the node is not marked in-use.
 */
static void
mlx5e_ptp_port_ts_cqe_list_remove(struct mlx5e_ptp_port_ts_cqe_list *list, u8 metadata)
{
	struct mlx5e_ptp_port_ts_cqe_tracker *tracker = &list->nodes[metadata];

	WARN_ON_ONCE(!tracker->inuse);
	tracker->inuse = false;
	/* The list is shared between the xmit and napi_poll contexts. Use the
	 * _bh lock variant so the lock is never held with bottom halves
	 * enabled, whichever context this helper is called from.
	 */
	spin_lock_bh(&list->tracker_list_lock);
	list_del(&tracker->entry);
	spin_unlock_bh(&list->tracker_list_lock);
}

/*
 * Start tracking @metadata on @ptpsq by adding its tracker node to the SQ's
 * pending port timestamping CQE list.
 */
void mlx5e_ptpsq_track_metadata(struct mlx5e_ptpsq *ptpsq, u8 metadata)
{
	struct mlx5e_ptp_port_ts_cqe_list *pending = ptpsq->ts_cqe_pending_list;

	mlx5e_ptp_port_ts_cqe_list_add(pending, metadata);
}

struct mlx5e_skb_cb_hwtstamp {
ktime_t cqe_hwtstamp;
ktime_t port_hwtstamp;
Expand Down Expand Up @@ -79,75 +123,88 @@ void mlx5e_skb_cb_hwtstamp_handler(struct sk_buff *skb, int hwtstamp_type,
memset(skb->cb, 0, sizeof(struct mlx5e_skb_cb_hwtstamp));
}

#define PTP_WQE_CTR2IDX(val) ((val) & ptpsq->ts_cqe_ctr_mask)

static bool mlx5e_ptp_ts_cqe_drop(struct mlx5e_ptpsq *ptpsq, u16 skb_ci, u16 skb_id)
static struct sk_buff *
mlx5e_ptp_metadata_map_lookup(struct mlx5e_ptp_metadata_map *map, u16 metadata)
{
return (ptpsq->ts_cqe_ctr_mask && (skb_ci != skb_id));
return map->data[metadata];
}

static bool mlx5e_ptp_ts_cqe_ooo(struct mlx5e_ptpsq *ptpsq, u16 skb_id)
static struct sk_buff *
mlx5e_ptp_metadata_map_remove(struct mlx5e_ptp_metadata_map *map, u16 metadata)
{
u16 skb_ci = PTP_WQE_CTR2IDX(ptpsq->skb_fifo_cc);
u16 skb_pi = PTP_WQE_CTR2IDX(ptpsq->skb_fifo_pc);
struct sk_buff *skb;

if (PTP_WQE_CTR2IDX(skb_id - skb_ci) >= PTP_WQE_CTR2IDX(skb_pi - skb_ci))
return true;
skb = map->data[metadata];
map->data[metadata] = NULL;

return false;
return skb;
}

static void mlx5e_ptp_skb_fifo_ts_cqe_resync(struct mlx5e_ptpsq *ptpsq, u16 skb_ci,
u16 skb_id, int budget)
static void mlx5e_ptpsq_mark_ts_cqes_undelivered(struct mlx5e_ptpsq *ptpsq,
ktime_t port_tstamp)
{
struct skb_shared_hwtstamps hwts = {};
struct sk_buff *skb;
struct mlx5e_ptp_port_ts_cqe_list *cqe_list = ptpsq->ts_cqe_pending_list;
ktime_t timeout = ns_to_ktime(MLX5E_PTP_TS_CQE_UNDELIVERED_TIMEOUT);
struct mlx5e_ptp_metadata_map *metadata_map = &ptpsq->metadata_map;
struct mlx5e_ptp_port_ts_cqe_tracker *pos, *n;

spin_lock(&cqe_list->tracker_list_lock);
list_for_each_entry_safe(pos, n, &cqe_list->tracker_list_head, entry) {
struct sk_buff *skb =
mlx5e_ptp_metadata_map_lookup(metadata_map, pos->metadata_id);
ktime_t dma_tstamp = mlx5e_skb_cb_get_hwts(skb)->cqe_hwtstamp;

ptpsq->cq_stats->resync_event++;
if (!dma_tstamp ||
ktime_after(ktime_add(dma_tstamp, timeout), port_tstamp))
break;

while (skb_ci != skb_id) {
skb = mlx5e_skb_fifo_pop(&ptpsq->skb_fifo);
hwts.hwtstamp = mlx5e_skb_cb_get_hwts(skb)->cqe_hwtstamp;
skb_tstamp_tx(skb, &hwts);
ptpsq->cq_stats->resync_cqe++;
napi_consume_skb(skb, budget);
skb_ci = PTP_WQE_CTR2IDX(ptpsq->skb_fifo_cc);
metadata_map->undelivered_counter++;
WARN_ON_ONCE(!pos->inuse);
pos->inuse = false;
list_del(&pos->entry);
}
spin_unlock(&cqe_list->tracker_list_lock);
}

#define PTP_WQE_CTR2IDX(val) ((val) & ptpsq->ts_cqe_ctr_mask)

static void mlx5e_ptp_handle_ts_cqe(struct mlx5e_ptpsq *ptpsq,
struct mlx5_cqe64 *cqe,
int budget)
{
u16 skb_id = PTP_WQE_CTR2IDX(be16_to_cpu(cqe->wqe_counter));
u16 skb_ci = PTP_WQE_CTR2IDX(ptpsq->skb_fifo_cc);
struct mlx5e_ptp_port_ts_cqe_list *pending_cqe_list = ptpsq->ts_cqe_pending_list;
u8 metadata_id = PTP_WQE_CTR2IDX(be16_to_cpu(cqe->wqe_counter));
bool is_err_cqe = !!MLX5E_RX_ERR_CQE(cqe);
struct mlx5e_txqsq *sq = &ptpsq->txqsq;
struct sk_buff *skb;
ktime_t hwtstamp;

if (unlikely(MLX5E_RX_ERR_CQE(cqe))) {
skb = mlx5e_skb_fifo_pop(&ptpsq->skb_fifo);
ptpsq->cq_stats->err_cqe++;
goto out;
if (likely(pending_cqe_list->nodes[metadata_id].inuse)) {
mlx5e_ptp_port_ts_cqe_list_remove(pending_cqe_list, metadata_id);
} else {
/* Reclaim space in the unlikely event CQE was delivered after
* marking it late.
*/
ptpsq->metadata_map.undelivered_counter--;
ptpsq->cq_stats->late_cqe++;
}

if (mlx5e_ptp_ts_cqe_drop(ptpsq, skb_ci, skb_id)) {
if (mlx5e_ptp_ts_cqe_ooo(ptpsq, skb_id)) {
/* already handled by a previous resync */
ptpsq->cq_stats->ooo_cqe_drop++;
return;
}
mlx5e_ptp_skb_fifo_ts_cqe_resync(ptpsq, skb_ci, skb_id, budget);
skb = mlx5e_ptp_metadata_map_remove(&ptpsq->metadata_map, metadata_id);

if (unlikely(is_err_cqe)) {
ptpsq->cq_stats->err_cqe++;
goto out;
}

skb = mlx5e_skb_fifo_pop(&ptpsq->skb_fifo);
hwtstamp = mlx5e_cqe_ts_to_ns(sq->ptp_cyc2time, sq->clock, get_cqe_ts(cqe));
mlx5e_skb_cb_hwtstamp_handler(skb, MLX5E_SKB_CB_PORT_HWTSTAMP,
hwtstamp, ptpsq->cq_stats);
ptpsq->cq_stats->cqe++;

mlx5e_ptpsq_mark_ts_cqes_undelivered(ptpsq, hwtstamp);
out:
napi_consume_skb(skb, budget);
mlx5e_ptp_metadata_fifo_push(&ptpsq->metadata_freelist, metadata_id);
}

static bool mlx5e_ptp_poll_ts_cq(struct mlx5e_cq *cq, int budget)
Expand Down Expand Up @@ -291,36 +348,78 @@ static void mlx5e_ptp_destroy_sq(struct mlx5_core_dev *mdev, u32 sqn)

static int mlx5e_ptp_alloc_traffic_db(struct mlx5e_ptpsq *ptpsq, int numa)
{
int wq_sz = mlx5_wq_cyc_get_size(&ptpsq->txqsq.wq);
struct mlx5_core_dev *mdev = ptpsq->txqsq.mdev;
struct mlx5e_ptp_metadata_fifo *metadata_freelist = &ptpsq->metadata_freelist;
struct mlx5e_ptp_metadata_map *metadata_map = &ptpsq->metadata_map;
struct mlx5e_ptp_port_ts_cqe_list *cqe_list;
int db_sz;
int md;

ptpsq->skb_fifo.fifo = kvzalloc_node(array_size(wq_sz, sizeof(*ptpsq->skb_fifo.fifo)),
GFP_KERNEL, numa);
if (!ptpsq->skb_fifo.fifo)
cqe_list = kvzalloc_node(sizeof(*ptpsq->ts_cqe_pending_list), GFP_KERNEL, numa);
if (!cqe_list)
return -ENOMEM;
ptpsq->ts_cqe_pending_list = cqe_list;

db_sz = min_t(u32, mlx5_wq_cyc_get_size(&ptpsq->txqsq.wq),
1 << MLX5_CAP_GEN_2(ptpsq->txqsq.mdev,
ts_cqe_metadata_size2wqe_counter));
ptpsq->ts_cqe_ctr_mask = db_sz - 1;

cqe_list->nodes = kvzalloc_node(array_size(db_sz, sizeof(*cqe_list->nodes)),
GFP_KERNEL, numa);
if (!cqe_list->nodes)
goto free_cqe_list;
INIT_LIST_HEAD(&cqe_list->tracker_list_head);
spin_lock_init(&cqe_list->tracker_list_lock);

metadata_freelist->data =
kvzalloc_node(array_size(db_sz, sizeof(*metadata_freelist->data)),
GFP_KERNEL, numa);
if (!metadata_freelist->data)
goto free_cqe_list_nodes;
metadata_freelist->mask = ptpsq->ts_cqe_ctr_mask;

for (md = 0; md < db_sz; ++md) {
cqe_list->nodes[md].metadata_id = md;
metadata_freelist->data[md] = md;
}
metadata_freelist->pc = db_sz;

metadata_map->data =
kvzalloc_node(array_size(db_sz, sizeof(*metadata_map->data)),
GFP_KERNEL, numa);
if (!metadata_map->data)
goto free_metadata_freelist;
metadata_map->capacity = db_sz;

ptpsq->skb_fifo.pc = &ptpsq->skb_fifo_pc;
ptpsq->skb_fifo.cc = &ptpsq->skb_fifo_cc;
ptpsq->skb_fifo.mask = wq_sz - 1;
if (MLX5_CAP_GEN_2(mdev, ts_cqe_metadata_size2wqe_counter))
ptpsq->ts_cqe_ctr_mask =
(1 << MLX5_CAP_GEN_2(mdev, ts_cqe_metadata_size2wqe_counter)) - 1;
return 0;

free_metadata_freelist:
kvfree(metadata_freelist->data);
free_cqe_list_nodes:
kvfree(cqe_list->nodes);
free_cqe_list:
kvfree(cqe_list);
return -ENOMEM;
}

static void mlx5e_ptp_drain_skb_fifo(struct mlx5e_skb_fifo *skb_fifo)
static void mlx5e_ptp_drain_metadata_map(struct mlx5e_ptp_metadata_map *map)
{
while (*skb_fifo->pc != *skb_fifo->cc) {
struct sk_buff *skb = mlx5e_skb_fifo_pop(skb_fifo);
int idx;

for (idx = 0; idx < map->capacity; ++idx) {
struct sk_buff *skb = map->data[idx];

dev_kfree_skb_any(skb);
}
}

static void mlx5e_ptp_free_traffic_db(struct mlx5e_skb_fifo *skb_fifo)
static void mlx5e_ptp_free_traffic_db(struct mlx5e_ptpsq *ptpsq)
{
mlx5e_ptp_drain_skb_fifo(skb_fifo);
kvfree(skb_fifo->fifo);
mlx5e_ptp_drain_metadata_map(&ptpsq->metadata_map);
kvfree(ptpsq->metadata_map.data);
kvfree(ptpsq->metadata_freelist.data);
kvfree(ptpsq->ts_cqe_pending_list->nodes);
kvfree(ptpsq->ts_cqe_pending_list);
}

static int mlx5e_ptp_open_txqsq(struct mlx5e_ptp *c, u32 tisn,
Expand Down Expand Up @@ -348,8 +447,7 @@ static int mlx5e_ptp_open_txqsq(struct mlx5e_ptp *c, u32 tisn,
if (err)
goto err_free_txqsq;

err = mlx5e_ptp_alloc_traffic_db(ptpsq,
dev_to_node(mlx5_core_dma_dev(c->mdev)));
err = mlx5e_ptp_alloc_traffic_db(ptpsq, dev_to_node(mlx5_core_dma_dev(c->mdev)));
if (err)
goto err_free_txqsq;

Expand All @@ -366,7 +464,7 @@ static void mlx5e_ptp_close_txqsq(struct mlx5e_ptpsq *ptpsq)
struct mlx5e_txqsq *sq = &ptpsq->txqsq;
struct mlx5_core_dev *mdev = sq->mdev;

mlx5e_ptp_free_traffic_db(&ptpsq->skb_fifo);
mlx5e_ptp_free_traffic_db(ptpsq);
cancel_work_sync(&sq->recover_work);
mlx5e_ptp_destroy_sq(mdev, sq->sqn);
mlx5e_free_txqsq_descs(sq);
Expand Down Expand Up @@ -534,7 +632,10 @@ static void mlx5e_ptp_build_params(struct mlx5e_ptp *c,

/* SQ */
if (test_bit(MLX5E_PTP_STATE_TX, c->state)) {
params->log_sq_size = orig->log_sq_size;
params->log_sq_size =
min(MLX5_CAP_GEN_2(c->mdev, ts_cqe_metadata_size2wqe_counter),
MLX5E_PTP_MAX_LOG_SQ_SIZE);
params->log_sq_size = min(params->log_sq_size, orig->log_sq_size);
mlx5e_ptp_build_sq_param(c->mdev, params, &cparams->txq_sq_param);
}
/* RQ */
Expand Down
Loading

0 comments on commit 3178308

Please sign in to comment.