net/mlx5e: Keep single pre-initialized UMR WQE per RQ

All UMR WQEs of an RQ share many common fields. We use
pre-initialized structures to save calculations in datapath.
One field (xlt_offset) was the only reason we saved a pre-initialized
copy per WQE index.
Here we remove its initialization (move its calculation to datapath),
and reduce the number of copies to one-per-RQ.

A very small datapath calculation is added, it occurs once per a MPWQE
(i.e. once every 256KB), but reduces memory consumption and gives
better cache utilization.

Performance testing:
Tested packet rate, no degradation sensed.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
This commit is contained in:
Tariq Toukan 2017-12-20 11:56:35 +02:00 committed by Saeed Mahameed
parent 9f9e9cd50e
commit b8a98a4cf3
3 changed files with 13 additions and 19 deletions

View File

@ -84,6 +84,7 @@
#define MLX5_MTT_OCTW(npages) (ALIGN(npages, 8) / 2) #define MLX5_MTT_OCTW(npages) (ALIGN(npages, 8) / 2)
#define MLX5E_REQUIRED_WQE_MTTS (ALIGN(MLX5_MPWRQ_PAGES_PER_WQE, 8)) #define MLX5E_REQUIRED_WQE_MTTS (ALIGN(MLX5_MPWRQ_PAGES_PER_WQE, 8))
#define MLX5E_LOG_ALIGNED_MPWQE_PPW (ilog2(MLX5E_REQUIRED_WQE_MTTS))
#define MLX5E_REQUIRED_MTTS(wqes) (wqes * MLX5E_REQUIRED_WQE_MTTS) #define MLX5E_REQUIRED_MTTS(wqes) (wqes * MLX5E_REQUIRED_WQE_MTTS)
#define MLX5E_MAX_RQ_NUM_MTTS \ #define MLX5E_MAX_RQ_NUM_MTTS \
((1 << 16) * 2) /* So that MLX5_MTT_OCTW(num_mtts) fits into u16 */ ((1 << 16) * 2) /* So that MLX5_MTT_OCTW(num_mtts) fits into u16 */
@ -450,7 +451,6 @@ struct mlx5e_wqe_frag_info {
struct mlx5e_umr_dma_info { struct mlx5e_umr_dma_info {
struct mlx5e_dma_info dma_info[MLX5_MPWRQ_PAGES_PER_WQE]; struct mlx5e_dma_info dma_info[MLX5_MPWRQ_PAGES_PER_WQE];
struct mlx5e_umr_wqe wqe;
}; };
struct mlx5e_mpw_info { struct mlx5e_mpw_info {
@ -496,6 +496,7 @@ struct mlx5e_rq {
}; };
} wqe; } wqe;
struct { struct {
struct mlx5e_umr_wqe umr_wqe;
struct mlx5e_mpw_info *info; struct mlx5e_mpw_info *info;
mlx5e_fp_skb_from_cqe_mpwrq skb_from_cqe_mpwrq; mlx5e_fp_skb_from_cqe_mpwrq skb_from_cqe_mpwrq;
u16 num_strides; u16 num_strides;
@ -978,11 +979,6 @@ static inline void mlx5e_cq_arm(struct mlx5e_cq *cq)
mlx5_cq_arm(mcq, MLX5_CQ_DB_REQ_NOT, mcq->uar->map, cq->wq.cc); mlx5_cq_arm(mcq, MLX5_CQ_DB_REQ_NOT, mcq->uar->map, cq->wq.cc);
} }
static inline u32 mlx5e_get_wqe_mtt_offset(struct mlx5e_rq *rq, u16 wqe_ix)
{
return wqe_ix * ALIGN(MLX5_MPWRQ_PAGES_PER_WQE, 8);
}
extern const struct ethtool_ops mlx5e_ethtool_ops; extern const struct ethtool_ops mlx5e_ethtool_ops;
#ifdef CONFIG_MLX5_CORE_EN_DCB #ifdef CONFIG_MLX5_CORE_EN_DCB
extern const struct dcbnl_rtnl_ops mlx5e_dcbnl_ops; extern const struct dcbnl_rtnl_ops mlx5e_dcbnl_ops;

View File

@ -306,13 +306,11 @@ static void mlx5e_disable_async_events(struct mlx5e_priv *priv)
static inline void mlx5e_build_umr_wqe(struct mlx5e_rq *rq, static inline void mlx5e_build_umr_wqe(struct mlx5e_rq *rq,
struct mlx5e_icosq *sq, struct mlx5e_icosq *sq,
struct mlx5e_umr_wqe *wqe, struct mlx5e_umr_wqe *wqe)
u16 ix)
{ {
struct mlx5_wqe_ctrl_seg *cseg = &wqe->ctrl; struct mlx5_wqe_ctrl_seg *cseg = &wqe->ctrl;
struct mlx5_wqe_umr_ctrl_seg *ucseg = &wqe->uctrl; struct mlx5_wqe_umr_ctrl_seg *ucseg = &wqe->uctrl;
u8 ds_cnt = DIV_ROUND_UP(MLX5E_UMR_WQE_INLINE_SZ, MLX5_SEND_WQE_DS); u8 ds_cnt = DIV_ROUND_UP(MLX5E_UMR_WQE_INLINE_SZ, MLX5_SEND_WQE_DS);
u32 umr_wqe_mtt_offset = mlx5e_get_wqe_mtt_offset(rq, ix);
cseg->qpn_ds = cpu_to_be32((sq->sqn << MLX5_WQE_CTRL_QPN_SHIFT) | cseg->qpn_ds = cpu_to_be32((sq->sqn << MLX5_WQE_CTRL_QPN_SHIFT) |
ds_cnt); ds_cnt);
@ -322,8 +320,6 @@ static inline void mlx5e_build_umr_wqe(struct mlx5e_rq *rq,
ucseg->flags = MLX5_UMR_TRANSLATION_OFFSET_EN | MLX5_UMR_INLINE; ucseg->flags = MLX5_UMR_TRANSLATION_OFFSET_EN | MLX5_UMR_INLINE;
ucseg->xlt_octowords = ucseg->xlt_octowords =
cpu_to_be16(MLX5_MTT_OCTW(MLX5_MPWRQ_PAGES_PER_WQE)); cpu_to_be16(MLX5_MTT_OCTW(MLX5_MPWRQ_PAGES_PER_WQE));
ucseg->bsf_octowords =
cpu_to_be16(MLX5_MTT_OCTW(umr_wqe_mtt_offset));
ucseg->mkey_mask = cpu_to_be64(MLX5_MKEY_MASK_FREE); ucseg->mkey_mask = cpu_to_be64(MLX5_MKEY_MASK_FREE);
} }
@ -331,18 +327,13 @@ static int mlx5e_rq_alloc_mpwqe_info(struct mlx5e_rq *rq,
struct mlx5e_channel *c) struct mlx5e_channel *c)
{ {
int wq_sz = mlx5_wq_ll_get_size(&rq->wq); int wq_sz = mlx5_wq_ll_get_size(&rq->wq);
int i;
rq->mpwqe.info = kzalloc_node(wq_sz * sizeof(*rq->mpwqe.info), rq->mpwqe.info = kzalloc_node(wq_sz * sizeof(*rq->mpwqe.info),
GFP_KERNEL, cpu_to_node(c->cpu)); GFP_KERNEL, cpu_to_node(c->cpu));
if (!rq->mpwqe.info) if (!rq->mpwqe.info)
return -ENOMEM; return -ENOMEM;
for (i = 0; i < wq_sz; i++) { mlx5e_build_umr_wqe(rq, &c->icosq, &rq->mpwqe.umr_wqe);
struct mlx5e_mpw_info *wi = &rq->mpwqe.info[i];
mlx5e_build_umr_wqe(rq, &c->icosq, &wi->umr.wqe, i);
}
return 0; return 0;
} }
@ -388,6 +379,11 @@ static int mlx5e_create_rq_umr_mkey(struct mlx5_core_dev *mdev, struct mlx5e_rq
return mlx5e_create_umr_mkey(mdev, num_mtts, PAGE_SHIFT, &rq->umr_mkey); return mlx5e_create_umr_mkey(mdev, num_mtts, PAGE_SHIFT, &rq->umr_mkey);
} }
static inline u64 mlx5e_get_mpwqe_offset(struct mlx5e_rq *rq, u16 wqe_ix)
{
return (wqe_ix << MLX5E_LOG_ALIGNED_MPWQE_PPW) << PAGE_SHIFT;
}
static int mlx5e_alloc_rq(struct mlx5e_channel *c, static int mlx5e_alloc_rq(struct mlx5e_channel *c,
struct mlx5e_params *params, struct mlx5e_params *params,
struct mlx5e_rq_param *rqp, struct mlx5e_rq_param *rqp,
@ -520,7 +516,7 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(&rq->wq, i); struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(&rq->wq, i);
if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) { if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) {
u64 dma_offset = (u64)mlx5e_get_wqe_mtt_offset(rq, i) << PAGE_SHIFT; u64 dma_offset = mlx5e_get_mpwqe_offset(rq, i);
wqe->data.addr = cpu_to_be64(dma_offset + rq->buff.headroom); wqe->data.addr = cpu_to_be64(dma_offset + rq->buff.headroom);
} }

View File

@ -373,6 +373,7 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
struct mlx5_wq_cyc *wq = &sq->wq; struct mlx5_wq_cyc *wq = &sq->wq;
struct mlx5e_umr_wqe *umr_wqe; struct mlx5e_umr_wqe *umr_wqe;
int cpy = offsetof(struct mlx5e_umr_wqe, inline_mtts); int cpy = offsetof(struct mlx5e_umr_wqe, inline_mtts);
u16 xlt_offset = ix << (MLX5E_LOG_ALIGNED_MPWQE_PPW - 1);
int err; int err;
u16 pi; u16 pi;
int i; int i;
@ -384,7 +385,7 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
} }
umr_wqe = mlx5_wq_cyc_get_wqe(wq, pi); umr_wqe = mlx5_wq_cyc_get_wqe(wq, pi);
memcpy(umr_wqe, &wi->umr.wqe, cpy); memcpy(umr_wqe, &rq->mpwqe.umr_wqe, cpy);
for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++, dma_info++) { for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++, dma_info++) {
err = mlx5e_page_alloc_mapped(rq, dma_info); err = mlx5e_page_alloc_mapped(rq, dma_info);
if (unlikely(err)) if (unlikely(err))
@ -400,6 +401,7 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
umr_wqe->ctrl.opmod_idx_opcode = umr_wqe->ctrl.opmod_idx_opcode =
cpu_to_be32((sq->pc << MLX5_WQE_CTRL_WQE_INDEX_SHIFT) | cpu_to_be32((sq->pc << MLX5_WQE_CTRL_WQE_INDEX_SHIFT) |
MLX5_OPCODE_UMR); MLX5_OPCODE_UMR);
umr_wqe->uctrl.xlt_offset = cpu_to_be16(xlt_offset);
sq->db.ico_wqe[pi].opcode = MLX5_OPCODE_UMR; sq->db.ico_wqe[pi].opcode = MLX5_OPCODE_UMR;
sq->pc += MLX5E_UMR_WQEBBS; sq->pc += MLX5E_UMR_WQEBBS;