mlx4: add page recycling in receive path

Same technique as some Intel drivers, for arches where PAGE_SIZE = 4096.

In most cases, pages are reused because they were consumed
before we could loop around the RX ring.

This brings back performance, and is even better:
a single TCP flow reaches 30Gbit on my hosts.

v2: added full memset() in mlx4_en_free_frag(), as Tariq found it was needed
if we switch to large MTU, as priv->log_rx_info can dynamically be changed.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
Eric Dumazet 2017-03-08 08:17:13 -08:00 committed by David S. Miller
parent b5a54d9a31
commit 34db548bfb
2 changed files with 82 additions and 177 deletions

View File

@ -50,9 +50,8 @@
#include "mlx4_en.h" #include "mlx4_en.h"
static int mlx4_alloc_pages(struct mlx4_en_priv *priv, static int mlx4_alloc_page(struct mlx4_en_priv *priv,
struct mlx4_en_rx_alloc *page_alloc, struct mlx4_en_rx_alloc *frag,
const struct mlx4_en_frag_info *frag_info,
gfp_t gfp) gfp_t gfp)
{ {
struct page *page; struct page *page;
@ -63,145 +62,46 @@ static int mlx4_alloc_pages(struct mlx4_en_priv *priv,
return -ENOMEM; return -ENOMEM;
dma = dma_map_page(priv->ddev, page, 0, PAGE_SIZE, priv->dma_dir); dma = dma_map_page(priv->ddev, page, 0, PAGE_SIZE, priv->dma_dir);
if (unlikely(dma_mapping_error(priv->ddev, dma))) { if (unlikely(dma_mapping_error(priv->ddev, dma))) {
put_page(page); __free_page(page);
return -ENOMEM; return -ENOMEM;
} }
page_alloc->page = page; frag->page = page;
page_alloc->dma = dma; frag->dma = dma;
page_alloc->page_offset = 0; frag->page_offset = priv->rx_headroom;
/* Not doing get_page() for each frag is a big win
* on asymetric workloads. Note we can not use atomic_set().
*/
page_ref_add(page, PAGE_SIZE / frag_info->frag_stride - 1);
return 0; return 0;
} }
static int mlx4_en_alloc_frags(struct mlx4_en_priv *priv, static int mlx4_en_alloc_frags(struct mlx4_en_priv *priv,
struct mlx4_en_rx_desc *rx_desc, struct mlx4_en_rx_desc *rx_desc,
struct mlx4_en_rx_alloc *frags, struct mlx4_en_rx_alloc *frags,
struct mlx4_en_rx_alloc *ring_alloc,
gfp_t gfp) gfp_t gfp)
{ {
struct mlx4_en_rx_alloc page_alloc[MLX4_EN_MAX_RX_FRAGS];
const struct mlx4_en_frag_info *frag_info;
struct page *page;
int i; int i;
for (i = 0; i < priv->num_frags; i++) { for (i = 0; i < priv->num_frags; i++, frags++) {
frag_info = &priv->frag_info[i]; if (!frags->page && mlx4_alloc_page(priv, frags, gfp))
page_alloc[i] = ring_alloc[i];
page_alloc[i].page_offset += frag_info->frag_stride;
if (page_alloc[i].page_offset + frag_info->frag_stride <=
PAGE_SIZE)
continue;
if (unlikely(mlx4_alloc_pages(priv, &page_alloc[i],
frag_info, gfp)))
goto out;
}
for (i = 0; i < priv->num_frags; i++) {
frags[i] = ring_alloc[i];
frags[i].page_offset += priv->rx_headroom;
rx_desc->data[i].addr = cpu_to_be64(frags[i].dma +
frags[i].page_offset);
ring_alloc[i] = page_alloc[i];
}
return 0;
out:
while (i--) {
if (page_alloc[i].page != ring_alloc[i].page) {
dma_unmap_page(priv->ddev, page_alloc[i].dma,
PAGE_SIZE, priv->dma_dir);
page = page_alloc[i].page;
/* Revert changes done by mlx4_alloc_pages */
page_ref_sub(page, PAGE_SIZE /
priv->frag_info[i].frag_stride - 1);
put_page(page);
}
}
return -ENOMEM; return -ENOMEM;
} rx_desc->data[i].addr = cpu_to_be64(frags->dma +
frags->page_offset);
static void mlx4_en_free_frag(struct mlx4_en_priv *priv,
struct mlx4_en_rx_alloc *frags,
int i)
{
const struct mlx4_en_frag_info *frag_info = &priv->frag_info[i];
u32 next_frag_end = frags[i].page_offset + 2 * frag_info->frag_stride;
if (next_frag_end > PAGE_SIZE)
dma_unmap_page(priv->ddev, frags[i].dma, PAGE_SIZE,
priv->dma_dir);
if (frags[i].page)
put_page(frags[i].page);
}
static int mlx4_en_init_allocator(struct mlx4_en_priv *priv,
struct mlx4_en_rx_ring *ring)
{
int i;
struct mlx4_en_rx_alloc *page_alloc;
for (i = 0; i < priv->num_frags; i++) {
const struct mlx4_en_frag_info *frag_info = &priv->frag_info[i];
if (mlx4_alloc_pages(priv, &ring->page_alloc[i],
frag_info, GFP_KERNEL | __GFP_COLD))
goto out;
en_dbg(DRV, priv, " frag %d allocator: - frags:%d\n",
i, page_ref_count(ring->page_alloc[i].page));
} }
return 0; return 0;
out:
while (i--) {
struct page *page;
page_alloc = &ring->page_alloc[i];
dma_unmap_page(priv->ddev, page_alloc->dma,
PAGE_SIZE, priv->dma_dir);
page = page_alloc->page;
/* Revert changes done by mlx4_alloc_pages */
page_ref_sub(page, PAGE_SIZE /
priv->frag_info[i].frag_stride - 1);
put_page(page);
page_alloc->page = NULL;
}
return -ENOMEM;
} }
static void mlx4_en_destroy_allocator(struct mlx4_en_priv *priv, static void mlx4_en_free_frag(const struct mlx4_en_priv *priv,
struct mlx4_en_rx_ring *ring) struct mlx4_en_rx_alloc *frag)
{ {
struct mlx4_en_rx_alloc *page_alloc; if (frag->page) {
int i; dma_unmap_page(priv->ddev, frag->dma,
for (i = 0; i < priv->num_frags; i++) {
const struct mlx4_en_frag_info *frag_info = &priv->frag_info[i];
page_alloc = &ring->page_alloc[i];
en_dbg(DRV, priv, "Freeing allocator:%d count:%d\n",
i, page_count(page_alloc->page));
dma_unmap_page(priv->ddev, page_alloc->dma,
PAGE_SIZE, priv->dma_dir); PAGE_SIZE, priv->dma_dir);
while (page_alloc->page_offset + frag_info->frag_stride < __free_page(frag->page);
PAGE_SIZE) {
put_page(page_alloc->page);
page_alloc->page_offset += frag_info->frag_stride;
}
page_alloc->page = NULL;
} }
/* We need to clear all fields, otherwise a change of priv->log_rx_info
* could lead to see garbage later in frag->page.
*/
memset(frag, 0, sizeof(*frag));
} }
static void mlx4_en_init_rx_desc(struct mlx4_en_priv *priv, static void mlx4_en_init_rx_desc(const struct mlx4_en_priv *priv,
struct mlx4_en_rx_ring *ring, int index) struct mlx4_en_rx_ring *ring, int index)
{ {
struct mlx4_en_rx_desc *rx_desc = ring->buf + ring->stride * index; struct mlx4_en_rx_desc *rx_desc = ring->buf + ring->stride * index;
@ -235,19 +135,22 @@ static int mlx4_en_prepare_rx_desc(struct mlx4_en_priv *priv,
(index << priv->log_rx_info); (index << priv->log_rx_info);
if (ring->page_cache.index > 0) { if (ring->page_cache.index > 0) {
/* XDP uses a single page per frame */
if (!frags->page) {
ring->page_cache.index--; ring->page_cache.index--;
frags[0].page = ring->page_cache.buf[ring->page_cache.index].page; frags->page = ring->page_cache.buf[ring->page_cache.index].page;
frags[0].dma = ring->page_cache.buf[ring->page_cache.index].dma; frags->dma = ring->page_cache.buf[ring->page_cache.index].dma;
frags[0].page_offset = XDP_PACKET_HEADROOM; }
rx_desc->data[0].addr = cpu_to_be64(frags[0].dma + frags->page_offset = XDP_PACKET_HEADROOM;
frags[0].page_offset); rx_desc->data[0].addr = cpu_to_be64(frags->dma +
XDP_PACKET_HEADROOM);
return 0; return 0;
} }
return mlx4_en_alloc_frags(priv, rx_desc, frags, ring->page_alloc, gfp); return mlx4_en_alloc_frags(priv, rx_desc, frags, gfp);
} }
static inline bool mlx4_en_is_ring_empty(struct mlx4_en_rx_ring *ring) static bool mlx4_en_is_ring_empty(const struct mlx4_en_rx_ring *ring)
{ {
return ring->prod == ring->cons; return ring->prod == ring->cons;
} }
@ -257,7 +160,8 @@ static inline void mlx4_en_update_rx_prod_db(struct mlx4_en_rx_ring *ring)
*ring->wqres.db.db = cpu_to_be32(ring->prod & 0xffff); *ring->wqres.db.db = cpu_to_be32(ring->prod & 0xffff);
} }
static void mlx4_en_free_rx_desc(struct mlx4_en_priv *priv, /* slow path */
static void mlx4_en_free_rx_desc(const struct mlx4_en_priv *priv,
struct mlx4_en_rx_ring *ring, struct mlx4_en_rx_ring *ring,
int index) int index)
{ {
@ -267,7 +171,7 @@ static void mlx4_en_free_rx_desc(struct mlx4_en_priv *priv,
frags = ring->rx_info + (index << priv->log_rx_info); frags = ring->rx_info + (index << priv->log_rx_info);
for (nr = 0; nr < priv->num_frags; nr++) { for (nr = 0; nr < priv->num_frags; nr++) {
en_dbg(DRV, priv, "Freeing fragment:%d\n", nr); en_dbg(DRV, priv, "Freeing fragment:%d\n", nr);
mlx4_en_free_frag(priv, frags, nr); mlx4_en_free_frag(priv, frags + nr);
} }
} }
@ -323,12 +227,12 @@ static void mlx4_en_free_rx_buf(struct mlx4_en_priv *priv,
ring->cons, ring->prod); ring->cons, ring->prod);
/* Unmap and free Rx buffers */ /* Unmap and free Rx buffers */
while (!mlx4_en_is_ring_empty(ring)) { for (index = 0; index < ring->size; index++) {
index = ring->cons & ring->size_mask;
en_dbg(DRV, priv, "Processing descriptor:%d\n", index); en_dbg(DRV, priv, "Processing descriptor:%d\n", index);
mlx4_en_free_rx_desc(priv, ring, index); mlx4_en_free_rx_desc(priv, ring, index);
++ring->cons;
} }
ring->cons = 0;
ring->prod = 0;
} }
void mlx4_en_set_num_rx_rings(struct mlx4_en_dev *mdev) void mlx4_en_set_num_rx_rings(struct mlx4_en_dev *mdev)
@ -380,9 +284,9 @@ int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv,
tmp = size * roundup_pow_of_two(MLX4_EN_MAX_RX_FRAGS * tmp = size * roundup_pow_of_two(MLX4_EN_MAX_RX_FRAGS *
sizeof(struct mlx4_en_rx_alloc)); sizeof(struct mlx4_en_rx_alloc));
ring->rx_info = vmalloc_node(tmp, node); ring->rx_info = vzalloc_node(tmp, node);
if (!ring->rx_info) { if (!ring->rx_info) {
ring->rx_info = vmalloc(tmp); ring->rx_info = vzalloc(tmp);
if (!ring->rx_info) { if (!ring->rx_info) {
err = -ENOMEM; err = -ENOMEM;
goto err_ring; goto err_ring;
@ -452,16 +356,6 @@ int mlx4_en_activate_rx_rings(struct mlx4_en_priv *priv)
/* Initialize all descriptors */ /* Initialize all descriptors */
for (i = 0; i < ring->size; i++) for (i = 0; i < ring->size; i++)
mlx4_en_init_rx_desc(priv, ring, i); mlx4_en_init_rx_desc(priv, ring, i);
/* Initialize page allocators */
err = mlx4_en_init_allocator(priv, ring);
if (err) {
en_err(priv, "Failed initializing ring allocator\n");
if (ring->stride <= TXBB_SIZE)
ring->buf -= TXBB_SIZE;
ring_ind--;
goto err_allocator;
}
} }
err = mlx4_en_fill_rx_buffers(priv); err = mlx4_en_fill_rx_buffers(priv);
if (err) if (err)
@ -481,11 +375,9 @@ int mlx4_en_activate_rx_rings(struct mlx4_en_priv *priv)
mlx4_en_free_rx_buf(priv, priv->rx_ring[ring_ind]); mlx4_en_free_rx_buf(priv, priv->rx_ring[ring_ind]);
ring_ind = priv->rx_ring_num - 1; ring_ind = priv->rx_ring_num - 1;
err_allocator:
while (ring_ind >= 0) { while (ring_ind >= 0) {
if (priv->rx_ring[ring_ind]->stride <= TXBB_SIZE) if (priv->rx_ring[ring_ind]->stride <= TXBB_SIZE)
priv->rx_ring[ring_ind]->buf -= TXBB_SIZE; priv->rx_ring[ring_ind]->buf -= TXBB_SIZE;
mlx4_en_destroy_allocator(priv, priv->rx_ring[ring_ind]);
ring_ind--; ring_ind--;
} }
return err; return err;
@ -565,50 +457,68 @@ void mlx4_en_deactivate_rx_ring(struct mlx4_en_priv *priv,
mlx4_en_free_rx_buf(priv, ring); mlx4_en_free_rx_buf(priv, ring);
if (ring->stride <= TXBB_SIZE) if (ring->stride <= TXBB_SIZE)
ring->buf -= TXBB_SIZE; ring->buf -= TXBB_SIZE;
mlx4_en_destroy_allocator(priv, ring);
} }
static int mlx4_en_complete_rx_desc(struct mlx4_en_priv *priv, static int mlx4_en_complete_rx_desc(struct mlx4_en_priv *priv,
struct mlx4_en_rx_desc *rx_desc,
struct mlx4_en_rx_alloc *frags, struct mlx4_en_rx_alloc *frags,
struct sk_buff *skb, struct sk_buff *skb,
int length) int length)
{ {
struct skb_frag_struct *skb_frags_rx = skb_shinfo(skb)->frags; const struct mlx4_en_frag_info *frag_info = priv->frag_info;
struct mlx4_en_frag_info *frag_info = priv->frag_info; unsigned int truesize = 0;
int nr, frag_size; int nr, frag_size;
struct page *page;
dma_addr_t dma; dma_addr_t dma;
bool release;
/* Collect used fragments while replacing them in the HW descriptors */ /* Collect used fragments while replacing them in the HW descriptors */
for (nr = 0;;) { for (nr = 0;; frags++) {
frag_size = min_t(int, length, frag_info->frag_size); frag_size = min_t(int, length, frag_info->frag_size);
if (unlikely(!frags[nr].page)) page = frags->page;
if (unlikely(!page))
goto fail; goto fail;
dma = be64_to_cpu(rx_desc->data[nr].addr); dma = frags->dma;
dma_sync_single_for_cpu(priv->ddev, dma, frag_info->frag_size, dma_sync_single_range_for_cpu(priv->ddev, dma, frags->page_offset,
DMA_FROM_DEVICE); frag_size, priv->dma_dir);
__skb_fill_page_desc(skb, nr, frags[nr].page, __skb_fill_page_desc(skb, nr, page, frags->page_offset,
frags[nr].page_offset,
frag_size); frag_size);
skb->truesize += frag_info->frag_stride; truesize += frag_info->frag_stride;
frags[nr].page = NULL; if (frag_info->frag_stride == PAGE_SIZE / 2) {
frags->page_offset ^= PAGE_SIZE / 2;
release = page_count(page) != 1 ||
page_is_pfmemalloc(page) ||
page_to_nid(page) != numa_mem_id();
} else {
u32 sz_align = ALIGN(frag_size, SMP_CACHE_BYTES);
frags->page_offset += sz_align;
release = frags->page_offset + frag_info->frag_size > PAGE_SIZE;
}
if (release) {
dma_unmap_page(priv->ddev, dma, PAGE_SIZE, priv->dma_dir);
frags->page = NULL;
} else {
page_ref_inc(page);
}
nr++; nr++;
length -= frag_size; length -= frag_size;
if (!length) if (!length)
break; break;
frag_info++; frag_info++;
} }
skb->truesize += truesize;
return nr; return nr;
fail: fail:
while (nr > 0) { while (nr > 0) {
nr--; nr--;
__skb_frag_unref(&skb_frags_rx[nr]); __skb_frag_unref(skb_shinfo(skb)->frags + nr);
} }
return 0; return 0;
} }
@ -639,7 +549,8 @@ static struct sk_buff *mlx4_en_rx_skb(struct mlx4_en_priv *priv,
if (length <= SMALL_PACKET_SIZE) { if (length <= SMALL_PACKET_SIZE) {
/* We are copying all relevant data to the skb - temporarily /* We are copying all relevant data to the skb - temporarily
* sync buffers for the copy */ * sync buffers for the copy */
dma = be64_to_cpu(rx_desc->data[0].addr);
dma = frags[0].dma + frags[0].page_offset;
dma_sync_single_for_cpu(priv->ddev, dma, length, dma_sync_single_for_cpu(priv->ddev, dma, length,
DMA_FROM_DEVICE); DMA_FROM_DEVICE);
skb_copy_to_linear_data(skb, va, length); skb_copy_to_linear_data(skb, va, length);
@ -648,7 +559,7 @@ static struct sk_buff *mlx4_en_rx_skb(struct mlx4_en_priv *priv,
unsigned int pull_len; unsigned int pull_len;
/* Move relevant fragments to skb */ /* Move relevant fragments to skb */
used_frags = mlx4_en_complete_rx_desc(priv, rx_desc, frags, used_frags = mlx4_en_complete_rx_desc(priv, frags,
skb, length); skb, length);
if (unlikely(!used_frags)) { if (unlikely(!used_frags)) {
kfree_skb(skb); kfree_skb(skb);
@ -916,8 +827,10 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
case XDP_TX: case XDP_TX:
if (likely(!mlx4_en_xmit_frame(ring, frags, dev, if (likely(!mlx4_en_xmit_frame(ring, frags, dev,
length, cq->ring, length, cq->ring,
&doorbell_pending))) &doorbell_pending))) {
goto consumed; frags[0].page = NULL;
goto next;
}
trace_xdp_exception(dev, xdp_prog, act); trace_xdp_exception(dev, xdp_prog, act);
goto xdp_drop_no_cnt; /* Drop on xmit failure */ goto xdp_drop_no_cnt; /* Drop on xmit failure */
default: default:
@ -927,8 +840,6 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
case XDP_DROP: case XDP_DROP:
ring->xdp_drop++; ring->xdp_drop++;
xdp_drop_no_cnt: xdp_drop_no_cnt:
if (likely(mlx4_en_rx_recycle(ring, frags)))
goto consumed;
goto next; goto next;
} }
} }
@ -974,8 +885,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
if (!gro_skb) if (!gro_skb)
goto next; goto next;
nr = mlx4_en_complete_rx_desc(priv, nr = mlx4_en_complete_rx_desc(priv, frags, gro_skb,
rx_desc, frags, gro_skb,
length); length);
if (!nr) if (!nr)
goto next; goto next;
@ -1084,10 +994,6 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
napi_gro_receive(&cq->napi, skb); napi_gro_receive(&cq->napi, skb);
next: next:
for (nr = 0; nr < priv->num_frags; nr++)
mlx4_en_free_frag(priv, frags, nr);
consumed:
++cq->mcq.cons_index; ++cq->mcq.cons_index;
index = (cq->mcq.cons_index) & ring->size_mask; index = (cq->mcq.cons_index) & ring->size_mask;
cqe = mlx4_en_get_cqe(cq->buf, index, priv->cqe_size) + factor; cqe = mlx4_en_get_cqe(cq->buf, index, priv->cqe_size) + factor;

View File

@ -327,7 +327,6 @@ struct mlx4_en_rx_desc {
struct mlx4_en_rx_ring { struct mlx4_en_rx_ring {
struct mlx4_hwq_resources wqres; struct mlx4_hwq_resources wqres;
struct mlx4_en_rx_alloc page_alloc[MLX4_EN_MAX_RX_FRAGS];
u32 size ; /* number of Rx descs*/ u32 size ; /* number of Rx descs*/
u32 actual_size; u32 actual_size;
u32 size_mask; u32 size_mask;