Merge branch '40GbE' of git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next-queue
Tony Nguyen says:

====================
40GbE Intel Wired LAN Driver Updates 2020-09-14

This series contains updates to the i40e driver only.

Li RongQing removes the binding of the affinity mask to a fixed CPU and
makes the prefetch of the Rx buffer page conditional.

Björn provides AF_XDP performance improvements by not prefetching HW
descriptors, using 16 byte descriptors, and moving buffer allocation out
of the Rx processing loop.

v2: Define prefetch_page_address in a common header for patch 2. Dropped
previous patch 5 as it is being reworked to be more generalized.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
commit 7952d7edf3
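For readers skimming the diff below, here is a minimal, standalone C sketch (userspace, not driver code) of two of the patterns this series lands in i40e: the i40e_inc_ntc()-style wrap-around of next_to_clean, and deferring the Rx buffer refill until after the processing loop. All names in the sketch (demo_ring, demo_inc_ntc, demo_refill, REFILL_THRESHOLD) are illustrative stand-ins, not the driver's identifiers.

/* Illustrative sketch only -- mirrors the shape of the i40e changes,
 * not the actual kernel code. */
#include <stdbool.h>
#include <stdio.h>

#define RING_COUNT       8
#define REFILL_THRESHOLD 4	/* stand-in for I40E_RX_BUFFER_WRITE */

struct demo_ring {
	unsigned int count;
	unsigned int next_to_clean;
};

/* Same wrap logic as the new i40e_inc_ntc() helpers: bump the index
 * and wrap to 0 at ring->count. */
static void demo_inc_ntc(struct demo_ring *ring)
{
	unsigned int ntc = ring->next_to_clean + 1;

	ring->next_to_clean = (ntc < ring->count) ? ntc : 0;
}

/* Stand-in for the buffer refill call; returns true on success. */
static bool demo_refill(struct demo_ring *ring, unsigned int n)
{
	printf("refilling %u buffers\n", n);
	return true;
}

int main(void)
{
	struct demo_ring ring = { .count = RING_COUNT };
	unsigned int cleaned = 0;
	bool failure = false;

	/* Process a budget of descriptors; the refill happens once after
	 * the loop instead of inside it, which is the shape of the
	 * reworked zero-copy Rx path in this series. */
	for (int budget = 6; budget > 0; budget--) {
		demo_inc_ntc(&ring);
		cleaned++;
	}

	if (cleaned >= REFILL_THRESHOLD)
		failure = !demo_refill(&ring, cleaned);

	printf("next_to_clean = %u, failure = %d\n",
	       ring.next_to_clean, failure);
	return 0;
}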
@@ -90,7 +90,7 @@
 #define I40E_OEM_RELEASE_MASK 0x0000ffff
 
 #define I40E_RX_DESC(R, i) \
-	(&(((union i40e_32byte_rx_desc *)((R)->desc))[i]))
+	(&(((union i40e_rx_desc *)((R)->desc))[i]))
 #define I40E_TX_DESC(R, i) \
 	(&(((struct i40e_tx_desc *)((R)->desc))[i]))
 #define I40E_TX_CTXTDESC(R, i) \
@@ -604,10 +604,9 @@ static void i40e_dbg_dump_desc(int cnt, int vsi_seid, int ring_id, int desc_n,
 			} else {
 				rxd = I40E_RX_DESC(ring, i);
 				dev_info(&pf->pdev->dev,
-					 " d[%03x] = 0x%016llx 0x%016llx 0x%016llx 0x%016llx\n",
+					 " d[%03x] = 0x%016llx 0x%016llx\n",
 					 i, rxd->read.pkt_addr,
-					 rxd->read.hdr_addr,
-					 rxd->read.rsvd1, rxd->read.rsvd2);
+					 rxd->read.hdr_addr);
 			}
 		}
 	} else if (cnt == 3) {
@@ -625,10 +624,9 @@ static void i40e_dbg_dump_desc(int cnt, int vsi_seid, int ring_id, int desc_n,
 		} else {
 			rxd = I40E_RX_DESC(ring, desc_n);
 			dev_info(&pf->pdev->dev,
-				 "vsi = %02i rx ring = %02i d[%03x] = 0x%016llx 0x%016llx 0x%016llx 0x%016llx\n",
+				 "vsi = %02i rx ring = %02i d[%03x] = 0x%016llx 0x%016llx\n",
				 vsi_seid, ring_id, desc_n,
-				 rxd->read.pkt_addr, rxd->read.hdr_addr,
-				 rxd->read.rsvd1, rxd->read.rsvd2);
+				 rxd->read.pkt_addr, rxd->read.hdr_addr);
 		}
 	} else {
 		dev_info(&pf->pdev->dev, "dump desc rx/tx/xdp <vsi_seid> <ring_id> [<desc_n>]\n");
@@ -3321,8 +3321,8 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring)
 	rx_ctx.base = (ring->dma / 128);
 	rx_ctx.qlen = ring->count;
 
-	/* use 32 byte descriptors */
-	rx_ctx.dsize = 1;
+	/* use 16 byte descriptors */
+	rx_ctx.dsize = 0;
 
 	/* descriptor type is always zero
 	 * rx_ctx.dtype = 0;
@@ -11186,11 +11186,10 @@ static int i40e_init_msix(struct i40e_pf *pf)
  * i40e_vsi_alloc_q_vector - Allocate memory for a single interrupt vector
  * @vsi: the VSI being configured
  * @v_idx: index of the vector in the vsi struct
- * @cpu: cpu to be used on affinity_mask
  *
  * We allocate one q_vector. If allocation fails we return -ENOMEM.
  **/
-static int i40e_vsi_alloc_q_vector(struct i40e_vsi *vsi, int v_idx, int cpu)
+static int i40e_vsi_alloc_q_vector(struct i40e_vsi *vsi, int v_idx)
 {
 	struct i40e_q_vector *q_vector;
 
@@ -11223,7 +11222,7 @@ static int i40e_vsi_alloc_q_vector(struct i40e_vsi *vsi, int v_idx, int cpu)
 static int i40e_vsi_alloc_q_vectors(struct i40e_vsi *vsi)
 {
 	struct i40e_pf *pf = vsi->back;
-	int err, v_idx, num_q_vectors, current_cpu;
+	int err, v_idx, num_q_vectors;
 
 	/* if not MSIX, give the one vector only to the LAN VSI */
 	if (pf->flags & I40E_FLAG_MSIX_ENABLED)
@@ -11233,15 +11232,10 @@ static int i40e_vsi_alloc_q_vectors(struct i40e_vsi *vsi)
 	else
 		return -EINVAL;
 
-	current_cpu = cpumask_first(cpu_online_mask);
-
 	for (v_idx = 0; v_idx < num_q_vectors; v_idx++) {
-		err = i40e_vsi_alloc_q_vector(vsi, v_idx, current_cpu);
+		err = i40e_vsi_alloc_q_vector(vsi, v_idx);
 		if (err)
 			goto err_out;
-		current_cpu = cpumask_next(current_cpu, cpu_online_mask);
-		if (unlikely(current_cpu >= nr_cpu_ids))
-			current_cpu = cpumask_first(cpu_online_mask);
 	}
 
 	return 0;
@@ -112,7 +112,7 @@ DECLARE_EVENT_CLASS(
 	i40e_rx_template,
 
 	TP_PROTO(struct i40e_ring *ring,
-		 union i40e_32byte_rx_desc *desc,
+		 union i40e_16byte_rx_desc *desc,
 		 struct sk_buff *skb),
 
 	TP_ARGS(ring, desc, skb),
@@ -140,7 +140,7 @@ DECLARE_EVENT_CLASS(
 DEFINE_EVENT(
 	i40e_rx_template, i40e_clean_rx_irq,
 	TP_PROTO(struct i40e_ring *ring,
-		 union i40e_32byte_rx_desc *desc,
+		 union i40e_16byte_rx_desc *desc,
 		 struct sk_buff *skb),
 
 	TP_ARGS(ring, desc, skb));
@@ -148,7 +148,7 @@ DEFINE_EVENT(
 DEFINE_EVENT(
 	i40e_rx_template, i40e_clean_rx_irq_rx,
 	TP_PROTO(struct i40e_ring *ring,
-		 union i40e_32byte_rx_desc *desc,
+		 union i40e_16byte_rx_desc *desc,
 		 struct sk_buff *skb),
 
 	TP_ARGS(ring, desc, skb));
@@ -533,11 +533,11 @@ static void i40e_fd_handle_status(struct i40e_ring *rx_ring, u64 qword0_raw,
 {
 	struct i40e_pf *pf = rx_ring->vsi->back;
 	struct pci_dev *pdev = pf->pdev;
-	struct i40e_32b_rx_wb_qw0 *qw0;
+	struct i40e_16b_rx_wb_qw0 *qw0;
 	u32 fcnt_prog, fcnt_avail;
 	u32 error;
 
-	qw0 = (struct i40e_32b_rx_wb_qw0 *)&qword0_raw;
+	qw0 = (struct i40e_16b_rx_wb_qw0 *)&qword0_raw;
 	error = (qword1 & I40E_RX_PROG_STATUS_DESC_QW1_ERROR_MASK) >>
 		I40E_RX_PROG_STATUS_DESC_QW1_ERROR_SHIFT;
 
@@ -1418,7 +1418,7 @@ int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring)
 	u64_stats_init(&rx_ring->syncp);
 
 	/* Round up to nearest 4K */
-	rx_ring->size = rx_ring->count * sizeof(union i40e_32byte_rx_desc);
+	rx_ring->size = rx_ring->count * sizeof(union i40e_rx_desc);
 	rx_ring->size = ALIGN(rx_ring->size, 4096);
 	rx_ring->desc = dma_alloc_coherent(dev, rx_ring->size,
 					   &rx_ring->dma, GFP_KERNEL);
@@ -1953,7 +1953,7 @@ static struct i40e_rx_buffer *i40e_get_rx_buffer(struct i40e_ring *rx_ring,
 	struct i40e_rx_buffer *rx_buffer;
 
 	rx_buffer = i40e_rx_bi(rx_ring, rx_ring->next_to_clean);
-	prefetchw(rx_buffer->page);
+	prefetch_page_address(rx_buffer->page);
 
 	/* we are reusing so sync this buffer for CPU use */
 	dma_sync_single_range_for_cpu(rx_ring->dev,
@@ -2295,6 +2295,19 @@ void i40e_finalize_xdp_rx(struct i40e_ring *rx_ring, unsigned int xdp_res)
 	}
 }
 
+/**
+ * i40e_inc_ntc: Advance the next_to_clean index
+ * @rx_ring: Rx ring
+ **/
+static void i40e_inc_ntc(struct i40e_ring *rx_ring)
+{
+	u32 ntc = rx_ring->next_to_clean + 1;
+
+	ntc = (ntc < rx_ring->count) ? ntc : 0;
+	rx_ring->next_to_clean = ntc;
+	prefetch(I40E_RX_DESC(rx_ring, ntc));
+}
+
 /**
  * i40e_clean_rx_irq - Clean completed descriptors from Rx ring - bounce buf
  * @rx_ring: rx descriptor ring to transact packets on
@@ -110,7 +110,7 @@ enum i40e_dyn_idx_t {
  */
 #define I40E_RX_HDR_SIZE I40E_RXBUFFER_256
 #define I40E_PACKET_HDR_PAD (ETH_HLEN + ETH_FCS_LEN + (VLAN_HLEN * 2))
-#define i40e_rx_desc i40e_32byte_rx_desc
+#define i40e_rx_desc i40e_16byte_rx_desc
 
 #define I40E_RX_DMA_ATTR \
 	(DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING)
@@ -99,19 +99,6 @@ static inline bool i40e_rx_is_programming_status(u64 qword1)
 	return qword1 & I40E_RXD_QW1_LENGTH_SPH_MASK;
 }
 
-/**
- * i40e_inc_ntc: Advance the next_to_clean index
- * @rx_ring: Rx ring
- **/
-static inline void i40e_inc_ntc(struct i40e_ring *rx_ring)
-{
-	u32 ntc = rx_ring->next_to_clean + 1;
-
-	ntc = (ntc < rx_ring->count) ? ntc : 0;
-	rx_ring->next_to_clean = ntc;
-	prefetch(I40E_RX_DESC(rx_ring, ntc));
-}
-
 void i40e_xsk_clean_rx_ring(struct i40e_ring *rx_ring);
 void i40e_xsk_clean_tx_ring(struct i40e_ring *tx_ring);
 bool i40e_xsk_any_rx_ring_enabled(struct i40e_vsi *vsi);
@@ -628,7 +628,7 @@ union i40e_16byte_rx_desc {
 		__le64 hdr_addr; /* Header buffer address */
 	} read;
 	struct {
-		struct {
+		struct i40e_16b_rx_wb_qw0 {
 			struct {
 				union {
 					__le16 mirroring_status;
@@ -647,6 +647,9 @@ union i40e_16byte_rx_desc {
 			__le64 status_error_len;
 		} qword1;
 	} wb; /* writeback */
+	struct {
+		u64 qword[2];
+	} raw;
 };
 
 union i40e_32byte_rx_desc {
@@ -257,6 +257,18 @@ static struct sk_buff *i40e_construct_skb_zc(struct i40e_ring *rx_ring,
 	return skb;
 }
 
+/**
+ * i40e_inc_ntc: Advance the next_to_clean index
+ * @rx_ring: Rx ring
+ **/
+static void i40e_inc_ntc(struct i40e_ring *rx_ring)
+{
+	u32 ntc = rx_ring->next_to_clean + 1;
+
+	ntc = (ntc < rx_ring->count) ? ntc : 0;
+	rx_ring->next_to_clean = ntc;
+}
+
 /**
  * i40e_clean_rx_irq_zc - Consumes Rx packets from the hardware ring
  * @rx_ring: Rx ring
@@ -269,8 +281,8 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
 	unsigned int total_rx_bytes = 0, total_rx_packets = 0;
 	u16 cleaned_count = I40E_DESC_UNUSED(rx_ring);
 	unsigned int xdp_res, xdp_xmit = 0;
-	bool failure = false;
 	struct sk_buff *skb;
+	bool failure;
 
 	while (likely(total_rx_packets < (unsigned int)budget)) {
 		union i40e_rx_desc *rx_desc;
@@ -278,13 +290,6 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
 		unsigned int size;
 		u64 qword;
 
-		if (cleaned_count >= I40E_RX_BUFFER_WRITE) {
-			failure = failure ||
-				  !i40e_alloc_rx_buffers_zc(rx_ring,
-							    cleaned_count);
-			cleaned_count = 0;
-		}
-
 		rx_desc = I40E_RX_DESC(rx_ring, rx_ring->next_to_clean);
 		qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
 
@@ -359,6 +364,9 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
 		napi_gro_receive(&rx_ring->q_vector->napi, skb);
 	}
 
+	if (cleaned_count >= I40E_RX_BUFFER_WRITE)
+		failure = !i40e_alloc_rx_buffers_zc(rx_ring, cleaned_count);
+
 	i40e_finalize_xdp_rx(rx_ring, xdp_xmit);
 	i40e_update_rx_stats(rx_ring, total_rx_bytes, total_rx_packets);
 
@@ -15,6 +15,7 @@
 #include <asm/processor.h>
 #include <asm/cache.h>
 
+struct page;
 /*
 	prefetch(x) attempts to pre-emptively get the memory pointed to
 	by address "x" into the CPU L1 cache.
@@ -62,4 +63,11 @@ static inline void prefetch_range(void *addr, size_t len)
 #endif
 }
 
+static inline void prefetch_page_address(struct page *page)
+{
+#if defined(WANT_PAGE_VIRTUAL) || defined(HASHED_PAGE_VIRTUAL)
+	prefetch(page);
+#endif
+}
+
 #endif