Merge branch 'net-driver-barriers'

Sinan Kaya says:

====================
netdev: Eliminate duplicate barriers on weakly-ordered archs

The code includes wmb() followed by writel() in multiple places. writel()
already carries a barrier on some architectures, such as arm64.

This ends up with the CPU observing two barriers back to back before
executing the register write.
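For illustration, on arm64 the ordered accessor is essentially the relaxed
accessor preceded by a write barrier (a simplified sketch of the arm64
definition, not the exact kernel source):

        /* simplified: writel() = write barrier + relaxed write */
        #define writel(v, c)    ({ __iowmb(); writel_relaxed((v), (c)); })

So a driver that issues its own wmb() and then calls writel() pays for two
barriers.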

Since the code already has an explicit barrier call, change writel() to
writel_relaxed().
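The resulting pattern across the series looks like this (an illustrative
sketch with a hypothetical ring structure and doorbell register, not code
from any one driver):

        /* before: the explicit wmb() and writel() each imply a barrier */
        wmb();
        writel(val, ring->doorbell);

        /* after: keep the explicit wmb(), use the relaxed write, and add
         * mmiowb() where ordering of MMIO across CPUs is still required
         */
        wmb();
        writel_relaxed(val, ring->doorbell);
        mmiowb();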

I did a regex search for wmb() followed by writel() in each driver's
directory and scrubbed the ones I care about in this series.

I considered "ease of change", "popular usage" and "performance critical
path" as the determining criteria for my filtering.

We have used the relaxed API heavily on ARM for a long time, but it did
not exist on other architectures. For this reason, weakly-ordered
architectures have been paying a double penalty in order to use the common
drivers.

Now that the relaxed API is present on all architectures, we can go and
scrub all drivers to see what needs to change and what can remain.

We start with the most heavily used drivers and hope to increase the
coverage over time. It will take a while to cover all drivers.

Feel free to apply patches individually.

Changes since v6:
- bring back Amazon ENA and add mmiowb(), remove
  ena_com_write_sq_doorbell_rel()
- remove the extra mmiowb() in bnx2x
- correct a spelling mistake in "bnx2x: Replace doorbell barrier() with wmb()"
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
Commit 90c00f8154 (David S. Miller, 2018-03-26 12:47:57 -04:00)
15 changed files with 68 additions and 24 deletions

drivers/net/ethernet/amazon/ena/ena_com.c

@@ -631,8 +631,10 @@ static u32 ena_com_reg_bar_read32(struct ena_com_dev *ena_dev, u16 offset)
 	 */
 	wmb();
-	writel(mmio_read_reg, ena_dev->reg_bar + ENA_REGS_MMIO_REG_READ_OFF);
+	writel_relaxed(mmio_read_reg,
+		       ena_dev->reg_bar + ENA_REGS_MMIO_REG_READ_OFF);
+	mmiowb();
 
 	for (i = 0; i < timeout; i++) {
 		if (read_resp->req_id == mmio_read->seq_num)
 			break;
@@ -1826,7 +1828,9 @@ void ena_com_aenq_intr_handler(struct ena_com_dev *dev, void *data)
 	/* write the aenq doorbell after all AENQ descriptors were read */
 	mb();
-	writel((u32)aenq->head, dev->reg_bar + ENA_REGS_AENQ_HEAD_DB_OFF);
+	writel_relaxed((u32)aenq->head,
+		       dev->reg_bar + ENA_REGS_AENQ_HEAD_DB_OFF);
+	mmiowb();
 }
 
 int ena_com_dev_reset(struct ena_com_dev *ena_dev,

drivers/net/ethernet/amazon/ena/ena_eth_com.h

@@ -107,7 +107,8 @@ static inline int ena_com_sq_empty_space(struct ena_com_io_sq *io_sq)
 	return io_sq->q_depth - 1 - cnt;
 }
 
-static inline int ena_com_write_sq_doorbell(struct ena_com_io_sq *io_sq)
+static inline int ena_com_write_sq_doorbell(struct ena_com_io_sq *io_sq,
+					    bool relaxed)
 {
 	u16 tail;
@@ -116,7 +117,10 @@ static inline int ena_com_write_sq_doorbell(struct ena_com_io_sq *io_sq)
 	pr_debug("write submission queue doorbell for queue: %d tail: %d\n",
 		 io_sq->qid, tail);
 
-	writel(tail, io_sq->db_addr);
+	if (relaxed)
+		writel_relaxed(tail, io_sq->db_addr);
+	else
+		writel(tail, io_sq->db_addr);
 
 	return 0;
 }

drivers/net/ethernet/amazon/ena/ena_netdev.c

@@ -556,7 +556,8 @@ static int ena_refill_rx_bufs(struct ena_ring *rx_ring, u32 num)
		 * issue a doorbell
		 */
		wmb();
-		ena_com_write_sq_doorbell(rx_ring->ena_com_io_sq);
+		ena_com_write_sq_doorbell(rx_ring->ena_com_io_sq, true);
+		mmiowb();
	}
 
	rx_ring->next_to_use = next_to_use;
@@ -2151,7 +2152,7 @@ static netdev_tx_t ena_start_xmit(struct sk_buff *skb, struct net_device *dev)
	if (netif_xmit_stopped(txq) || !skb->xmit_more) {
		/* trigger the dma engine */
-		ena_com_write_sq_doorbell(tx_ring->ena_com_io_sq);
+		ena_com_write_sq_doorbell(tx_ring->ena_com_io_sq, false);
		u64_stats_update_begin(&tx_ring->syncp);
		tx_ring->tx_stats.doorbells++;
		u64_stats_update_end(&tx_ring->syncp);

drivers/net/ethernet/broadcom/bnx2x/bnx2x.h

@@ -166,6 +166,12 @@ do { \
 #define REG_RD8(bp, offset)	readb(REG_ADDR(bp, offset))
 #define REG_RD16(bp, offset)	readw(REG_ADDR(bp, offset))
 
+#define REG_WR_RELAXED(bp, offset, val) \
+	writel_relaxed((u32)val, REG_ADDR(bp, offset))
+
+#define REG_WR16_RELAXED(bp, offset, val) \
+	writew_relaxed((u16)val, REG_ADDR(bp, offset))
+
 #define REG_WR(bp, offset, val)	writel((u32)val, REG_ADDR(bp, offset))
 #define REG_WR8(bp, offset, val)	writeb((u8)val, REG_ADDR(bp, offset))
 #define REG_WR16(bp, offset, val)	writew((u16)val, REG_ADDR(bp, offset))
@@ -758,10 +764,8 @@ struct bnx2x_fastpath {
 #if (BNX2X_DB_SHIFT < BNX2X_DB_MIN_SHIFT)
 #error "Min DB doorbell stride is 8"
 #endif
-#define DOORBELL(bp, cid, val) \
-	do { \
-		writel((u32)(val), bp->doorbells + (bp->db_size * (cid))); \
-	} while (0)
+#define DOORBELL_RELAXED(bp, cid, val) \
+	writel_relaxed((u32)(val), (bp)->doorbells + ((bp)->db_size * (cid)))
 
 /* TX CSUM helpers */
 #define SKB_CS_OFF(skb)	(offsetof(struct tcphdr, check) - \

drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c

@@ -4153,9 +4153,10 @@ netdev_tx_t bnx2x_start_xmit(struct sk_buff *skb, struct net_device *dev)
	wmb();
 
	txdata->tx_db.data.prod += nbd;
-	barrier();
+	/* make sure descriptor update is observed by HW */
+	wmb();
 
-	DOORBELL(bp, txdata->cid, txdata->tx_db.raw);
+	DOORBELL_RELAXED(bp, txdata->cid, txdata->tx_db.raw);
 
	mmiowb();

drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h

@@ -522,8 +522,8 @@ static inline void bnx2x_update_rx_prod(struct bnx2x *bp,
	wmb();
 
	for (i = 0; i < sizeof(rx_prods)/4; i++)
-		REG_WR(bp, fp->ustorm_rx_prods_offset + i*4,
-		       ((u32 *)&rx_prods)[i]);
+		REG_WR_RELAXED(bp, fp->ustorm_rx_prods_offset + i * 4,
+			       ((u32 *)&rx_prods)[i]);
 
	mmiowb(); /* keep prod updates ordered */

drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c

@@ -2591,8 +2591,9 @@ static int bnx2x_run_loopback(struct bnx2x *bp, int loopback_mode)
	wmb();
 
	txdata->tx_db.data.prod += 2;
-	barrier();
-	DOORBELL(bp, txdata->cid, txdata->tx_db.raw);
+	/* make sure descriptor update is observed by the HW */
+	wmb();
+	DOORBELL_RELAXED(bp, txdata->cid, txdata->tx_db.raw);
 
	mmiowb();
	barrier();

drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c

@@ -3817,8 +3817,8 @@ static void bnx2x_sp_prod_update(struct bnx2x *bp)
	 */
	mb();
 
-	REG_WR16(bp, BAR_XSTRORM_INTMEM + XSTORM_SPQ_PROD_OFFSET(func),
-		 bp->spq_prod_idx);
+	REG_WR16_RELAXED(bp, BAR_XSTRORM_INTMEM + XSTORM_SPQ_PROD_OFFSET(func),
+			 bp->spq_prod_idx);
	mmiowb();
 }

drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c

@@ -170,7 +170,9 @@ static int bnx2x_send_msg2pf(struct bnx2x *bp, u8 *done, dma_addr_t msg_mapping)
	wmb();
 
	/* Trigger the PF FW */
-	writeb(1, &zone_data->trigger.vf_pf_channel.addr_valid);
+	writeb_relaxed(1, &zone_data->trigger.vf_pf_channel.addr_valid);
+
+	mmiowb();
 
	/* Wait for PF to complete */
	while ((tout >= 0) && (!*done)) {

drivers/net/ethernet/broadcom/bnxt/bnxt.c

@@ -1922,7 +1922,7 @@ static int bnxt_poll_work(struct bnxt *bp, struct bnxt_napi *bnapi, int budget)
		/* Sync BD data before updating doorbell */
		wmb();
 
-		bnxt_db_write(bp, db, DB_KEY_TX | prod);
+		bnxt_db_write_relaxed(bp, db, DB_KEY_TX | prod);
	}
 
	cpr->cp_raw_cons = raw_cons;

drivers/net/ethernet/broadcom/bnxt/bnxt.h

@@ -1402,6 +1402,15 @@ static inline u32 bnxt_tx_avail(struct bnxt *bp, struct bnxt_tx_ring_info *txr)
	       ((txr->tx_prod - txr->tx_cons) & bp->tx_ring_mask);
 }
 
+/* For TX and RX ring doorbells with no ordering guarantee*/
+static inline void bnxt_db_write_relaxed(struct bnxt *bp, void __iomem *db,
+					 u32 val)
+{
+	writel_relaxed(val, db);
+	if (bp->flags & BNXT_FLAG_DOUBLE_DB)
+		writel_relaxed(val, db);
+}
+
 /* For TX and RX ring doorbells */
 static inline void bnxt_db_write(struct bnxt *bp, void __iomem *db, u32 val)
 {

drivers/net/ethernet/qlogic/qla3xxx.c

@@ -1858,8 +1858,9 @@ static void ql_update_small_bufq_prod_index(struct ql3_adapter *qdev)
			qdev->small_buf_release_cnt -= 8;
		}
		wmb();
-		writel(qdev->small_buf_q_producer_index,
-		       &port_regs->CommonRegs.rxSmallQProducerIndex);
+		writel_relaxed(qdev->small_buf_q_producer_index,
+			       &port_regs->CommonRegs.rxSmallQProducerIndex);
+		mmiowb();
	}
 }

drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c

@@ -478,7 +478,7 @@ irqreturn_t qlcnic_83xx_clear_legacy_intr(struct qlcnic_adapter *adapter)
	wmb();
 
	/* clear the interrupt trigger control register */
-	writel(0, adapter->isr_int_vec);
+	writel_relaxed(0, adapter->isr_int_vec);
	intr_val = readl(adapter->isr_int_vec);
	do {
		intr_val = readl(adapter->tgt_status_reg);

drivers/net/ethernet/qlogic/qlge/qlge.h

@@ -2184,6 +2184,22 @@ static inline void ql_write_db_reg(u32 val, void __iomem *addr)
	mmiowb();
 }
 
+/*
+ * Doorbell Registers:
+ * Doorbell registers are virtual registers in the PCI memory space.
+ * The space is allocated by the chip during PCI initialization. The
+ * device driver finds the doorbell address in BAR 3 in PCI config space.
+ * The registers are used to control outbound and inbound queues. For
+ * example, the producer index for an outbound queue. Each queue uses
+ * 1 4k chunk of memory. The lower half of the space is for outbound
+ * queues. The upper half is for inbound queues.
+ * Caller has to guarantee ordering.
+ */
+static inline void ql_write_db_reg_relaxed(u32 val, void __iomem *addr)
+{
+	writel_relaxed(val, addr);
+}
+
 /*
  * Shadow Registers:
  * Outbound queues have a consumer index that is maintained by the chip.

drivers/net/ethernet/qlogic/qlge/qlge_main.c

@@ -2700,7 +2700,8 @@ static netdev_tx_t qlge_send(struct sk_buff *skb, struct net_device *ndev)
		tx_ring->prod_idx = 0;
	wmb();
 
-	ql_write_db_reg(tx_ring->prod_idx, tx_ring->prod_idx_db_reg);
+	ql_write_db_reg_relaxed(tx_ring->prod_idx, tx_ring->prod_idx_db_reg);
+	mmiowb();
	netif_printk(qdev, tx_queued, KERN_DEBUG, qdev->ndev,
		     "tx queued, slot %d, len %d\n",
		     tx_ring->prod_idx, skb->len);