Merge branch 'net-stmmac-CPU-Performance-Improvements'

Jose Abreu says:

====================
net: stmmac: CPU Performance Improvements

CPU Performance improvements for stmmac. Please check bellow for results
before and after the series.

Patch 1/7, allows RX Interrupt on Completion to be disabled and only use the
RX HW Watchdog.

Patch 2/7, setups the default RX coalesce settings instead of using the
minimum value.

Patch 3/7 and 4/7, removes the uneeded computations for RX Flow Control
activation/de-activation, on some cases.

Patch 5/7, tunes-up the default coalesce settings.

Patch 6/7, re-works the TX coalesce timer activation logic.

Patch 7/7, removes the now uneeded TBU interrupt.

NetPerf UDP Results:
--------------------

Socket  Message  Elapsed      Messages                   CPU      Service
Size    Size     Time         Okay Errors   Throughput   Util     Demand
bytes   bytes    secs            #      #   10^6bits/sec % SS     us/KB
--- XGMAC@2.5G: Before
212992    1400   10.00     2100620      0     2351.7     36.69    5.112
212992           10.00     2100539            2351.6     26.18    3.648
--- XGMAC@2.5G: After
212992    1400   10.00     2108972      0     2361.5     21.73    3.015
212992           10.00     2097038            2348.1     19.21    2.666

--- GMAC5@1G: Before
212992    1400   10.00      786000      0      880.2     34.71    12.923
212992           10.00      786000             880.2     23.42    8.719
--- GMAC5@1G: After
212992    1400   10.00      842648      0      943.7     14.12    4.903
212992           10.00      842648             943.7     12.73    4.418

Perf TCP Results on RX Path:
----------------------------
--- XGMAC@2.5G: Before
22.51%  swapper          [stmmac]           [k] dwxgmac2_dma_interrupt
10.82%  swapper          [stmmac]           [k] dwxgmac2_host_mtl_irq_status
 5.21%  swapper          [stmmac]           [k] dwxgmac2_host_irq_status
 4.67%  swapper          [stmmac]           [k] dwxgmac3_safety_feat_irq_status
 3.63%  swapper          [kernel.kallsyms]  [k] stack_trace_consume_entry
 2.74%  iperf3           [kernel.kallsyms]  [k] copy_user_enhanced_fast_string
 2.52%  swapper          [kernel.kallsyms]  [k] update_stack_state
 1.94%  ksoftirqd/0      [stmmac]           [k] dwxgmac2_dma_interrupt
 1.45%  iperf3           [kernel.kallsyms]  [k] queued_spin_lock_slowpath
 1.26%  swapper          [kernel.kallsyms]  [k] create_object
--- XGMAC@2.5G: After
 7.43%  swapper          [kernel.kallsyms]   [k] stack_trace_consume_entry
 5.86%  swapper          [stmmac]            [k] dwxgmac2_dma_interrupt
 5.68%  swapper          [kernel.kallsyms]   [k] update_stack_state
 4.71%  iperf3           [kernel.kallsyms]   [k] copy_user_enhanced_fast_string
 2.88%  swapper          [kernel.kallsyms]   [k] create_object
 2.69%  swapper          [stmmac]            [k] dwxgmac2_host_mtl_irq_status
 2.61%  swapper          [stmmac]            [k] stmmac_napi_poll_rx
 2.52%  swapper          [kernel.kallsyms]   [k] unwind_next_frame.part.4
 1.48%  swapper          [kernel.kallsyms]   [k] unwind_get_return_address
 1.38%  swapper          [kernel.kallsyms]   [k] arch_stack_walk

--- GMAC5@1G: Before
31.29%  swapper          [stmmac]           [k] dwmac4_dma_interrupt
14.57%  swapper          [stmmac]           [k] dwmac4_irq_mtl_status
10.66%  swapper          [stmmac]           [k] dwmac4_irq_status
 1.97%  swapper          [kernel.kallsyms]  [k] stack_trace_consume_entry
 1.73%  iperf3           [kernel.kallsyms]  [k] copy_user_enhanced_fast_string
 1.59%  swapper          [kernel.kallsyms]  [k] update_stack_state
 1.15%  iperf3           [kernel.kallsyms]  [k] do_syscall_64
 1.01%  ksoftirqd/0      [stmmac]           [k] dwmac4_dma_interrupt
 0.89%  swapper          [kernel.kallsyms]  [k] __default_send_IPI_dest_field
 0.75%  swapper          [stmmac]           [k] stmmac_napi_poll_rx
--- GMAC5@1G: After
 6.70%  swapper          [kernel.kallsyms]   [k] stack_trace_consume_entry
 5.79%  swapper          [stmmac]            [k] dwmac4_dma_interrupt
 5.29%  swapper          [kernel.kallsyms]   [k] update_stack_state
 3.52%  iperf3           [kernel.kallsyms]   [k] copy_user_enhanced_fast_string
 2.83%  swapper          [stmmac]            [k] dwmac4_irq_mtl_status
 2.62%  swapper          [kernel.kallsyms]   [k] create_object
 2.46%  swapper          [stmmac]            [k] stmmac_napi_poll_rx
 2.32%  swapper          [kernel.kallsyms]   [k] unwind_next_frame.part.4
 2.19%  swapper          [stmmac]            [k] dwmac4_irq_status
 1.39%  swapper          [kernel.kallsyms]   [k] unwind_get_return_address
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
David S. Miller 2019-11-15 12:25:42 -08:00
commit 43da44c876
5 changed files with 59 additions and 50 deletions

View File

@ -248,12 +248,13 @@ struct stmmac_safety_stats {
/* Max/Min RI Watchdog Timer count value */
#define MAX_DMA_RIWT 0xff
#define MIN_DMA_RIWT 0x10
#define DEF_DMA_RIWT 0xa0
/* Tx coalesce parameters */
#define STMMAC_COAL_TX_TIMER 1000
#define STMMAC_MAX_COAL_TX_TICK 100000
#define STMMAC_TX_MAX_FRAMES 256
#define STMMAC_TX_FRAMES 1
#define STMMAC_RX_FRAMES 25
#define STMMAC_TX_FRAMES 25
#define STMMAC_RX_FRAMES 0
/* Packets types */
enum packets_types {

View File

@ -252,19 +252,9 @@ static void dwmac4_dma_rx_chan_op_mode(void __iomem *ioaddr, int mode,
rfa = 0x01; /* Full-1.5K */
break;
case 8192:
rfd = 0x06; /* Full-4K */
rfa = 0x0a; /* Full-6K */
break;
case 16384:
rfd = 0x06; /* Full-4K */
rfa = 0x12; /* Full-10K */
break;
default:
rfd = 0x06; /* Full-4K */
rfa = 0x1e; /* Full-16K */
rfd = 0x07; /* Full-4.5K */
rfa = 0x04; /* Full-3K */
break;
}

View File

@ -360,7 +360,7 @@
#define XGMAC_TBUE BIT(2)
#define XGMAC_TIE BIT(0)
#define XGMAC_DMA_INT_DEFAULT_EN (XGMAC_NIE | XGMAC_AIE | XGMAC_RBUE | \
XGMAC_RIE | XGMAC_TBUE | XGMAC_TIE)
XGMAC_RIE | XGMAC_TIE)
#define XGMAC_DMA_CH_Rx_WATCHDOG(x) (0x0000313c + (0x80 * (x)))
#define XGMAC_RWT GENMASK(7, 0)
#define XGMAC_DMA_CH_STATUS(x) (0x00003160 + (0x80 * (x)))

View File

@ -183,19 +183,9 @@ static void dwxgmac2_dma_rx_mode(void __iomem *ioaddr, int mode,
rfa = 0x01; /* Full-1.5K */
break;
case 8192:
rfd = 0x06; /* Full-4K */
rfa = 0x0a; /* Full-6K */
break;
case 16384:
rfd = 0x06; /* Full-4K */
rfa = 0x12; /* Full-10K */
break;
default:
rfd = 0x06; /* Full-4K */
rfa = 0x1e; /* Full-16K */
rfd = 0x07; /* Full-4.5K */
rfa = 0x04; /* Full-3K */
break;
}

View File

@ -2605,9 +2605,10 @@ static int stmmac_hw_setup(struct net_device *dev, bool init_ptp)
priv->tx_lpi_timer = STMMAC_DEFAULT_TWT_LS;
if (priv->use_riwt) {
ret = stmmac_rx_watchdog(priv, priv->ioaddr, MIN_DMA_RIWT, rx_cnt);
if (!ret)
priv->rx_riwt = MIN_DMA_RIWT;
if (!priv->rx_riwt)
priv->rx_riwt = DEF_DMA_RIWT;
ret = stmmac_rx_watchdog(priv, priv->ioaddr, priv->rx_riwt, rx_cnt);
}
if (priv->hw->pcs)
@ -2915,16 +2916,17 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev)
struct stmmac_priv *priv = netdev_priv(dev);
int nfrags = skb_shinfo(skb)->nr_frags;
u32 queue = skb_get_queue_mapping(skb);
unsigned int first_entry, tx_packets;
int tmp_pay_len = 0, first_tx;
struct stmmac_tx_queue *tx_q;
unsigned int first_entry;
u8 proto_hdr_len, hdr;
int tmp_pay_len = 0;
bool has_vlan, set_ic;
u32 pay_len, mss;
dma_addr_t des;
bool has_vlan;
int i;
tx_q = &priv->tx_queue[queue];
first_tx = tx_q->cur_tx;
/* Compute header lengths */
if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) {
@ -3032,16 +3034,27 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev)
tx_q->tx_skbuff[tx_q->cur_tx] = skb;
/* Manage tx mitigation */
tx_q->tx_count_frames += nfrags + 1;
if (likely(priv->tx_coal_frames > tx_q->tx_count_frames) &&
!((skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) &&
priv->hwts_tx_en)) {
stmmac_tx_timer_arm(priv, queue);
} else {
tx_packets = (tx_q->cur_tx + 1) - first_tx;
tx_q->tx_count_frames += tx_packets;
if ((skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) && priv->hwts_tx_en)
set_ic = true;
else if (!priv->tx_coal_frames)
set_ic = false;
else if (tx_packets > priv->tx_coal_frames)
set_ic = true;
else if ((tx_q->tx_count_frames % priv->tx_coal_frames) < tx_packets)
set_ic = true;
else
set_ic = false;
if (set_ic) {
desc = &tx_q->dma_tx[tx_q->cur_tx];
tx_q->tx_count_frames = 0;
stmmac_set_tx_ic(priv, desc);
priv->xstats.tx_set_ic_bit++;
} else {
stmmac_tx_timer_arm(priv, queue);
}
/* We've used all descriptors we need for this skb, however,
@ -3132,6 +3145,7 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev)
*/
static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
{
unsigned int first_entry, tx_packets, enh_desc;
struct stmmac_priv *priv = netdev_priv(dev);
unsigned int nopaged_len = skb_headlen(skb);
int i, csum_insertion = 0, is_jumbo = 0;
@ -3140,13 +3154,12 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
int gso = skb_shinfo(skb)->gso_type;
struct dma_desc *desc, *first;
struct stmmac_tx_queue *tx_q;
unsigned int first_entry;
unsigned int enh_desc;
bool has_vlan, set_ic;
int entry, first_tx;
dma_addr_t des;
bool has_vlan;
int entry;
tx_q = &priv->tx_queue[queue];
first_tx = tx_q->cur_tx;
if (priv->tx_path_in_lpi_mode)
stmmac_disable_eee_mode(priv);
@ -3240,12 +3253,21 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
* This approach takes care about the fragments: desc is the first
* element in case of no SG.
*/
tx_q->tx_count_frames += nfrags + 1;
if (likely(priv->tx_coal_frames > tx_q->tx_count_frames) &&
!((skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) &&
priv->hwts_tx_en)) {
stmmac_tx_timer_arm(priv, queue);
} else {
tx_packets = (entry + 1) - first_tx;
tx_q->tx_count_frames += tx_packets;
if ((skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) && priv->hwts_tx_en)
set_ic = true;
else if (!priv->tx_coal_frames)
set_ic = false;
else if (tx_packets > priv->tx_coal_frames)
set_ic = true;
else if ((tx_q->tx_count_frames % priv->tx_coal_frames) < tx_packets)
set_ic = true;
else
set_ic = false;
if (set_ic) {
if (likely(priv->extend_desc))
desc = &tx_q->dma_etx[entry].basic;
else
@ -3254,6 +3276,8 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
tx_q->tx_count_frames = 0;
stmmac_set_tx_ic(priv, desc);
priv->xstats.tx_set_ic_bit++;
} else {
stmmac_tx_timer_arm(priv, queue);
}
/* We've used all descriptors we need for this skb, however,
@ -3440,7 +3464,11 @@ static inline void stmmac_rx_refill(struct stmmac_priv *priv, u32 queue)
rx_q->rx_count_frames += priv->rx_coal_frames;
if (rx_q->rx_count_frames > priv->rx_coal_frames)
rx_q->rx_count_frames = 0;
use_rx_wd = priv->use_riwt && rx_q->rx_count_frames;
use_rx_wd = !priv->rx_coal_frames;
use_rx_wd |= rx_q->rx_count_frames > 0;
if (!priv->use_riwt)
use_rx_wd = false;
dma_wmb();
stmmac_set_rx_owner(priv, p, use_rx_wd);