From 4da402597c2b75c4b636ab2416baf921b363b325 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Sun, 30 Sep 2018 15:06:06 +0800 Subject: [PATCH 01/49] xfrm: fix gro_cells leak when remove virtual xfrm interfaces The device gro_cells has been initialized, it should be freed, otherwise it will be leaked Fixes: f203b76d78092faf2 ("xfrm: Add virtual xfrm interfaces") Signed-off-by: Zhang Yu Signed-off-by: Li RongQing Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_interface.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index 31acc6f33d98..6f05e831a73e 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -116,6 +116,9 @@ static void xfrmi_unlink(struct xfrmi_net *xfrmn, struct xfrm_if *xi) static void xfrmi_dev_free(struct net_device *dev) { + struct xfrm_if *xi = netdev_priv(dev); + + gro_cells_destroy(&xi->gro_cells); free_percpu(dev->tstats); } From 5f78aec0d7e9f239d64b2cd1c9e671345ff92f94 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Thu, 4 Oct 2018 07:21:48 +0200 Subject: [PATCH 02/49] MAINTAINERS: Remove net/core/flow.c net/core/flow.c does not exist anymore, so remove it from the IPSEC NETWORKING section of the MAINTAINERS file. Signed-off-by: Steffen Klassert --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index dcb0191c4f54..4ff21dac9b45 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10130,7 +10130,6 @@ L: netdev@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec.git T: git git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec-next.git S: Maintained -F: net/core/flow.c F: net/xfrm/ F: net/key/ F: net/ipv4/xfrm* From 262f9d811c7608f1e74258ceecfe1fa213bdf912 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 5 Oct 2018 19:38:46 -0700 Subject: [PATCH 03/49] bpf: do not blindly change rlimit in reuseport net selftest If the current process has unlimited RLIMIT_MEMLOCK, we should should leave it as is. Fixes: 941ff6f11c02 ("bpf: fix rlimit in reuseport net selftest") Signed-off-by: John Sperbeck Signed-off-by: Eric Dumazet Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- tools/testing/selftests/net/reuseport_bpf.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/net/reuseport_bpf.c b/tools/testing/selftests/net/reuseport_bpf.c index cad14cd0ea92..b5277106df1f 100644 --- a/tools/testing/selftests/net/reuseport_bpf.c +++ b/tools/testing/selftests/net/reuseport_bpf.c @@ -437,14 +437,19 @@ void enable_fastopen(void) } } -static struct rlimit rlim_old, rlim_new; +static struct rlimit rlim_old; static __attribute__((constructor)) void main_ctor(void) { getrlimit(RLIMIT_MEMLOCK, &rlim_old); - rlim_new.rlim_cur = rlim_old.rlim_cur + (1UL << 20); - rlim_new.rlim_max = rlim_old.rlim_max + (1UL << 20); - setrlimit(RLIMIT_MEMLOCK, &rlim_new); + + if (rlim_old.rlim_cur != RLIM_INFINITY) { + struct rlimit rlim_new; + + rlim_new.rlim_cur = rlim_old.rlim_cur + (1UL << 20); + rlim_new.rlim_max = rlim_old.rlim_max + (1UL << 20); + setrlimit(RLIMIT_MEMLOCK, &rlim_new); + } } static __attribute__((destructor)) void main_dtor(void) From fd7e848077c1a466b9187537adce16658f7cb94b Mon Sep 17 00:00:00 2001 From: Talat Batheesh Date: Thu, 30 Aug 2018 16:31:52 +0300 Subject: [PATCH 04/49] net/mlx5: Fix memory leak when setting fpga ipsec caps Allocated memory for context should be freed once finished working with it. Fixes: d6c4f0298cec ("net/mlx5: Refactor accel IPSec code") Signed-off-by: Talat Batheesh Reviewed-by: Or Gerlitz Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c index 5645a4facad2..b8ee9101c506 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c @@ -245,7 +245,7 @@ static void *mlx5_fpga_ipsec_cmd_exec(struct mlx5_core_dev *mdev, return ERR_PTR(res); } - /* Context will be freed by wait func after completion */ + /* Context should be freed by the caller after completion. */ return context; } @@ -418,10 +418,8 @@ static int mlx5_fpga_ipsec_set_caps(struct mlx5_core_dev *mdev, u32 flags) cmd.cmd = htonl(MLX5_FPGA_IPSEC_CMD_OP_SET_CAP); cmd.flags = htonl(flags); context = mlx5_fpga_ipsec_cmd_exec(mdev, &cmd, sizeof(cmd)); - if (IS_ERR(context)) { - err = PTR_ERR(context); - goto out; - } + if (IS_ERR(context)) + return PTR_ERR(context); err = mlx5_fpga_ipsec_cmd_wait(context); if (err) @@ -435,6 +433,7 @@ static int mlx5_fpga_ipsec_set_caps(struct mlx5_core_dev *mdev, u32 flags) } out: + kfree(context); return err; } From a48bc513159d4767f9988f0d857b2b0c38a4d614 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Tue, 11 Sep 2018 14:58:22 -0500 Subject: [PATCH 05/49] net/mlx5: Take only bit 24-26 of wqe.pftype_wq for page fault type The HW spec defines only bits 24-26 of pftype_wq as the page fault type, use the required mask to ensure that. Fixes: d9aaed838765 ("{net,IB}/mlx5: Refactor page fault handling") Signed-off-by: Huy Nguyen Signed-off-by: Eli Cohen Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/eq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index 48864f4988a4..c1e1a16a9b07 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -273,7 +273,7 @@ static void eq_pf_process(struct mlx5_eq *eq) case MLX5_PFAULT_SUBTYPE_WQE: /* WQE based event */ pfault->type = - be32_to_cpu(pf_eqe->wqe.pftype_wq) >> 24; + (be32_to_cpu(pf_eqe->wqe.pftype_wq) >> 24) & 0x7; pfault->token = be32_to_cpu(pf_eqe->wqe.token); pfault->wqe.wq_num = From 37fdffb217a45609edccbb8b407d031143f551c0 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Tue, 21 Aug 2018 14:41:41 +0300 Subject: [PATCH 06/49] net/mlx5: WQ, fixes for fragmented WQ buffers API mlx5e netdevice used to calculate fragment edges by a call to mlx5_wq_cyc_get_frag_size(). This calculation did not give the correct indication for queues smaller than a PAGE_SIZE, (broken by default on PowerPC, where PAGE_SIZE == 64KB). Here it is replaced by the correct new calls/API. Since (TX/RX) Work Queues buffers are fragmented, here we introduce changes to the API in core driver, so that it gets a stride index and returns the index of last stride on same fragment, and an additional wrapping function that returns the number of physically contiguous strides that can be written contiguously to the work queue. This obsoletes the following API functions, and their buggy usage in EN driver: * mlx5_wq_cyc_get_frag_size() * mlx5_wq_cyc_ctr2fragix() The new API improves modularity and hides the details of such calculation for mlx5e netdevice and mlx5_ib rdma drivers. New calculation is also more efficient, and improves performance as follows: Packet rate test: pktgen, UDP / IPv4, 64byte, single ring, 8K ring size. Before: 16,477,619 pps After: 17,085,793 pps 3.7% improvement Fixes: 3a2f70331226 ("net/mlx5: Use order-0 allocations for all WQ types") Signed-off-by: Tariq Toukan Reviewed-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/en_rx.c | 12 +++++----- .../net/ethernet/mellanox/mlx5/core/en_tx.c | 22 +++++++++---------- .../ethernet/mellanox/mlx5/core/ipoib/ipoib.h | 5 ++--- drivers/net/ethernet/mellanox/mlx5/core/wq.c | 5 ----- drivers/net/ethernet/mellanox/mlx5/core/wq.h | 11 +++++----- include/linux/mlx5/driver.h | 8 +++++++ 6 files changed, 31 insertions(+), 32 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 15d8ae28c040..00172dee5339 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -432,10 +432,9 @@ static inline u16 mlx5e_icosq_wrap_cnt(struct mlx5e_icosq *sq) static inline void mlx5e_fill_icosq_frag_edge(struct mlx5e_icosq *sq, struct mlx5_wq_cyc *wq, - u16 pi, u16 frag_pi) + u16 pi, u16 nnops) { struct mlx5e_sq_wqe_info *edge_wi, *wi = &sq->db.ico_wqe[pi]; - u8 nnops = mlx5_wq_cyc_get_frag_size(wq) - frag_pi; edge_wi = wi + nnops; @@ -454,15 +453,14 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix) struct mlx5_wq_cyc *wq = &sq->wq; struct mlx5e_umr_wqe *umr_wqe; u16 xlt_offset = ix << (MLX5E_LOG_ALIGNED_MPWQE_PPW - 1); - u16 pi, frag_pi; + u16 pi, contig_wqebbs_room; int err; int i; pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc); - frag_pi = mlx5_wq_cyc_ctr2fragix(wq, sq->pc); - - if (unlikely(frag_pi + MLX5E_UMR_WQEBBS > mlx5_wq_cyc_get_frag_size(wq))) { - mlx5e_fill_icosq_frag_edge(sq, wq, pi, frag_pi); + contig_wqebbs_room = mlx5_wq_cyc_get_contig_wqebbs(wq, pi); + if (unlikely(contig_wqebbs_room < MLX5E_UMR_WQEBBS)) { + mlx5e_fill_icosq_frag_edge(sq, wq, pi, contig_wqebbs_room); pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c index ae73ea992845..6dacaeba2fbf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -290,10 +290,9 @@ mlx5e_txwqe_build_dsegs(struct mlx5e_txqsq *sq, struct sk_buff *skb, static inline void mlx5e_fill_sq_frag_edge(struct mlx5e_txqsq *sq, struct mlx5_wq_cyc *wq, - u16 pi, u16 frag_pi) + u16 pi, u16 nnops) { struct mlx5e_tx_wqe_info *edge_wi, *wi = &sq->db.wqe_info[pi]; - u8 nnops = mlx5_wq_cyc_get_frag_size(wq) - frag_pi; edge_wi = wi + nnops; @@ -348,8 +347,8 @@ netdev_tx_t mlx5e_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb, struct mlx5e_tx_wqe_info *wi; struct mlx5e_sq_stats *stats = sq->stats; + u16 headlen, ihs, contig_wqebbs_room; u16 ds_cnt, ds_cnt_inl = 0; - u16 headlen, ihs, frag_pi; u8 num_wqebbs, opcode; u32 num_bytes; int num_dma; @@ -386,9 +385,9 @@ netdev_tx_t mlx5e_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb, } num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS); - frag_pi = mlx5_wq_cyc_ctr2fragix(wq, sq->pc); - if (unlikely(frag_pi + num_wqebbs > mlx5_wq_cyc_get_frag_size(wq))) { - mlx5e_fill_sq_frag_edge(sq, wq, pi, frag_pi); + contig_wqebbs_room = mlx5_wq_cyc_get_contig_wqebbs(wq, pi); + if (unlikely(contig_wqebbs_room < num_wqebbs)) { + mlx5e_fill_sq_frag_edge(sq, wq, pi, contig_wqebbs_room); mlx5e_sq_fetch_wqe(sq, &wqe, &pi); } @@ -636,7 +635,7 @@ netdev_tx_t mlx5i_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb, struct mlx5e_tx_wqe_info *wi; struct mlx5e_sq_stats *stats = sq->stats; - u16 headlen, ihs, pi, frag_pi; + u16 headlen, ihs, pi, contig_wqebbs_room; u16 ds_cnt, ds_cnt_inl = 0; u8 num_wqebbs, opcode; u32 num_bytes; @@ -672,13 +671,14 @@ netdev_tx_t mlx5i_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb, } num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS); - frag_pi = mlx5_wq_cyc_ctr2fragix(wq, sq->pc); - if (unlikely(frag_pi + num_wqebbs > mlx5_wq_cyc_get_frag_size(wq))) { + pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc); + contig_wqebbs_room = mlx5_wq_cyc_get_contig_wqebbs(wq, pi); + if (unlikely(contig_wqebbs_room < num_wqebbs)) { + mlx5e_fill_sq_frag_edge(sq, wq, pi, contig_wqebbs_room); pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc); - mlx5e_fill_sq_frag_edge(sq, wq, pi, frag_pi); } - mlx5i_sq_fetch_wqe(sq, &wqe, &pi); + mlx5i_sq_fetch_wqe(sq, &wqe, pi); /* fill wqe */ wi = &sq->db.wqe_info[pi]; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h index 08eac92fc26c..0982c579ec74 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.h @@ -109,12 +109,11 @@ struct mlx5i_tx_wqe { static inline void mlx5i_sq_fetch_wqe(struct mlx5e_txqsq *sq, struct mlx5i_tx_wqe **wqe, - u16 *pi) + u16 pi) { struct mlx5_wq_cyc *wq = &sq->wq; - *pi = mlx5_wq_cyc_ctr2ix(wq, sq->pc); - *wqe = mlx5_wq_cyc_get_wqe(wq, *pi); + *wqe = mlx5_wq_cyc_get_wqe(wq, pi); memset(*wqe, 0, sizeof(**wqe)); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wq.c b/drivers/net/ethernet/mellanox/mlx5/core/wq.c index 68e7f8df2a6d..ddca327e8950 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/wq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/wq.c @@ -39,11 +39,6 @@ u32 mlx5_wq_cyc_get_size(struct mlx5_wq_cyc *wq) return (u32)wq->fbc.sz_m1 + 1; } -u16 mlx5_wq_cyc_get_frag_size(struct mlx5_wq_cyc *wq) -{ - return wq->fbc.frag_sz_m1 + 1; -} - u32 mlx5_cqwq_get_size(struct mlx5_cqwq *wq) { return wq->fbc.sz_m1 + 1; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wq.h b/drivers/net/ethernet/mellanox/mlx5/core/wq.h index 3a1a170bb2d7..b1293d153a58 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/wq.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/wq.h @@ -80,7 +80,6 @@ int mlx5_wq_cyc_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param, void *wqc, struct mlx5_wq_cyc *wq, struct mlx5_wq_ctrl *wq_ctrl); u32 mlx5_wq_cyc_get_size(struct mlx5_wq_cyc *wq); -u16 mlx5_wq_cyc_get_frag_size(struct mlx5_wq_cyc *wq); int mlx5_wq_qp_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param, void *qpc, struct mlx5_wq_qp *wq, @@ -140,11 +139,6 @@ static inline u16 mlx5_wq_cyc_ctr2ix(struct mlx5_wq_cyc *wq, u16 ctr) return ctr & wq->fbc.sz_m1; } -static inline u16 mlx5_wq_cyc_ctr2fragix(struct mlx5_wq_cyc *wq, u16 ctr) -{ - return ctr & wq->fbc.frag_sz_m1; -} - static inline u16 mlx5_wq_cyc_get_head(struct mlx5_wq_cyc *wq) { return mlx5_wq_cyc_ctr2ix(wq, wq->wqe_ctr); @@ -160,6 +154,11 @@ static inline void *mlx5_wq_cyc_get_wqe(struct mlx5_wq_cyc *wq, u16 ix) return mlx5_frag_buf_get_wqe(&wq->fbc, ix); } +static inline u16 mlx5_wq_cyc_get_contig_wqebbs(struct mlx5_wq_cyc *wq, u16 ix) +{ + return mlx5_frag_buf_get_idx_last_contig_stride(&wq->fbc, ix) - ix + 1; +} + static inline int mlx5_wq_cyc_cc_bigger(u16 cc1, u16 cc2) { int equal = (cc1 == cc2); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 66d94b4557cf..88a041b73abf 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1032,6 +1032,14 @@ static inline void *mlx5_frag_buf_get_wqe(struct mlx5_frag_buf_ctrl *fbc, ((fbc->frag_sz_m1 & ix) << fbc->log_stride); } +static inline u32 +mlx5_frag_buf_get_idx_last_contig_stride(struct mlx5_frag_buf_ctrl *fbc, u32 ix) +{ + u32 last_frag_stride_idx = (ix + fbc->strides_offset) | fbc->frag_sz_m1; + + return min_t(u32, last_frag_stride_idx - fbc->strides_offset, fbc->sz_m1); +} + int mlx5_cmd_init(struct mlx5_core_dev *dev); void mlx5_cmd_cleanup(struct mlx5_core_dev *dev); void mlx5_cmd_use_events(struct mlx5_core_dev *dev); From cee271678d0e3177a25d0fcb2fa5e051d48e4262 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 8 Oct 2018 19:40:16 +0200 Subject: [PATCH 07/49] xsk: do not call synchronize_net() under RCU read lock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The XSKMAP update and delete functions called synchronize_net(), which can sleep. It is not allowed to sleep during an RCU read section. Instead we need to make sure that the sock sk_destruct (xsk_destruct) function is asynchronously called after an RCU grace period. Setting the SOCK_RCU_FREE flag for XDP sockets takes care of this. Fixes: fbfc504a24f5 ("bpf: introduce new bpf AF_XDP map type BPF_MAP_TYPE_XSKMAP") Reported-by: Eric Dumazet Signed-off-by: Björn Töpel Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- kernel/bpf/xskmap.c | 10 ++-------- net/xdp/xsk.c | 2 ++ 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 9f8463afda9c..47147c9e184d 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -192,11 +192,8 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, sock_hold(sock->sk); old_xs = xchg(&m->xsk_map[i], xs); - if (old_xs) { - /* Make sure we've flushed everything. */ - synchronize_net(); + if (old_xs) sock_put((struct sock *)old_xs); - } sockfd_put(sock); return 0; @@ -212,11 +209,8 @@ static int xsk_map_delete_elem(struct bpf_map *map, void *key) return -EINVAL; old_xs = xchg(&m->xsk_map[k], NULL); - if (old_xs) { - /* Make sure we've flushed everything. */ - synchronize_net(); + if (old_xs) sock_put((struct sock *)old_xs); - } return 0; } diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 4e937cd7c17d..661504042d30 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -744,6 +744,8 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol, sk->sk_destruct = xsk_destruct; sk_refcnt_debug_inc(sk); + sock_set_flag(sk, SOCK_RCU_FREE); + xs = xdp_sk(sk); mutex_init(&xs->mutex); spin_lock_init(&xs->tx_completion_lock); From 9f7e43da6ae4862b48bac233838ba808c1167a0d Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 9 Oct 2018 09:59:36 -0700 Subject: [PATCH 08/49] net/xfrm: fix out-of-bounds packet access BUG: KASAN: slab-out-of-bounds in _decode_session6+0x1331/0x14e0 net/ipv6/xfrm6_policy.c:161 Read of size 1 at addr ffff8801d882eec7 by task syz-executor1/6667 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x1c9/0x2b4 lib/dump_stack.c:113 print_address_description+0x6c/0x20b mm/kasan/report.c:256 kasan_report_error mm/kasan/report.c:354 [inline] kasan_report.cold.7+0x242/0x30d mm/kasan/report.c:412 __asan_report_load1_noabort+0x14/0x20 mm/kasan/report.c:430 _decode_session6+0x1331/0x14e0 net/ipv6/xfrm6_policy.c:161 __xfrm_decode_session+0x71/0x140 net/xfrm/xfrm_policy.c:2299 xfrm_decode_session include/net/xfrm.h:1232 [inline] vti6_tnl_xmit+0x3c3/0x1bc1 net/ipv6/ip6_vti.c:542 __netdev_start_xmit include/linux/netdevice.h:4313 [inline] netdev_start_xmit include/linux/netdevice.h:4322 [inline] xmit_one net/core/dev.c:3217 [inline] dev_hard_start_xmit+0x272/0xc10 net/core/dev.c:3233 __dev_queue_xmit+0x2ab2/0x3870 net/core/dev.c:3803 dev_queue_xmit+0x17/0x20 net/core/dev.c:3836 Reported-by: syzbot+acffccec848dc13fe459@syzkaller.appspotmail.com Reported-by: Eric Dumazet Signed-off-by: Alexei Starovoitov Signed-off-by: Steffen Klassert --- net/ipv6/xfrm6_policy.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index ef3defaf43b9..d35bcf92969c 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -146,8 +146,8 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse) fl6->daddr = reverse ? hdr->saddr : hdr->daddr; fl6->saddr = reverse ? hdr->daddr : hdr->saddr; - while (nh + offset + 1 < skb->data || - pskb_may_pull(skb, nh + offset + 1 - skb->data)) { + while (nh + offset + sizeof(*exthdr) < skb->data || + pskb_may_pull(skb, nh + offset + sizeof(*exthdr) - skb->data)) { nh = skb_network_header(skb); exthdr = (struct ipv6_opt_hdr *)(nh + offset); From 9dffff200fd178f11dd50eb1fd8ccd0650c9284e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 10 Oct 2018 18:02:21 +0200 Subject: [PATCH 09/49] xfrm: policy: use hlist rcu variants on insert bydst table/list lookups use rcu, so insertions must use rcu versions. Fixes: a7c44247f704e ("xfrm: policy: make xfrm_policy_lookup_bytype lockless") Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index f094d4b3520d..119a427d9b2b 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -632,9 +632,9 @@ static void xfrm_hash_rebuild(struct work_struct *work) break; } if (newpos) - hlist_add_behind(&policy->bydst, newpos); + hlist_add_behind_rcu(&policy->bydst, newpos); else - hlist_add_head(&policy->bydst, chain); + hlist_add_head_rcu(&policy->bydst, chain); } spin_unlock_bh(&net->xfrm.xfrm_policy_lock); @@ -774,9 +774,9 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) break; } if (newpos) - hlist_add_behind(&policy->bydst, newpos); + hlist_add_behind_rcu(&policy->bydst, newpos); else - hlist_add_head(&policy->bydst, chain); + hlist_add_head_rcu(&policy->bydst, chain); __xfrm_policy_link(policy, dir); /* After previous checking, family can either be AF_INET or AF_INET6 */ From d49c88d7677ba737e9d2759a87db0402d5ab2607 Mon Sep 17 00:00:00 2001 From: Jian-Hong Pan Date: Thu, 27 Sep 2018 12:09:48 +0800 Subject: [PATCH 10/49] r8169: Enable MSI-X on RTL8106e Originally, we have an issue where r8169 MSI-X interrupt is broken after S3 suspend/resume on RTL8106e of ASUS X441UAR. 02:00.0 Ethernet controller [0200]: Realtek Semiconductor Co., Ltd. RTL8101/2/6E PCI Express Fast/Gigabit Ethernet controller [10ec:8136] (rev 07) Subsystem: ASUSTeK Computer Inc. RTL810xE PCI Express Fast Ethernet controller [1043:200f] Flags: bus master, fast devsel, latency 0, IRQ 16 I/O ports at e000 [size=256] Memory at ef100000 (64-bit, non-prefetchable) [size=4K] Memory at e0000000 (64-bit, prefetchable) [size=16K] Capabilities: [40] Power Management version 3 Capabilities: [50] MSI: Enable- Count=1/1 Maskable- 64bit+ Capabilities: [70] Express Endpoint, MSI 01 Capabilities: [b0] MSI-X: Enable+ Count=4 Masked- Capabilities: [d0] Vital Product Data Capabilities: [100] Advanced Error Reporting Capabilities: [140] Virtual Channel Capabilities: [160] Device Serial Number 01-00-00-00-36-4c-e0-00 Capabilities: [170] Latency Tolerance Reporting Kernel driver in use: r8169 Kernel modules: r8169 We found the all of the values in PCI BAR=4 of the ethernet adapter become 0xFF after system resumes. That breaks the MSI-X interrupt. Therefore, we can only fall back to MSI interrupt to fix the issue at that time. However, there is a commit which resolves the drivers getting nothing in PCI BAR=4 after system resumes. It is 04cb3ae895d7 "PCI: Reprogram bridge prefetch registers on resume" by Daniel Drake. After apply the patch, the ethernet adapter works fine before suspend and after resume. So, we can revert the workaround after the commit "PCI: Reprogram bridge prefetch registers on resume" is merged into main tree. This patch reverts commit 7bb05b85bc2d1a1b647b91424b2ed4a18e6ecd81 "r8169: don't use MSI-X on RTL8106e". Buglink: https://bugzilla.kernel.org/show_bug.cgi?id=201181 Fixes: 7bb05b85bc2d ("r8169: don't use MSI-X on RTL8106e") Signed-off-by: Jian-Hong Pan Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c index 3a5e6160bf0d..f4df367fb894 100644 --- a/drivers/net/ethernet/realtek/r8169.c +++ b/drivers/net/ethernet/realtek/r8169.c @@ -7093,20 +7093,17 @@ static int rtl_alloc_irq(struct rtl8169_private *tp) { unsigned int flags; - switch (tp->mac_version) { - case RTL_GIGA_MAC_VER_01 ... RTL_GIGA_MAC_VER_06: + if (tp->mac_version <= RTL_GIGA_MAC_VER_06) { RTL_W8(tp, Cfg9346, Cfg9346_Unlock); RTL_W8(tp, Config2, RTL_R8(tp, Config2) & ~MSIEnable); RTL_W8(tp, Cfg9346, Cfg9346_Lock); flags = PCI_IRQ_LEGACY; - break; - case RTL_GIGA_MAC_VER_39 ... RTL_GIGA_MAC_VER_40: + } else if (tp->mac_version == RTL_GIGA_MAC_VER_40) { /* This version was reported to have issues with resume * from suspend when using MSI-X */ flags = PCI_IRQ_LEGACY | PCI_IRQ_MSI; - break; - default: + } else { flags = PCI_IRQ_ALL_TYPES; } From 2bb3207dbbd4d30e96dd0e1c8e013104193bd59c Mon Sep 17 00:00:00 2001 From: Wenwen Wang Date: Tue, 9 Oct 2018 08:15:38 -0500 Subject: [PATCH 11/49] ethtool: fix a missing-check bug In ethtool_get_rxnfc(), the eth command 'cmd' is compared against 'ETHTOOL_GRXFH' to see whether it is necessary to adjust the variable 'info_size'. Then the whole structure of 'info' is copied from the user-space buffer 'useraddr' with 'info_size' bytes. In the following execution, 'info' may be copied again from the buffer 'useraddr' depending on the 'cmd' and the 'info.flow_type'. However, after these two copies, there is no check between 'cmd' and 'info.cmd'. In fact, 'cmd' is also copied from the buffer 'useraddr' in dev_ethtool(), which is the caller function of ethtool_get_rxnfc(). Given that 'useraddr' is in the user space, a malicious user can race to change the eth command in the buffer between these copies. By doing so, the attacker can supply inconsistent data and cause undefined behavior because in the following execution 'info' will be passed to ops->get_rxnfc(). This patch adds a necessary check on 'info.cmd' and 'cmd' to confirm that they are still same after the two copies in ethtool_get_rxnfc(). Otherwise, an error code EINVAL will be returned. Signed-off-by: Wenwen Wang Signed-off-by: David S. Miller --- net/core/ethtool.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 0762aaf8e964..192f2f76b7bd 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -1015,6 +1015,9 @@ static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev, return -EINVAL; } + if (info.cmd != cmd) + return -EINVAL; + if (info.cmd == ETHTOOL_GRXCLSRLALL) { if (info.rule_cnt > 0) { if (info.rule_cnt <= KMALLOC_MAX_SIZE / sizeof(u32)) From 58f5bbe331c566f49c9559568f982202a278aa78 Mon Sep 17 00:00:00 2001 From: Wenwen Wang Date: Mon, 8 Oct 2018 10:49:35 -0500 Subject: [PATCH 12/49] ethtool: fix a privilege escalation bug In dev_ethtool(), the eth command 'ethcmd' is firstly copied from the use-space buffer 'useraddr' and checked to see whether it is ETHTOOL_PERQUEUE. If yes, the sub-command 'sub_cmd' is further copied from the user space. Otherwise, 'sub_cmd' is the same as 'ethcmd'. Next, according to 'sub_cmd', a permission check is enforced through the function ns_capable(). For example, the permission check is required if 'sub_cmd' is ETHTOOL_SCOALESCE, but it is not necessary if 'sub_cmd' is ETHTOOL_GCOALESCE, as suggested in the comment "Allow some commands to be done by anyone". The following execution invokes different handlers according to 'ethcmd'. Specifically, if 'ethcmd' is ETHTOOL_PERQUEUE, ethtool_set_per_queue() is called. In ethtool_set_per_queue(), the kernel object 'per_queue_opt' is copied again from the user-space buffer 'useraddr' and 'per_queue_opt.sub_command' is used to determine which operation should be performed. Given that the buffer 'useraddr' is in the user space, a malicious user can race to change the sub-command between the two copies. In particular, the attacker can supply ETHTOOL_PERQUEUE and ETHTOOL_GCOALESCE to bypass the permission check in dev_ethtool(). Then before ethtool_set_per_queue() is called, the attacker changes ETHTOOL_GCOALESCE to ETHTOOL_SCOALESCE. In this way, the attacker can bypass the permission check and execute ETHTOOL_SCOALESCE. This patch enforces a check in ethtool_set_per_queue() after the second copy from 'useraddr'. If the sub-command is different from the one obtained in the first copy in dev_ethtool(), an error code EINVAL will be returned. Fixes: f38d138a7da6 ("net/ethtool: support set coalesce per queue") Signed-off-by: Wenwen Wang Reviewed-by: Michal Kubecek Signed-off-by: David S. Miller --- net/core/ethtool.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 192f2f76b7bd..aeabc4831fca 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -2472,13 +2472,17 @@ static int ethtool_set_per_queue_coalesce(struct net_device *dev, return ret; } -static int ethtool_set_per_queue(struct net_device *dev, void __user *useraddr) +static int ethtool_set_per_queue(struct net_device *dev, + void __user *useraddr, u32 sub_cmd) { struct ethtool_per_queue_op per_queue_opt; if (copy_from_user(&per_queue_opt, useraddr, sizeof(per_queue_opt))) return -EFAULT; + if (per_queue_opt.sub_command != sub_cmd) + return -EINVAL; + switch (per_queue_opt.sub_command) { case ETHTOOL_GCOALESCE: return ethtool_get_per_queue_coalesce(dev, useraddr, &per_queue_opt); @@ -2849,7 +2853,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) rc = ethtool_get_phy_stats(dev, useraddr); break; case ETHTOOL_PERQUEUE: - rc = ethtool_set_per_queue(dev, useraddr); + rc = ethtool_set_per_queue(dev, useraddr, sub_cmd); break; case ETHTOOL_GLINKSETTINGS: rc = ethtool_get_link_ksettings(dev, useraddr); From e331473fee3d500bb0d2582a1fe598df3326d8cd Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Wed, 10 Oct 2018 22:00:58 +0200 Subject: [PATCH 13/49] net/sched: cls_api: add missing validation of netlink attributes Similarly to what has been done in 8b4c3cdd9dd8 ("net: sched: Add policy validation for tc attributes"), fix classifier code to add validation of TCA_CHAIN and TCA_KIND netlink attributes. tested with: # ./tdc.py -c filter v2: Let sch_api and cls_api share nla_policy they have in common, thanks to David Ahern. v3: Avoid EXPORT_SYMBOL(), as validation of those attributes is not done by TC modules, thanks to Cong Wang. While at it, restore the 'Delete / get qdisc' comment to its orginal position, just above tc_get_qdisc() function prototype. Fixes: 5bc1701881e39 ("net: sched: introduce multichain support for filters") Signed-off-by: Davide Caratti Signed-off-by: David S. Miller --- net/sched/cls_api.c | 13 ++++++++----- net/sched/sch_api.c | 8 ++++---- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 0a75cb2e5e7b..70f144ac5e1d 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -31,6 +31,8 @@ #include #include +extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1]; + /* The list of all installed classifier types */ static LIST_HEAD(tcf_proto_base); @@ -1211,7 +1213,7 @@ static int tc_new_tfilter(struct sk_buff *skb, struct nlmsghdr *n, replay: tp_created = 0; - err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL, extack); + err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, rtm_tca_policy, extack); if (err < 0) return err; @@ -1360,7 +1362,7 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n, if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) return -EPERM; - err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL, extack); + err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, rtm_tca_policy, extack); if (err < 0) return err; @@ -1475,7 +1477,7 @@ static int tc_get_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *fh = NULL; int err; - err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL, extack); + err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, rtm_tca_policy, extack); if (err < 0) return err; @@ -1838,7 +1840,7 @@ static int tc_ctl_chain(struct sk_buff *skb, struct nlmsghdr *n, return -EPERM; replay: - err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL, extack); + err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, rtm_tca_policy, extack); if (err < 0) return err; @@ -1949,7 +1951,8 @@ static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb) if (nlmsg_len(cb->nlh) < sizeof(*tcm)) return skb->len; - err = nlmsg_parse(cb->nlh, sizeof(*tcm), tca, TCA_MAX, NULL, NULL); + err = nlmsg_parse(cb->nlh, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy, + NULL); if (err) return err; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 85e73f48e48f..6684641ea344 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1307,10 +1307,6 @@ check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w) return 0; } -/* - * Delete/get qdisc. - */ - const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = { [TCA_KIND] = { .type = NLA_STRING }, [TCA_OPTIONS] = { .type = NLA_NESTED }, @@ -1323,6 +1319,10 @@ const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = { [TCA_EGRESS_BLOCK] = { .type = NLA_U32 }, }; +/* + * Delete/get qdisc. + */ + static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, struct netlink_ext_ack *extack) { From 5a8e7aea953bdb6d4da13aff6f1e7f9c62023499 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 11 Oct 2018 11:15:13 -0700 Subject: [PATCH 14/49] llc: set SOCK_RCU_FREE in llc_sap_add_socket() WHen an llc sock is added into the sk_laddr_hash of an llc_sap, it is not marked with SOCK_RCU_FREE. This causes that the sock could be freed while it is still being read by __llc_lookup_established() with RCU read lock. sock is refcounted, but with RCU read lock, nothing prevents the readers getting a zero refcnt. Fix it by setting SOCK_RCU_FREE in llc_sap_add_socket(). Reported-by: syzbot+11e05f04c15e03be5254@syzkaller.appspotmail.com Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/llc/llc_conn.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c index c0ac522b48a1..4ff89cb7c86f 100644 --- a/net/llc/llc_conn.c +++ b/net/llc/llc_conn.c @@ -734,6 +734,7 @@ void llc_sap_add_socket(struct llc_sap *sap, struct sock *sk) llc_sk(sk)->sap = sap; spin_lock_bh(&sap->sk_lock); + sock_set_flag(sk, SOCK_RCU_FREE); sap->sk_count++; sk_nulls_add_node_rcu(sk, laddr_hb); hlist_add_head(&llc->dev_hash_node, dev_hb); From 4af00f4cc1ba34da4654ac31830843cae871642d Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Thu, 11 Oct 2018 22:02:29 +0200 Subject: [PATCH 15/49] tipc: initialize broadcast link stale counter correctly In the commit referred to below we added link tolerance as an additional criteria for declaring broadcast transmission "stale" and resetting the unicast links to the affected node. Unfortunately, this 'improvement' introduced two bugs, which each and one alone cause only limited problems, but combined lead to seemingly stochastic unicast link resets, depending on the amount of broadcast traffic transmitted. The first issue, a missing initialization of the 'tolerance' field of the receiver broadcast link, was recently fixed by commit 047491ea334a ("tipc: set link tolerance correctly in broadcast link"). Ths second issue, where we omit to reset the 'stale_cnt' field of the same link after a 'stale' period is over, leads to this counter accumulating over time, and in the absence of the 'tolerance' criteria leads to the above described symptoms. This commit adds the missing initialization. Fixes: a4dc70d46cf1 ("tipc: extend link reset criteria for stale packet retransmission") Signed-off-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/link.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/tipc/link.c b/net/tipc/link.c index f6552e4f4b43..201c3b5bc96b 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1041,6 +1041,7 @@ static int tipc_link_retrans(struct tipc_link *l, struct tipc_link *r, if (r->last_retransm != buf_seqno(skb)) { r->last_retransm = buf_seqno(skb); r->stale_limit = jiffies + msecs_to_jiffies(r->tolerance); + r->stale_cnt = 0; } else if (++r->stale_cnt > 99 && time_after(jiffies, r->stale_limit)) { link_retransmit_failure(l, skb); if (link_is_bc_sndlink(l)) From d7b4c24f45d2efe51b8f213da4593fefd49240ba Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 11 Oct 2018 22:32:31 +0100 Subject: [PATCH 16/49] rxrpc: Fix an uninitialised variable Fix an uninitialised variable introduced by the last patch. This can cause a crash when a new call comes in to a local service, such as when an AFS fileserver calls back to the local cache manager. Fixes: c1e15b4944c9 ("rxrpc: Fix the packet reception routine") Signed-off-by: David Howells Signed-off-by: David S. Miller --- net/rxrpc/call_accept.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 652e314de38e..8079aacaecac 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -337,7 +337,7 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_connection *conn; - struct rxrpc_peer *peer; + struct rxrpc_peer *peer = NULL; struct rxrpc_call *call; _enter(""); From d6672a5a97918f92bf2f3a2591f25d02bb0897a4 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 11 Oct 2018 22:32:39 +0100 Subject: [PATCH 17/49] rxrpc: use correct kvec num when sending BUSY response packet Fixes gcc '-Wunused-but-set-variable' warning: net/rxrpc/output.c: In function 'rxrpc_reject_packets': net/rxrpc/output.c:527:11: warning: variable 'ioc' set but not used [-Wunused-but-set-variable] 'ioc' is the correct kvec num when sending a BUSY (or an ABORT) response packet. Fixes: ece64fec164f ("rxrpc: Emit BUSY packets when supposed to rather than ABORTs") Signed-off-by: YueHaibing Signed-off-by: David Howells Signed-off-by: David S. Miller --- net/rxrpc/output.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index e8fb8922bca8..a141ee3ab812 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -572,7 +572,8 @@ void rxrpc_reject_packets(struct rxrpc_local *local) whdr.flags ^= RXRPC_CLIENT_INITIATED; whdr.flags &= RXRPC_CLIENT_INITIATED; - ret = kernel_sendmsg(local->socket, &msg, iov, 2, size); + ret = kernel_sendmsg(local->socket, &msg, + iov, ioc, size); if (ret < 0) trace_rxrpc_tx_fail(local->debug_id, 0, ret, rxrpc_tx_point_reject); From 64bd9c8135751b561f27edaaffe93d07093f81af Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 11 Oct 2018 15:06:33 -0700 Subject: [PATCH 18/49] net: bcmgenet: Poll internal PHY for GENETv5 On GENETv5, there is a hardware issue which prevents the GENET hardware from generating a link UP interrupt when the link is operating at 10Mbits/sec. Since we do not have any way to configure the link detection logic, fallback to polling in that case. Fixes: 421380856d9c ("net: bcmgenet: add support for the GENETv5 hardware") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/genet/bcmmii.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/genet/bcmmii.c b/drivers/net/ethernet/broadcom/genet/bcmmii.c index 4241ae928d4a..34af5f1569c8 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmmii.c +++ b/drivers/net/ethernet/broadcom/genet/bcmmii.c @@ -321,9 +321,12 @@ int bcmgenet_mii_probe(struct net_device *dev) phydev->advertising = phydev->supported; /* The internal PHY has its link interrupts routed to the - * Ethernet MAC ISRs + * Ethernet MAC ISRs. On GENETv5 there is a hardware issue + * that prevents the signaling of link UP interrupts when + * the link operates at 10Mbps, so fallback to polling for + * those versions of GENET. */ - if (priv->internal_phy) + if (priv->internal_phy && !GENET_IS_V5(priv)) dev->phydev->irq = PHY_IGNORE_INTERRUPT; return 0; From f547fac624be53ad8b07e9ebca7654a7827ba61b Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 12 Oct 2018 16:22:47 +0200 Subject: [PATCH 19/49] ipv6: rate-limit probes for neighbourless routes When commit 270972554c91 ("[IPV6]: ROUTE: Add Router Reachability Probing (RFC4191).") introduced router probing, the rt6_probe() function required that a neighbour entry existed. This neighbour entry is used to record the timestamp of the last probe via the ->updated field. Later, commit 2152caea7196 ("ipv6: Do not depend on rt->n in rt6_probe().") removed the requirement for a neighbour entry. Neighbourless routes skip the interval check and are not rate-limited. This patch adds rate-limiting for neighbourless routes, by recording the timestamp of the last probe in the fib6_info itself. Fixes: 2152caea7196 ("ipv6: Do not depend on rt->n in rt6_probe().") Signed-off-by: Sabrina Dubroca Reviewed-by: Stefano Brivio Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 4 ++++ net/ipv6/route.c | 12 ++++++------ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 3d4930528db0..2d31e22babd8 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -159,6 +159,10 @@ struct fib6_info { struct rt6_info * __percpu *rt6i_pcpu; struct rt6_exception_bucket __rcu *rt6i_exception_bucket; +#ifdef CONFIG_IPV6_ROUTER_PREF + unsigned long last_probe; +#endif + u32 fib6_metric; u8 fib6_protocol; u8 fib6_type; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index a366c05a239d..abcb5ae77319 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -520,10 +520,11 @@ static void rt6_probe_deferred(struct work_struct *w) static void rt6_probe(struct fib6_info *rt) { - struct __rt6_probe_work *work; + struct __rt6_probe_work *work = NULL; const struct in6_addr *nh_gw; struct neighbour *neigh; struct net_device *dev; + struct inet6_dev *idev; /* * Okay, this does not seem to be appropriate @@ -539,15 +540,12 @@ static void rt6_probe(struct fib6_info *rt) nh_gw = &rt->fib6_nh.nh_gw; dev = rt->fib6_nh.nh_dev; rcu_read_lock_bh(); + idev = __in6_dev_get(dev); neigh = __ipv6_neigh_lookup_noref(dev, nh_gw); if (neigh) { - struct inet6_dev *idev; - if (neigh->nud_state & NUD_VALID) goto out; - idev = __in6_dev_get(dev); - work = NULL; write_lock(&neigh->lock); if (!(neigh->nud_state & NUD_VALID) && time_after(jiffies, @@ -557,11 +555,13 @@ static void rt6_probe(struct fib6_info *rt) __neigh_set_probe_once(neigh); } write_unlock(&neigh->lock); - } else { + } else if (time_after(jiffies, rt->last_probe + + idev->cnf.rtr_probe_interval)) { work = kmalloc(sizeof(*work), GFP_ATOMIC); } if (work) { + rt->last_probe = jiffies; INIT_WORK(&work->work, rt6_probe_deferred); work->target = *nh_gw; dev_hold(dev); From 7ec8dc96e1cb45693f28f1287802ef6f2888dae0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 12 Oct 2018 16:38:36 +0100 Subject: [PATCH 20/49] rxrpc: Fix incorrect conditional on IPV6 The udpv6_encap_enable() function is part of the ipv6 code, and if that is configured as a loadable module and rxrpc is built in then a build failure will occur because the conditional check is wrong: net/rxrpc/local_object.o: In function `rxrpc_lookup_local': local_object.c:(.text+0x2688): undefined reference to `udpv6_encap_enable' Use the correct config symbol (CONFIG_AF_RXRPC_IPV6) in the conditional check rather than CONFIG_IPV6 as that will do the right thing. Fixes: 5271953cad31 ("rxrpc: Use the UDP encap_rcv hook") Reported-by: kbuild-all@01.org Reported-by: Arnd Bergmann Signed-off-by: David Howells Reviewed-by: Arnd Bergmann Signed-off-by: David S. Miller --- net/rxrpc/local_object.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index cad0691c2bb4..0906e51d3cfb 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -139,7 +139,7 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) udp_sk(usk)->gro_complete = NULL; udp_encap_enable(); -#if IS_ENABLED(CONFIG_IPV6) +#if IS_ENABLED(CONFIG_AF_RXRPC_IPV6) if (local->srx.transport.family == AF_INET6) udpv6_encap_enable(); #endif From d3092b2efca1cd1d492d0b08499a2066c5ca8cec Mon Sep 17 00:00:00 2001 From: Tung Nguyen Date: Fri, 12 Oct 2018 22:46:55 +0200 Subject: [PATCH 21/49] tipc: fix unsafe rcu locking when accessing publication list The binding table's 'cluster_scope' list is rcu protected to handle races between threads changing the list and those traversing the list at the same moment. We have now found that the function named_distribute() uses the regular list_for_each() macro to traverse the said list. Likewise, the function tipc_named_withdraw() is removing items from the same list using the regular list_del() call. When these two functions execute in parallel we see occasional crashes. This commit fixes this by adding the missing _rcu() suffixes. Signed-off-by: Tung Nguyen Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/name_distr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c index 51b4b96f89db..3cfeb9df64b0 100644 --- a/net/tipc/name_distr.c +++ b/net/tipc/name_distr.c @@ -115,7 +115,7 @@ struct sk_buff *tipc_named_withdraw(struct net *net, struct publication *publ) struct sk_buff *buf; struct distr_item *item; - list_del(&publ->binding_node); + list_del_rcu(&publ->binding_node); if (publ->scope == TIPC_NODE_SCOPE) return NULL; @@ -147,7 +147,7 @@ static void named_distribute(struct net *net, struct sk_buff_head *list, ITEM_SIZE) * ITEM_SIZE; u32 msg_rem = msg_dsz; - list_for_each_entry(publ, pls, binding_node) { + list_for_each_entry_rcu(publ, pls, binding_node) { /* Prepare next buffer: */ if (!skb) { skb = named_prepare_buf(net, PUBLICATION, msg_rem, From dc012f3628eaecfb5ba68404a5c30ef501daf63d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 12 Oct 2018 18:58:53 -0700 Subject: [PATCH 22/49] ipv6: mcast: fix a use-after-free in inet6_mc_check syzbot found a use-after-free in inet6_mc_check [1] The problem here is that inet6_mc_check() uses rcu and read_lock(&iml->sflock) So the fact that ip6_mc_leave_src() is called under RTNL and the socket lock does not help us, we need to acquire iml->sflock in write mode. In the future, we should convert all this stuff to RCU. [1] BUG: KASAN: use-after-free in ipv6_addr_equal include/net/ipv6.h:521 [inline] BUG: KASAN: use-after-free in inet6_mc_check+0xae7/0xb40 net/ipv6/mcast.c:649 Read of size 8 at addr ffff8801ce7f2510 by task syz-executor0/22432 CPU: 1 PID: 22432 Comm: syz-executor0 Not tainted 4.19.0-rc7+ #280 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x1c4/0x2b4 lib/dump_stack.c:113 print_address_description.cold.8+0x9/0x1ff mm/kasan/report.c:256 kasan_report_error mm/kasan/report.c:354 [inline] kasan_report.cold.9+0x242/0x309 mm/kasan/report.c:412 __asan_report_load8_noabort+0x14/0x20 mm/kasan/report.c:433 ipv6_addr_equal include/net/ipv6.h:521 [inline] inet6_mc_check+0xae7/0xb40 net/ipv6/mcast.c:649 __raw_v6_lookup+0x320/0x3f0 net/ipv6/raw.c:98 ipv6_raw_deliver net/ipv6/raw.c:183 [inline] raw6_local_deliver+0x3d3/0xcb0 net/ipv6/raw.c:240 ip6_input_finish+0x467/0x1aa0 net/ipv6/ip6_input.c:345 NF_HOOK include/linux/netfilter.h:289 [inline] ip6_input+0xe9/0x600 net/ipv6/ip6_input.c:426 ip6_mc_input+0x48a/0xd20 net/ipv6/ip6_input.c:503 dst_input include/net/dst.h:450 [inline] ip6_rcv_finish+0x17a/0x330 net/ipv6/ip6_input.c:76 NF_HOOK include/linux/netfilter.h:289 [inline] ipv6_rcv+0x120/0x640 net/ipv6/ip6_input.c:271 __netif_receive_skb_one_core+0x14d/0x200 net/core/dev.c:4913 __netif_receive_skb+0x2c/0x1e0 net/core/dev.c:5023 netif_receive_skb_internal+0x12c/0x620 net/core/dev.c:5126 napi_frags_finish net/core/dev.c:5664 [inline] napi_gro_frags+0x75a/0xc90 net/core/dev.c:5737 tun_get_user+0x3189/0x4250 drivers/net/tun.c:1923 tun_chr_write_iter+0xb9/0x154 drivers/net/tun.c:1968 call_write_iter include/linux/fs.h:1808 [inline] do_iter_readv_writev+0x8b0/0xa80 fs/read_write.c:680 do_iter_write+0x185/0x5f0 fs/read_write.c:959 vfs_writev+0x1f1/0x360 fs/read_write.c:1004 do_writev+0x11a/0x310 fs/read_write.c:1039 __do_sys_writev fs/read_write.c:1112 [inline] __se_sys_writev fs/read_write.c:1109 [inline] __x64_sys_writev+0x75/0xb0 fs/read_write.c:1109 do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x457421 Code: 75 14 b8 14 00 00 00 0f 05 48 3d 01 f0 ff ff 0f 83 34 b5 fb ff c3 48 83 ec 08 e8 1a 2d 00 00 48 89 04 24 b8 14 00 00 00 0f 05 <48> 8b 3c 24 48 89 c2 e8 63 2d 00 00 48 89 d0 48 83 c4 08 48 3d 01 RSP: 002b:00007f2d30ecaba0 EFLAGS: 00000293 ORIG_RAX: 0000000000000014 RAX: ffffffffffffffda RBX: 000000000000003e RCX: 0000000000457421 RDX: 0000000000000001 RSI: 00007f2d30ecabf0 RDI: 00000000000000f0 RBP: 0000000020000500 R08: 00000000000000f0 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000293 R12: 00007f2d30ecb6d4 R13: 00000000004c4890 R14: 00000000004d7b90 R15: 00000000ffffffff Allocated by task 22437: save_stack+0x43/0xd0 mm/kasan/kasan.c:448 set_track mm/kasan/kasan.c:460 [inline] kasan_kmalloc+0xc7/0xe0 mm/kasan/kasan.c:553 __do_kmalloc mm/slab.c:3718 [inline] __kmalloc+0x14e/0x760 mm/slab.c:3727 kmalloc include/linux/slab.h:518 [inline] sock_kmalloc+0x15a/0x1f0 net/core/sock.c:1983 ip6_mc_source+0x14dd/0x1960 net/ipv6/mcast.c:427 do_ipv6_setsockopt.isra.9+0x3afb/0x45d0 net/ipv6/ipv6_sockglue.c:743 ipv6_setsockopt+0xbd/0x170 net/ipv6/ipv6_sockglue.c:933 rawv6_setsockopt+0x59/0x140 net/ipv6/raw.c:1069 sock_common_setsockopt+0x9a/0xe0 net/core/sock.c:3038 __sys_setsockopt+0x1ba/0x3c0 net/socket.c:1902 __do_sys_setsockopt net/socket.c:1913 [inline] __se_sys_setsockopt net/socket.c:1910 [inline] __x64_sys_setsockopt+0xbe/0x150 net/socket.c:1910 do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe Freed by task 22430: save_stack+0x43/0xd0 mm/kasan/kasan.c:448 set_track mm/kasan/kasan.c:460 [inline] __kasan_slab_free+0x102/0x150 mm/kasan/kasan.c:521 kasan_slab_free+0xe/0x10 mm/kasan/kasan.c:528 __cache_free mm/slab.c:3498 [inline] kfree+0xcf/0x230 mm/slab.c:3813 __sock_kfree_s net/core/sock.c:2004 [inline] sock_kfree_s+0x29/0x60 net/core/sock.c:2010 ip6_mc_leave_src+0x11a/0x1d0 net/ipv6/mcast.c:2448 __ipv6_sock_mc_close+0x20b/0x4e0 net/ipv6/mcast.c:310 ipv6_sock_mc_close+0x158/0x1d0 net/ipv6/mcast.c:328 inet6_release+0x40/0x70 net/ipv6/af_inet6.c:452 __sock_release+0xd7/0x250 net/socket.c:579 sock_close+0x19/0x20 net/socket.c:1141 __fput+0x385/0xa30 fs/file_table.c:278 ____fput+0x15/0x20 fs/file_table.c:309 task_work_run+0x1e8/0x2a0 kernel/task_work.c:113 tracehook_notify_resume include/linux/tracehook.h:193 [inline] exit_to_usermode_loop+0x318/0x380 arch/x86/entry/common.c:166 prepare_exit_to_usermode arch/x86/entry/common.c:197 [inline] syscall_return_slowpath arch/x86/entry/common.c:268 [inline] do_syscall_64+0x6be/0x820 arch/x86/entry/common.c:293 entry_SYSCALL_64_after_hwframe+0x49/0xbe The buggy address belongs to the object at ffff8801ce7f2500 which belongs to the cache kmalloc-192 of size 192 The buggy address is located 16 bytes inside of 192-byte region [ffff8801ce7f2500, ffff8801ce7f25c0) The buggy address belongs to the page: page:ffffea000739fc80 count:1 mapcount:0 mapping:ffff8801da800040 index:0x0 flags: 0x2fffc0000000100(slab) raw: 02fffc0000000100 ffffea0006f6e548 ffffea000737b948 ffff8801da800040 raw: 0000000000000000 ffff8801ce7f2000 0000000100000010 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff8801ce7f2400: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff8801ce7f2480: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc >ffff8801ce7f2500: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff8801ce7f2580: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc ffff8801ce7f2600: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/ipv6/mcast.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 4ae54aaca373..dbab62e3f0d7 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -2436,17 +2436,17 @@ static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, { int err; - /* callers have the socket lock and rtnl lock - * so no other readers or writers of iml or its sflist - */ + write_lock_bh(&iml->sflock); if (!iml->sflist) { /* any-source empty exclude case */ - return ip6_mc_del_src(idev, &iml->addr, iml->sfmode, 0, NULL, 0); + err = ip6_mc_del_src(idev, &iml->addr, iml->sfmode, 0, NULL, 0); + } else { + err = ip6_mc_del_src(idev, &iml->addr, iml->sfmode, + iml->sflist->sl_count, iml->sflist->sl_addr, 0); + sock_kfree_s(sk, iml->sflist, IP6_SFLSIZE(iml->sflist->sl_max)); + iml->sflist = NULL; } - err = ip6_mc_del_src(idev, &iml->addr, iml->sfmode, - iml->sflist->sl_count, iml->sflist->sl_addr, 0); - sock_kfree_s(sk, iml->sflist, IP6_SFLSIZE(iml->sflist->sl_max)); - iml->sflist = NULL; + write_unlock_bh(&iml->sflock); return err; } From fbe1222c63b805e946c3af29b0bfbfee4c2fbeff Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sat, 13 Oct 2018 16:48:25 +0100 Subject: [PATCH 23/49] qed: fix spelling mistake "Ireelevant" -> "Irrelevant" Trivial fix to spelling mistake in DP_INFO message Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_int.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_int.c b/drivers/net/ethernet/qlogic/qed/qed_int.c index af3a28ec04eb..0f0aba793352 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_int.c +++ b/drivers/net/ethernet/qlogic/qed/qed_int.c @@ -228,7 +228,7 @@ static int qed_grc_attn_cb(struct qed_hwfn *p_hwfn) attn_master_to_str(GET_FIELD(tmp, QED_GRC_ATTENTION_MASTER)), GET_FIELD(tmp2, QED_GRC_ATTENTION_PF), (GET_FIELD(tmp2, QED_GRC_ATTENTION_PRIV) == - QED_GRC_ATTENTION_PRIV_VF) ? "VF" : "(Ireelevant)", + QED_GRC_ATTENTION_PRIV_VF) ? "VF" : "(Irrelevant)", GET_FIELD(tmp2, QED_GRC_ATTENTION_VF)); out: From ec20a63aa8b8ec3223fb25cdb2a49f9f9dfda88c Mon Sep 17 00:00:00 2001 From: Fugang Duan Date: Mon, 15 Oct 2018 05:19:00 +0000 Subject: [PATCH 24/49] net: fec: don't dump RX FIFO register when not available Commit db65f35f50e0 ("net: fec: add support of ethtool get_regs") introduce ethool "--register-dump" interface to dump all FEC registers. But not all silicon implementations of the Freescale FEC hardware module have the FRBR (FIFO Receive Bound Register) and FRSR (FIFO Receive Start Register) register, so we should not be trying to dump them on those that don't. To fix it we create a quirk flag, FEC_QUIRK_HAS_RFREG, and check it before dump those RX FIFO registers. Signed-off-by: Fugang Duan Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/fec.h | 4 ++++ drivers/net/ethernet/freescale/fec_main.c | 16 ++++++++++++---- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/freescale/fec.h b/drivers/net/ethernet/freescale/fec.h index 4778b663653e..bf80855dd0dd 100644 --- a/drivers/net/ethernet/freescale/fec.h +++ b/drivers/net/ethernet/freescale/fec.h @@ -452,6 +452,10 @@ struct bufdesc_ex { * initialisation. */ #define FEC_QUIRK_MIB_CLEAR (1 << 15) +/* Only i.MX25/i.MX27/i.MX28 controller supports FRBR,FRSR registers, + * those FIFO receive registers are resolved in other platforms. + */ +#define FEC_QUIRK_HAS_FRREG (1 << 16) struct bufdesc_prop { int qid; diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index bf9b9fd6d2a0..7b98bb75ba8a 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -91,14 +91,16 @@ static struct platform_device_id fec_devtype[] = { .driver_data = 0, }, { .name = "imx25-fec", - .driver_data = FEC_QUIRK_USE_GASKET | FEC_QUIRK_MIB_CLEAR, + .driver_data = FEC_QUIRK_USE_GASKET | FEC_QUIRK_MIB_CLEAR | + FEC_QUIRK_HAS_FRREG, }, { .name = "imx27-fec", - .driver_data = FEC_QUIRK_MIB_CLEAR, + .driver_data = FEC_QUIRK_MIB_CLEAR | FEC_QUIRK_HAS_FRREG, }, { .name = "imx28-fec", .driver_data = FEC_QUIRK_ENET_MAC | FEC_QUIRK_SWAP_FRAME | - FEC_QUIRK_SINGLE_MDIO | FEC_QUIRK_HAS_RACC, + FEC_QUIRK_SINGLE_MDIO | FEC_QUIRK_HAS_RACC | + FEC_QUIRK_HAS_FRREG, }, { .name = "imx6q-fec", .driver_data = FEC_QUIRK_ENET_MAC | FEC_QUIRK_HAS_GBIT | @@ -2164,7 +2166,13 @@ static void fec_enet_get_regs(struct net_device *ndev, memset(buf, 0, regs->len); for (i = 0; i < ARRAY_SIZE(fec_enet_register_offset); i++) { - off = fec_enet_register_offset[i] / 4; + off = fec_enet_register_offset[i]; + + if ((off == FEC_R_BOUND || off == FEC_R_FSTART) && + !(fep->quirks & FEC_QUIRK_HAS_FRREG)) + continue; + + off >>= 2; buf[off] = readl(&theregs[off]); } } From d805397c3822d57ca3884d4bea37b2291fc40992 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 15 Oct 2018 19:58:29 +0800 Subject: [PATCH 25/49] sctp: use the pmtu from the icmp packet to update transport pathmtu Other than asoc pmtu sync from all transports, sctp_assoc_sync_pmtu is also processing transport pmtu_pending by icmp packets. But it's meaningless to use sctp_dst_mtu(t->dst) as new pmtu for a transport. The right pmtu value should come from the icmp packet, and it would be saved into transport->mtu_info in this patch and used later when the pmtu sync happens in sctp_sendmsg_to_asoc or sctp_packet_config. Besides, without this patch, as pmtu can only be updated correctly when receiving a icmp packet and no place is holding sock lock, it will take long time if the sock is busy with sending packets. Note that it doesn't process transport->mtu_info in .release_cb(), as there is no enough information for pmtu update, like for which asoc or transport. It is not worth traversing all asocs to check pmtu_pending. So unlike tcp, sctp does this in tx path, for which mtu_info needs to be atomic_t. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 2 ++ net/sctp/associola.c | 3 ++- net/sctp/input.c | 1 + net/sctp/output.c | 6 ++++++ 4 files changed, 11 insertions(+), 1 deletion(-) diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 28a7c8e44636..a11f93790476 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -876,6 +876,8 @@ struct sctp_transport { unsigned long sackdelay; __u32 sackfreq; + atomic_t mtu_info; + /* When was the last time that we heard from this transport? We use * this to pick new active and retran paths. */ diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 297d9cf960b9..a827a1f562bf 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -1450,7 +1450,8 @@ void sctp_assoc_sync_pmtu(struct sctp_association *asoc) /* Get the lowest pmtu of all the transports. */ list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) { if (t->pmtu_pending && t->dst) { - sctp_transport_update_pmtu(t, sctp_dst_mtu(t->dst)); + sctp_transport_update_pmtu(t, + atomic_read(&t->mtu_info)); t->pmtu_pending = 0; } if (!pmtu || (t->pathmtu < pmtu)) diff --git a/net/sctp/input.c b/net/sctp/input.c index 9bbc5f92c941..5c36a99882ed 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -395,6 +395,7 @@ void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc, return; if (sock_owned_by_user(sk)) { + atomic_set(&t->mtu_info, pmtu); asoc->pmtu_pending = 1; t->pmtu_pending = 1; return; diff --git a/net/sctp/output.c b/net/sctp/output.c index 7f849b01ec8e..67939ad99c01 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -120,6 +120,12 @@ void sctp_packet_config(struct sctp_packet *packet, __u32 vtag, sctp_assoc_sync_pmtu(asoc); } + if (asoc->pmtu_pending) { + if (asoc->param_flags & SPP_PMTUD_ENABLE) + sctp_assoc_sync_pmtu(asoc); + asoc->pmtu_pending = 0; + } + /* If there a is a prepend chunk stick it on the list before * any other chunks get appended. */ From 1890fea7936ad9be0b7caf6a94146b0d905c4b60 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 15 Oct 2018 22:37:21 +0100 Subject: [PATCH 26/49] rxrpc: Fix a missing rxrpc_put_peer() in the error_report handler Fix a missing call to rxrpc_put_peer() on the main path through the rxrpc_error_report() function. This manifests itself as a ref leak whenever an ICMP packet or other error comes in. In commit f334430316e7, the hand-off of the ref to a work item was removed and was not replaced with a put. Fixes: f334430316e7 ("rxrpc: Fix error distribution") Signed-off-by: David Howells Signed-off-by: David S. Miller --- net/rxrpc/peer_event.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index 05b51bdbdd41..bd2fa3b7caa7 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -195,6 +195,7 @@ void rxrpc_error_report(struct sock *sk) rxrpc_store_error(peer, serr); rcu_read_unlock(); rxrpc_free_skb(skb, rxrpc_skb_rx_freed); + rxrpc_put_peer(peer); _leave(""); } From 8913806f166e47c6b3fe8253e9cfb9caabe64341 Mon Sep 17 00:00:00 2001 From: Pieter Jansen van Vuuren Date: Mon, 15 Oct 2018 16:52:23 -0700 Subject: [PATCH 27/49] nfp: flower: fix pedit set actions for multiple partial masks Previously we did not correctly change headers when using multiple pedit actions with partial masks. We now take this into account and no longer just commit the last pedit action. Fixes: c0b1bd9a8b8a ("nfp: add set ipv4 header action flower offload") Signed-off-by: Pieter Jansen van Vuuren Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- .../net/ethernet/netronome/nfp/flower/action.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/flower/action.c b/drivers/net/ethernet/netronome/nfp/flower/action.c index 46ba0cf257c6..91de7a9b0190 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/action.c +++ b/drivers/net/ethernet/netronome/nfp/flower/action.c @@ -429,12 +429,14 @@ nfp_fl_set_ip4(const struct tc_action *action, int idx, u32 off, switch (off) { case offsetof(struct iphdr, daddr): - set_ip_addr->ipv4_dst_mask = mask; - set_ip_addr->ipv4_dst = exact; + set_ip_addr->ipv4_dst_mask |= mask; + set_ip_addr->ipv4_dst &= ~mask; + set_ip_addr->ipv4_dst |= exact & mask; break; case offsetof(struct iphdr, saddr): - set_ip_addr->ipv4_src_mask = mask; - set_ip_addr->ipv4_src = exact; + set_ip_addr->ipv4_src_mask |= mask; + set_ip_addr->ipv4_src &= ~mask; + set_ip_addr->ipv4_src |= exact & mask; break; default: return -EOPNOTSUPP; @@ -451,8 +453,9 @@ static void nfp_fl_set_ip6_helper(int opcode_tag, int idx, __be32 exact, __be32 mask, struct nfp_fl_set_ipv6_addr *ip6) { - ip6->ipv6[idx % 4].mask = mask; - ip6->ipv6[idx % 4].exact = exact; + ip6->ipv6[idx % 4].mask |= mask; + ip6->ipv6[idx % 4].exact &= ~mask; + ip6->ipv6[idx % 4].exact |= exact & mask; ip6->reserved = cpu_to_be16(0); ip6->head.jump_id = opcode_tag; From d08c9e589300b015e72e5b41ff4dfed6eb8e7421 Mon Sep 17 00:00:00 2001 From: Pieter Jansen van Vuuren Date: Mon, 15 Oct 2018 16:52:24 -0700 Subject: [PATCH 28/49] nfp: flower: fix multiple keys per pedit action Previously we only allowed a single header key per pedit action to change the header. This used to result in the last header key in the pedit action to overwrite previous headers. We now keep track of them and allow multiple header keys per pedit action. Fixes: c0b1bd9a8b8a ("nfp: add set ipv4 header action flower offload") Fixes: 354b82bb320e ("nfp: add set ipv6 source and destination address") Fixes: f8b7b0a6b113 ("nfp: add set tcp and udp header action flower offload") Signed-off-by: Pieter Jansen van Vuuren Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- .../net/ethernet/netronome/nfp/flower/action.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/flower/action.c b/drivers/net/ethernet/netronome/nfp/flower/action.c index 91de7a9b0190..c39d7fdf73e6 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/action.c +++ b/drivers/net/ethernet/netronome/nfp/flower/action.c @@ -544,7 +544,7 @@ nfp_fl_pedit(const struct tc_action *action, struct tc_cls_flower_offload *flow, struct nfp_fl_set_eth set_eth; enum pedit_header_type htype; int idx, nkeys, err; - size_t act_size; + size_t act_size = 0; u32 offset, cmd; u8 ip_proto = 0; @@ -602,7 +602,9 @@ nfp_fl_pedit(const struct tc_action *action, struct tc_cls_flower_offload *flow, act_size = sizeof(set_eth); memcpy(nfp_action, &set_eth, act_size); *a_len += act_size; - } else if (set_ip_addr.head.len_lw) { + } + if (set_ip_addr.head.len_lw) { + nfp_action += act_size; act_size = sizeof(set_ip_addr); memcpy(nfp_action, &set_ip_addr, act_size); *a_len += act_size; @@ -610,10 +612,12 @@ nfp_fl_pedit(const struct tc_action *action, struct tc_cls_flower_offload *flow, /* Hardware will automatically fix IPv4 and TCP/UDP checksum. */ *csum_updated |= TCA_CSUM_UPDATE_FLAG_IPV4HDR | nfp_fl_csum_l4_to_flag(ip_proto); - } else if (set_ip6_dst.head.len_lw && set_ip6_src.head.len_lw) { + } + if (set_ip6_dst.head.len_lw && set_ip6_src.head.len_lw) { /* TC compiles set src and dst IPv6 address as a single action, * the hardware requires this to be 2 separate actions. */ + nfp_action += act_size; act_size = sizeof(set_ip6_src); memcpy(nfp_action, &set_ip6_src, act_size); *a_len += act_size; @@ -626,6 +630,7 @@ nfp_fl_pedit(const struct tc_action *action, struct tc_cls_flower_offload *flow, /* Hardware will automatically fix TCP/UDP checksum. */ *csum_updated |= nfp_fl_csum_l4_to_flag(ip_proto); } else if (set_ip6_dst.head.len_lw) { + nfp_action += act_size; act_size = sizeof(set_ip6_dst); memcpy(nfp_action, &set_ip6_dst, act_size); *a_len += act_size; @@ -633,13 +638,16 @@ nfp_fl_pedit(const struct tc_action *action, struct tc_cls_flower_offload *flow, /* Hardware will automatically fix TCP/UDP checksum. */ *csum_updated |= nfp_fl_csum_l4_to_flag(ip_proto); } else if (set_ip6_src.head.len_lw) { + nfp_action += act_size; act_size = sizeof(set_ip6_src); memcpy(nfp_action, &set_ip6_src, act_size); *a_len += act_size; /* Hardware will automatically fix TCP/UDP checksum. */ *csum_updated |= nfp_fl_csum_l4_to_flag(ip_proto); - } else if (set_tport.head.len_lw) { + } + if (set_tport.head.len_lw) { + nfp_action += act_size; act_size = sizeof(set_tport); memcpy(nfp_action, &set_tport, act_size); *a_len += act_size; From 140b6abac26d799f75d772ab5e969b34ad8d68f1 Mon Sep 17 00:00:00 2001 From: Pieter Jansen van Vuuren Date: Mon, 15 Oct 2018 16:52:25 -0700 Subject: [PATCH 29/49] nfp: flower: use offsets provided by pedit instead of index for ipv6 Previously when populating the set ipv6 address action, we incorrectly made use of pedit's key index to determine which 32bit word should be set. We now calculate which word has been selected based on the offset provided by the pedit action. Fixes: 354b82bb320e ("nfp: add set ipv6 source and destination address") Signed-off-by: Pieter Jansen van Vuuren Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- .../ethernet/netronome/nfp/flower/action.c | 26 +++++++++++-------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/flower/action.c b/drivers/net/ethernet/netronome/nfp/flower/action.c index c39d7fdf73e6..7a1e9cd9cc62 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/action.c +++ b/drivers/net/ethernet/netronome/nfp/flower/action.c @@ -450,12 +450,12 @@ nfp_fl_set_ip4(const struct tc_action *action, int idx, u32 off, } static void -nfp_fl_set_ip6_helper(int opcode_tag, int idx, __be32 exact, __be32 mask, +nfp_fl_set_ip6_helper(int opcode_tag, u8 word, __be32 exact, __be32 mask, struct nfp_fl_set_ipv6_addr *ip6) { - ip6->ipv6[idx % 4].mask |= mask; - ip6->ipv6[idx % 4].exact &= ~mask; - ip6->ipv6[idx % 4].exact |= exact & mask; + ip6->ipv6[word].mask |= mask; + ip6->ipv6[word].exact &= ~mask; + ip6->ipv6[word].exact |= exact & mask; ip6->reserved = cpu_to_be16(0); ip6->head.jump_id = opcode_tag; @@ -468,6 +468,7 @@ nfp_fl_set_ip6(const struct tc_action *action, int idx, u32 off, struct nfp_fl_set_ipv6_addr *ip_src) { __be32 exact, mask; + u8 word; /* We are expecting tcf_pedit to return a big endian value */ mask = (__force __be32)~tcf_pedit_mask(action, idx); @@ -476,17 +477,20 @@ nfp_fl_set_ip6(const struct tc_action *action, int idx, u32 off, if (exact & ~mask) return -EOPNOTSUPP; - if (off < offsetof(struct ipv6hdr, saddr)) + if (off < offsetof(struct ipv6hdr, saddr)) { return -EOPNOTSUPP; - else if (off < offsetof(struct ipv6hdr, daddr)) - nfp_fl_set_ip6_helper(NFP_FL_ACTION_OPCODE_SET_IPV6_SRC, idx, + } else if (off < offsetof(struct ipv6hdr, daddr)) { + word = (off - offsetof(struct ipv6hdr, saddr)) / sizeof(exact); + nfp_fl_set_ip6_helper(NFP_FL_ACTION_OPCODE_SET_IPV6_SRC, word, exact, mask, ip_src); - else if (off < offsetof(struct ipv6hdr, daddr) + - sizeof(struct in6_addr)) - nfp_fl_set_ip6_helper(NFP_FL_ACTION_OPCODE_SET_IPV6_DST, idx, + } else if (off < offsetof(struct ipv6hdr, daddr) + + sizeof(struct in6_addr)) { + word = (off - offsetof(struct ipv6hdr, daddr)) / sizeof(exact); + nfp_fl_set_ip6_helper(NFP_FL_ACTION_OPCODE_SET_IPV6_DST, word, exact, mask, ip_dst); - else + } else { return -EOPNOTSUPP; + } return 0; } From 0ac1077e3a549bf8d35971613e2be05bdbb41a00 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 16 Oct 2018 15:52:02 +0800 Subject: [PATCH 30/49] sctp: get pr_assoc and pr_stream all status with SCTP_PR_SCTP_ALL instead According to rfc7496 section 4.3 or 4.4: sprstat_policy: This parameter indicates for which PR-SCTP policy the user wants the information. It is an error to use SCTP_PR_SCTP_NONE in sprstat_policy. If SCTP_PR_SCTP_ALL is used, the counters provided are aggregated over all supported policies. We change to dump pr_assoc and pr_stream all status by SCTP_PR_SCTP_ALL instead, and return error for SCTP_PR_SCTP_NONE, as it also said "It is an error to use SCTP_PR_SCTP_NONE in sprstat_policy. " Fixes: 826d253d57b1 ("sctp: add SCTP_PR_ASSOC_STATUS on sctp sockopt") Fixes: d229d48d183f ("sctp: add SCTP_PR_STREAM_STATUS sockopt for prsctp") Reported-by: Ying Xu Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/uapi/linux/sctp.h | 1 + net/sctp/socket.c | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index b479db5c71d9..34dd3d497f2c 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -301,6 +301,7 @@ enum sctp_sinfo_flags { SCTP_SACK_IMMEDIATELY = (1 << 3), /* SACK should be sent without delay. */ /* 2 bits here have been used by SCTP_PR_SCTP_MASK */ SCTP_SENDALL = (1 << 6), + SCTP_PR_SCTP_ALL = (1 << 7), SCTP_NOTIFICATION = MSG_NOTIFICATION, /* Next message is not user msg but notification. */ SCTP_EOF = MSG_FIN, /* Initiate graceful shutdown process. */ }; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index f73e9d38d5ba..e25a20fc629a 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -7100,14 +7100,14 @@ static int sctp_getsockopt_pr_assocstatus(struct sock *sk, int len, } policy = params.sprstat_policy; - if (policy & ~SCTP_PR_SCTP_MASK) + if (!policy || (policy & ~(SCTP_PR_SCTP_MASK | SCTP_PR_SCTP_ALL))) goto out; asoc = sctp_id2assoc(sk, params.sprstat_assoc_id); if (!asoc) goto out; - if (policy == SCTP_PR_SCTP_NONE) { + if (policy & SCTP_PR_SCTP_ALL) { params.sprstat_abandoned_unsent = 0; params.sprstat_abandoned_sent = 0; for (policy = 0; policy <= SCTP_PR_INDEX(MAX); policy++) { @@ -7159,7 +7159,7 @@ static int sctp_getsockopt_pr_streamstatus(struct sock *sk, int len, } policy = params.sprstat_policy; - if (policy & ~SCTP_PR_SCTP_MASK) + if (!policy || (policy & ~(SCTP_PR_SCTP_MASK | SCTP_PR_SCTP_ALL))) goto out; asoc = sctp_id2assoc(sk, params.sprstat_assoc_id); @@ -7175,7 +7175,7 @@ static int sctp_getsockopt_pr_streamstatus(struct sock *sk, int len, goto out; } - if (policy == SCTP_PR_SCTP_NONE) { + if (policy == SCTP_PR_SCTP_ALL) { params.sprstat_abandoned_unsent = 0; params.sprstat_abandoned_sent = 0; for (policy = 0; policy <= SCTP_PR_INDEX(MAX); policy++) { From 7463e4f9b99c089cc962033b46349ff29f466e40 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Fri, 12 Oct 2018 23:53:58 +0200 Subject: [PATCH 31/49] geneve, vxlan: Don't check skb_dst() twice Commit f15ca723c1eb ("net: don't call update_pmtu unconditionally") avoids that we try updating PMTU for a non-existent destination, but didn't clean up cases where the check was already explicit. Drop those redundant checks. Signed-off-by: Stefano Brivio Reviewed-by: Sabrina Dubroca Signed-off-by: David S. Miller --- drivers/net/geneve.c | 15 ++++----------- drivers/net/vxlan.c | 12 ++---------- 2 files changed, 6 insertions(+), 21 deletions(-) diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 6acb6b5718b9..61c4bfbeb41c 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -830,12 +830,8 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, if (IS_ERR(rt)) return PTR_ERR(rt); - if (skb_dst(skb)) { - int mtu = dst_mtu(&rt->dst) - GENEVE_IPV4_HLEN - - info->options_len; - - skb_dst_update_pmtu(skb, mtu); - } + skb_dst_update_pmtu(skb, dst_mtu(&rt->dst) - + GENEVE_IPV4_HLEN - info->options_len); sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true); if (geneve->collect_md) { @@ -876,11 +872,8 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, if (IS_ERR(dst)) return PTR_ERR(dst); - if (skb_dst(skb)) { - int mtu = dst_mtu(dst) - GENEVE_IPV6_HLEN - info->options_len; - - skb_dst_update_pmtu(skb, mtu); - } + skb_dst_update_pmtu(skb, dst_mtu(dst) - + GENEVE_IPV6_HLEN - info->options_len); sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true); if (geneve->collect_md) { diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 2b8da2b7e721..22e0ce592e07 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -2194,11 +2194,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, } ndst = &rt->dst; - if (skb_dst(skb)) { - int mtu = dst_mtu(ndst) - VXLAN_HEADROOM; - - skb_dst_update_pmtu(skb, mtu); - } + skb_dst_update_pmtu(skb, dst_mtu(ndst) - VXLAN_HEADROOM); tos = ip_tunnel_ecn_encap(tos, old_iph, skb); ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); @@ -2235,11 +2231,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, goto out_unlock; } - if (skb_dst(skb)) { - int mtu = dst_mtu(ndst) - VXLAN6_HEADROOM; - - skb_dst_update_pmtu(skb, mtu); - } + skb_dst_update_pmtu(skb, dst_mtu(ndst) - VXLAN6_HEADROOM); tos = ip_tunnel_ecn_encap(tos, old_iph, skb); ttl = ttl ? : ip6_dst_hoplimit(ndst); From 6b4f92af3d59e882d3ba04c44a815266890d188f Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Fri, 12 Oct 2018 23:53:59 +0200 Subject: [PATCH 32/49] geneve, vxlan: Don't set exceptions if skb->len < mtu We shouldn't abuse exceptions: if the destination MTU is already higher than what we're transmitting, no exception should be created. Fixes: 52a589d51f10 ("geneve: update skb dst pmtu on tx path") Fixes: a93bf0ff4490 ("vxlan: update skb dst pmtu on tx path") Signed-off-by: Stefano Brivio Reviewed-by: Sabrina Dubroca Signed-off-by: David S. Miller --- drivers/net/geneve.c | 7 +++---- drivers/net/vxlan.c | 4 ++-- include/net/dst.h | 10 ++++++++++ 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 61c4bfbeb41c..493cd382b8aa 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -830,8 +830,8 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, if (IS_ERR(rt)) return PTR_ERR(rt); - skb_dst_update_pmtu(skb, dst_mtu(&rt->dst) - - GENEVE_IPV4_HLEN - info->options_len); + skb_tunnel_check_pmtu(skb, &rt->dst, + GENEVE_IPV4_HLEN + info->options_len); sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true); if (geneve->collect_md) { @@ -872,8 +872,7 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, if (IS_ERR(dst)) return PTR_ERR(dst); - skb_dst_update_pmtu(skb, dst_mtu(dst) - - GENEVE_IPV6_HLEN - info->options_len); + skb_tunnel_check_pmtu(skb, dst, GENEVE_IPV6_HLEN + info->options_len); sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true); if (geneve->collect_md) { diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 22e0ce592e07..27bd586b94b0 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -2194,7 +2194,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, } ndst = &rt->dst; - skb_dst_update_pmtu(skb, dst_mtu(ndst) - VXLAN_HEADROOM); + skb_tunnel_check_pmtu(skb, ndst, VXLAN_HEADROOM); tos = ip_tunnel_ecn_encap(tos, old_iph, skb); ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); @@ -2231,7 +2231,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, goto out_unlock; } - skb_dst_update_pmtu(skb, dst_mtu(ndst) - VXLAN6_HEADROOM); + skb_tunnel_check_pmtu(skb, ndst, VXLAN6_HEADROOM); tos = ip_tunnel_ecn_encap(tos, old_iph, skb); ttl = ttl ? : ip6_dst_hoplimit(ndst); diff --git a/include/net/dst.h b/include/net/dst.h index 7f735e76ca73..6cf0870414c7 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -527,4 +527,14 @@ static inline void skb_dst_update_pmtu(struct sk_buff *skb, u32 mtu) dst->ops->update_pmtu(dst, NULL, skb, mtu); } +static inline void skb_tunnel_check_pmtu(struct sk_buff *skb, + struct dst_entry *encap_dst, + int headroom) +{ + u32 encap_mtu = dst_mtu(encap_dst); + + if (skb->len > encap_mtu - headroom) + skb_dst_update_pmtu(skb, encap_mtu - headroom); +} + #endif /* _NET_DST_H */ From 8c3bf9b62b667456a57aefcf1689e826df146159 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Fri, 12 Oct 2018 19:14:58 -0700 Subject: [PATCH 33/49] net: qla3xxx: Remove overflowing shift statement Clang currently warns: drivers/net/ethernet/qlogic/qla3xxx.c:384:24: warning: signed shift result (0xF00000000) requires 37 bits to represent, but 'int' only has 32 bits [-Wshift-overflow] ((ISP_NVRAM_MASK << 16) | qdev->eeprom_cmd_data)); ~~~~~~~~~~~~~~ ^ ~~ 1 warning generated. The warning is certainly accurate since ISP_NVRAM_MASK is defined as (0x000F << 16) which is then shifted by 16, resulting in 64424509440, well above UINT_MAX. Given that this is the only location in this driver where ISP_NVRAM_MASK is shifted again, it seems likely that ISP_NVRAM_MASK was originally defined without a shift and during the move of the shift to the definition, this statement wasn't properly removed (since ISP_NVRAM_MASK is used in the statenent right above this). Only the maintainers can confirm this since this statment has been here since the driver was first added to the kernel. Link: https://github.com/ClangBuiltLinux/linux/issues/127 Signed-off-by: Nathan Chancellor Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qla3xxx.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qla3xxx.c b/drivers/net/ethernet/qlogic/qla3xxx.c index b48f76182049..10b075bc5959 100644 --- a/drivers/net/ethernet/qlogic/qla3xxx.c +++ b/drivers/net/ethernet/qlogic/qla3xxx.c @@ -380,8 +380,6 @@ static void fm93c56a_select(struct ql3_adapter *qdev) qdev->eeprom_cmd_data = AUBURN_EEPROM_CS_1; ql_write_nvram_reg(qdev, spir, ISP_NVRAM_MASK | qdev->eeprom_cmd_data); - ql_write_nvram_reg(qdev, spir, - ((ISP_NVRAM_MASK << 16) | qdev->eeprom_cmd_data)); } /* From efa61c8cf2950ab5c0e66cff3cabe2a2b24e81ba Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 16 Oct 2018 15:06:41 +0200 Subject: [PATCH 34/49] ptp: fix Spectre v1 vulnerability pin_index can be indirectly controlled by user-space, hence leading to a potential exploitation of the Spectre variant 1 vulnerability. This issue was detected with the help of Smatch: drivers/ptp/ptp_chardev.c:253 ptp_ioctl() warn: potential spectre issue 'ops->pin_config' [r] (local cap) Fix this by sanitizing pin_index before using it to index ops->pin_config, and before passing it as an argument to function ptp_set_pinfunc(), in which it is used to index info->pin_config. Notice that given that speculation windows are large, the policy is to kill the speculation on the first load and not worry if it can be completed with a dependent load/store [1]. [1] https://marc.info/?l=linux-kernel&m=152449131114778&w=2 Cc: stable@vger.kernel.org Signed-off-by: Gustavo A. R. Silva Acked-by: Richard Cochran Signed-off-by: David S. Miller --- drivers/ptp/ptp_chardev.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c index 01b0e2bb3319..2012551d93e0 100644 --- a/drivers/ptp/ptp_chardev.c +++ b/drivers/ptp/ptp_chardev.c @@ -24,6 +24,8 @@ #include #include +#include + #include "ptp_private.h" static int ptp_disable_pinfunc(struct ptp_clock_info *ops, @@ -248,6 +250,7 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg) err = -EINVAL; break; } + pin_index = array_index_nospec(pin_index, ops->n_pins); if (mutex_lock_interruptible(&ptp->pincfg_mux)) return -ERESTARTSYS; pd = ops->pin_config[pin_index]; @@ -266,6 +269,7 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg) err = -EINVAL; break; } + pin_index = array_index_nospec(pin_index, ops->n_pins); if (mutex_lock_interruptible(&ptp->pincfg_mux)) return -ERESTARTSYS; err = ptp_set_pinfunc(ptp, pin_index, pd.func, pd.chan); From 84258438e8ce12d6888b68a1238bba9cb25307e2 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Wed, 17 Oct 2018 00:35:10 +0900 Subject: [PATCH 35/49] net: bpfilter: use get_pid_task instead of pid_task pid_task() dereferences rcu protected tasks array. But there is no rcu_read_lock() in shutdown_umh() routine so that rcu_read_lock() is needed. get_pid_task() is wrapper function of pid_task. it holds rcu_read_lock() then calls pid_task(). if task isn't NULL, it increases reference count of task. test commands: %modprobe bpfilter %modprobe -rv bpfilter splat looks like: [15102.030932] ============================= [15102.030957] WARNING: suspicious RCU usage [15102.030985] 4.19.0-rc7+ #21 Not tainted [15102.031010] ----------------------------- [15102.031038] kernel/pid.c:330 suspicious rcu_dereference_check() usage! [15102.031063] other info that might help us debug this: [15102.031332] rcu_scheduler_active = 2, debug_locks = 1 [15102.031363] 1 lock held by modprobe/1570: [15102.031389] #0: 00000000580ef2b0 (bpfilter_lock){+.+.}, at: stop_umh+0x13/0x52 [bpfilter] [15102.031552] stack backtrace: [15102.031583] CPU: 1 PID: 1570 Comm: modprobe Not tainted 4.19.0-rc7+ #21 [15102.031607] Hardware name: To be filled by O.E.M. To be filled by O.E.M./Aptio CRB, BIOS 5.6.5 07/08/2015 [15102.031628] Call Trace: [15102.031676] dump_stack+0xc9/0x16b [15102.031723] ? show_regs_print_info+0x5/0x5 [15102.031801] ? lockdep_rcu_suspicious+0x117/0x160 [15102.031855] pid_task+0x134/0x160 [15102.031900] ? find_vpid+0xf0/0xf0 [15102.032017] shutdown_umh.constprop.1+0x1e/0x53 [bpfilter] [15102.032055] stop_umh+0x46/0x52 [bpfilter] [15102.032092] __x64_sys_delete_module+0x47e/0x570 [ ... ] Fixes: d2ba09c17a06 ("net: add skeleton of bpfilter kernel module") Signed-off-by: Taehee Yoo Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/bpfilter/bpfilter_kern.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c index b64e1649993b..94e88f510c5b 100644 --- a/net/bpfilter/bpfilter_kern.c +++ b/net/bpfilter/bpfilter_kern.c @@ -23,9 +23,11 @@ static void shutdown_umh(struct umh_info *info) if (!info->pid) return; - tsk = pid_task(find_vpid(info->pid), PIDTYPE_PID); - if (tsk) + tsk = get_pid_task(find_vpid(info->pid), PIDTYPE_PID); + if (tsk) { force_sig(SIGKILL, tsk); + put_task_struct(tsk); + } fput(info->pipe_to_umh); fput(info->pipe_from_umh); info->pid = 0; From 9675931e6b65d160d16bcc9472c1acef15524def Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Tue, 16 Oct 2018 19:35:17 +0200 Subject: [PATCH 36/49] r8169: re-enable MSI-X on RTL8168g Similar to d49c88d7677b ("r8169: Enable MSI-X on RTL8106e") after e9d0ba506ea8 ("PCI: Reprogram bridge prefetch registers on resume") we can safely assume that this also fixes the root cause of the issue worked around by 7c53a722459c ("r8169: don't use MSI-X on RTL8168g"). So let's revert it. Fixes: 7c53a722459c ("r8169: don't use MSI-X on RTL8168g") Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c index f4df367fb894..28184b984a44 100644 --- a/drivers/net/ethernet/realtek/r8169.c +++ b/drivers/net/ethernet/realtek/r8169.c @@ -7098,11 +7098,6 @@ static int rtl_alloc_irq(struct rtl8169_private *tp) RTL_W8(tp, Config2, RTL_R8(tp, Config2) & ~MSIEnable); RTL_W8(tp, Cfg9346, Cfg9346_Lock); flags = PCI_IRQ_LEGACY; - } else if (tp->mac_version == RTL_GIGA_MAC_VER_40) { - /* This version was reported to have issues with resume - * from suspend when using MSI-X - */ - flags = PCI_IRQ_LEGACY | PCI_IRQ_MSI; } else { flags = PCI_IRQ_ALL_TYPES; } From b336decab22158937975293aea79396525f92bb3 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 16 Oct 2018 15:18:17 -0300 Subject: [PATCH 37/49] sctp: fix race on sctp_id2asoc syzbot reported an use-after-free involving sctp_id2asoc. Dmitry Vyukov helped to root cause it and it is because of reading the asoc after it was freed: CPU 1 CPU 2 (working on socket 1) (working on socket 2) sctp_association_destroy sctp_id2asoc spin lock grab the asoc from idr spin unlock spin lock remove asoc from idr spin unlock free(asoc) if asoc->base.sk != sk ... [*] This can only be hit if trying to fetch asocs from different sockets. As we have a single IDR for all asocs, in all SCTP sockets, their id is unique on the system. An application can try to send stuff on an id that matches on another socket, and the if in [*] will protect from such usage. But it didn't consider that as that asoc may belong to another socket, it may be freed in parallel (read: under another socket lock). We fix it by moving the checks in [*] into the protected region. This fixes it because the asoc cannot be freed while the lock is held. Reported-by: syzbot+c7dd55d7aec49d48e49a@syzkaller.appspotmail.com Acked-by: Dmitry Vyukov Signed-off-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/socket.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index e25a20fc629a..bba877a0205b 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -271,11 +271,10 @@ struct sctp_association *sctp_id2assoc(struct sock *sk, sctp_assoc_t id) spin_lock_bh(&sctp_assocs_id_lock); asoc = (struct sctp_association *)idr_find(&sctp_assocs_id, (int)id); + if (asoc && (asoc->base.sk != sk || asoc->base.dead)) + asoc = NULL; spin_unlock_bh(&sctp_assocs_id_lock); - if (!asoc || (asoc->base.sk != sk) || asoc->base.dead) - return NULL; - return asoc; } From c863850ce22e1b0bb365d49cadf51f4765153ae4 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 17 Oct 2018 03:06:12 +0800 Subject: [PATCH 38/49] sctp: not free the new asoc when sctp_wait_for_connect returns err When sctp_wait_for_connect is called to wait for connect ready for sp->strm_interleave in sctp_sendmsg_to_asoc, a panic could be triggered if cpu is scheduled out and the new asoc is freed elsewhere, as it will return err and later the asoc gets freed again in sctp_sendmsg. [ 285.840764] list_del corruption, ffff9f0f7b284078->next is LIST_POISON1 (dead000000000100) [ 285.843590] WARNING: CPU: 1 PID: 8861 at lib/list_debug.c:47 __list_del_entry_valid+0x50/0xa0 [ 285.846193] Kernel panic - not syncing: panic_on_warn set ... [ 285.846193] [ 285.848206] CPU: 1 PID: 8861 Comm: sctp_ndata Kdump: loaded Not tainted 4.19.0-rc7.label #584 [ 285.850559] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 [ 285.852164] Call Trace: ... [ 285.872210] ? __list_del_entry_valid+0x50/0xa0 [ 285.872894] sctp_association_free+0x42/0x2d0 [sctp] [ 285.873612] sctp_sendmsg+0x5a4/0x6b0 [sctp] [ 285.874236] sock_sendmsg+0x30/0x40 [ 285.874741] ___sys_sendmsg+0x27a/0x290 [ 285.875304] ? __switch_to_asm+0x34/0x70 [ 285.875872] ? __switch_to_asm+0x40/0x70 [ 285.876438] ? ptep_set_access_flags+0x2a/0x30 [ 285.877083] ? do_wp_page+0x151/0x540 [ 285.877614] __sys_sendmsg+0x58/0xa0 [ 285.878138] do_syscall_64+0x55/0x180 [ 285.878669] entry_SYSCALL_64_after_hwframe+0x44/0xa9 This is a similar issue with the one fixed in Commit ca3af4dd28cf ("sctp: do not free asoc when it is already dead in sctp_sendmsg"). But this one can't be fixed by returning -ESRCH for the dead asoc in sctp_wait_for_connect, as it will break sctp_connect's return value to users. This patch is to simply set err to -ESRCH before it returns to sctp_sendmsg when any err is returned by sctp_wait_for_connect for sp->strm_interleave, so that no asoc would be freed due to this. When users see this error, they will know the packet hasn't been sent. And it also makes sense to not free asoc because waiting connect fails, like the second call for sctp_wait_for_connect in sctp_sendmsg_to_asoc. Fixes: 668c9beb9020 ("sctp: implement assign_number for sctp_stream_interleave") Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/socket.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index bba877a0205b..c1c1bda334a4 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1945,8 +1945,10 @@ static int sctp_sendmsg_to_asoc(struct sctp_association *asoc, if (sp->strm_interleave) { timeo = sock_sndtimeo(sk, 0); err = sctp_wait_for_connect(asoc, &timeo); - if (err) + if (err) { + err = -ESRCH; goto err; + } } else { wait_connect = true; } From 9b3bc7db759e64c33471025721817467f8c3ecd4 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 17 Oct 2018 08:05:45 +0000 Subject: [PATCH 39/49] mlxsw: core: Fix use-after-free when flashing firmware during init When the switch driver (e.g., mlxsw_spectrum) determines it needs to flash a new firmware version it resets the ASIC after the flashing process. The bus driver (e.g., mlxsw_pci) then registers itself again with mlxsw_core which means (among other things) that the device registers itself again with the hwmon subsystem again. Since the device was registered with the hwmon subsystem using devm_hwmon_device_register_with_groups(), then the old hwmon device (registered before the flashing) was never unregistered and was referencing stale data, resulting in a use-after free. Fix by removing reliance on device managed APIs in mlxsw_hwmon_init(). Fixes: c86d62cc410c ("mlxsw: spectrum: Reset FW after flash") Signed-off-by: Ido Schimmel Reported-by: Alexander Petrovskiy Tested-by: Alexander Petrovskiy Reviewed-by: Petr Machata Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/core.c | 2 ++ drivers/net/ethernet/mellanox/mlxsw/core.h | 4 ++++ .../net/ethernet/mellanox/mlxsw/core_hwmon.c | 17 +++++++++++------ 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c index 81533d7f395c..937d0ace699a 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core.c +++ b/drivers/net/ethernet/mellanox/mlxsw/core.c @@ -1055,6 +1055,7 @@ int mlxsw_core_bus_device_register(const struct mlxsw_bus_info *mlxsw_bus_info, err_driver_init: mlxsw_thermal_fini(mlxsw_core->thermal); err_thermal_init: + mlxsw_hwmon_fini(mlxsw_core->hwmon); err_hwmon_init: if (!reload) devlink_unregister(devlink); @@ -1088,6 +1089,7 @@ void mlxsw_core_bus_device_unregister(struct mlxsw_core *mlxsw_core, if (mlxsw_core->driver->fini) mlxsw_core->driver->fini(mlxsw_core); mlxsw_thermal_fini(mlxsw_core->thermal); + mlxsw_hwmon_fini(mlxsw_core->hwmon); if (!reload) devlink_unregister(devlink); mlxsw_emad_fini(mlxsw_core); diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.h b/drivers/net/ethernet/mellanox/mlxsw/core.h index 655ddd204ab2..c35be477856f 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core.h +++ b/drivers/net/ethernet/mellanox/mlxsw/core.h @@ -359,6 +359,10 @@ static inline int mlxsw_hwmon_init(struct mlxsw_core *mlxsw_core, return 0; } +static inline void mlxsw_hwmon_fini(struct mlxsw_hwmon *mlxsw_hwmon) +{ +} + #endif struct mlxsw_thermal; diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_hwmon.c b/drivers/net/ethernet/mellanox/mlxsw/core_hwmon.c index f6cf2896d337..e04e8162aa14 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core_hwmon.c +++ b/drivers/net/ethernet/mellanox/mlxsw/core_hwmon.c @@ -303,8 +303,7 @@ int mlxsw_hwmon_init(struct mlxsw_core *mlxsw_core, struct device *hwmon_dev; int err; - mlxsw_hwmon = devm_kzalloc(mlxsw_bus_info->dev, sizeof(*mlxsw_hwmon), - GFP_KERNEL); + mlxsw_hwmon = kzalloc(sizeof(*mlxsw_hwmon), GFP_KERNEL); if (!mlxsw_hwmon) return -ENOMEM; mlxsw_hwmon->core = mlxsw_core; @@ -321,10 +320,9 @@ int mlxsw_hwmon_init(struct mlxsw_core *mlxsw_core, mlxsw_hwmon->groups[0] = &mlxsw_hwmon->group; mlxsw_hwmon->group.attrs = mlxsw_hwmon->attrs; - hwmon_dev = devm_hwmon_device_register_with_groups(mlxsw_bus_info->dev, - "mlxsw", - mlxsw_hwmon, - mlxsw_hwmon->groups); + hwmon_dev = hwmon_device_register_with_groups(mlxsw_bus_info->dev, + "mlxsw", mlxsw_hwmon, + mlxsw_hwmon->groups); if (IS_ERR(hwmon_dev)) { err = PTR_ERR(hwmon_dev); goto err_hwmon_register; @@ -337,5 +335,12 @@ int mlxsw_hwmon_init(struct mlxsw_core *mlxsw_core, err_hwmon_register: err_fans_init: err_temp_init: + kfree(mlxsw_hwmon); return err; } + +void mlxsw_hwmon_fini(struct mlxsw_hwmon *mlxsw_hwmon) +{ + hwmon_device_unregister(mlxsw_hwmon->hwmon_dev); + kfree(mlxsw_hwmon); +} From 84dad55951b0d009372ec21760b650634246e144 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 17 Oct 2018 11:44:04 +0200 Subject: [PATCH 40/49] udp6: fix encap return code for resubmitting The commit eb63f2964dbe ("udp6: add missing checks on edumux packet processing") used the same return code convention of the ipv4 counterpart, but ipv6 uses the opposite one: positive values means resubmit. This change addresses the issue, using positive return value for resubmitting. Also update the related comment, which was broken, too. Fixes: eb63f2964dbe ("udp6: add missing checks on edumux packet processing") Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/ipv6/udp.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 28c4aa5078fc..b36694b6716e 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -766,11 +766,9 @@ static int udp6_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb, ret = udpv6_queue_rcv_skb(sk, skb); - /* a return value > 0 means to resubmit the input, but - * it wants the return to be -protocol, or 0 - */ + /* a return value > 0 means to resubmit the input */ if (ret > 0) - return -ret; + return ret; return 0; } From 05c998b738fdd3e5d6a257bcacc8f34b6284d795 Mon Sep 17 00:00:00 2001 From: Ake Koomsin Date: Wed, 17 Oct 2018 19:44:12 +0900 Subject: [PATCH 41/49] virtio_net: avoid using netif_tx_disable() for serializing tx routine Commit 713a98d90c5e ("virtio-net: serialize tx routine during reset") introduces netif_tx_disable() after netif_device_detach() in order to avoid use-after-free of tx queues. However, there are two issues. 1) Its operation is redundant with netif_device_detach() in case the interface is running. 2) In case of the interface is not running before suspending and resuming, the tx does not get resumed by netif_device_attach(). This results in losing network connectivity. It is better to use netif_tx_lock_bh()/netif_tx_unlock_bh() instead for serializing tx routine during reset. This also preserves the symmetry of netif_device_detach() and netif_device_attach(). Fixes commit 713a98d90c5e ("virtio-net: serialize tx routine during reset") Signed-off-by: Ake Koomsin Acked-by: Jason Wang Acked-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index dab504ec5e50..ddfa3f24204c 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2218,8 +2218,9 @@ static void virtnet_freeze_down(struct virtio_device *vdev) /* Make sure no work handler is accessing the device */ flush_work(&vi->config_work); + netif_tx_lock_bh(vi->dev); netif_device_detach(vi->dev); - netif_tx_disable(vi->dev); + netif_tx_unlock_bh(vi->dev); cancel_delayed_work_sync(&vi->refill); if (netif_running(vi->dev)) { @@ -2255,7 +2256,9 @@ static int virtnet_restore_up(struct virtio_device *vdev) } } + netif_tx_lock_bh(vi->dev); netif_device_attach(vi->dev); + netif_tx_unlock_bh(vi->dev); return err; } From 5660b9d9d6a29c2c3cc12f62ae44bfb56b0a15a9 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 17 Oct 2018 21:11:27 +0800 Subject: [PATCH 42/49] sctp: fix the data size calculation in sctp_data_size sctp data size should be calculated by subtracting data chunk header's length from chunk_hdr->length, not just data header. Fixes: 668c9beb9020 ("sctp: implement assign_number for sctp_stream_interleave") Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/sm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h index 5ef1bad81ef5..9e3d32746430 100644 --- a/include/net/sctp/sm.h +++ b/include/net/sctp/sm.h @@ -347,7 +347,7 @@ static inline __u16 sctp_data_size(struct sctp_chunk *chunk) __u16 size; size = ntohs(chunk->chunk_hdr->length); - size -= sctp_datahdr_len(&chunk->asoc->stream); + size -= sctp_datachk_len(&chunk->asoc->stream); return size; } From 06a36ecb5d0ee4b379845a5687f83084d3187521 Mon Sep 17 00:00:00 2001 From: Gregory CLEMENT Date: Wed, 17 Oct 2018 17:26:35 +0200 Subject: [PATCH 43/49] net: mscc: ocelot: Fix comment in ocelot_vlant_wait_for_completion() The ocelot_vlant_wait_for_completion() function is very similar to the ocelot_mact_wait_for_completion(). It seemed to have be copied but the comment was not updated, so let's fix it. Signed-off-by: Gregory CLEMENT Signed-off-by: David S. Miller --- drivers/net/ethernet/mscc/ocelot.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index 1a4f2bb48ead..ed4e298cd823 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -133,9 +133,9 @@ static inline int ocelot_vlant_wait_for_completion(struct ocelot *ocelot) { unsigned int val, timeout = 10; - /* Wait for the issued mac table command to be completed, or timeout. - * When the command read from ANA_TABLES_MACACCESS is - * MACACCESS_CMD_IDLE, the issued command completed successfully. + /* Wait for the issued vlan table command to be completed, or timeout. + * When the command read from ANA_TABLES_VLANACCESS is + * VLANACCESS_CMD_IDLE, the issued command completed successfully. */ do { val = ocelot_read(ocelot, ANA_TABLES_VLANACCESS); From eddf016b910486d2123675a6b5fd7d64f77cdca8 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 17 Oct 2018 22:34:34 +0300 Subject: [PATCH 44/49] net: ipmr: fix unresolved entry dumps If the skb space ends in an unresolved entry while dumping we'll miss some unresolved entries. The reason is due to zeroing the entry counter between dumping resolved and unresolved mfc entries. We should just keep counting until the whole table is dumped and zero when we move to the next as we have a separate table counter. Reported-by: Colin Ian King Fixes: 8fb472c09b9d ("ipmr: improve hash scalability") Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/ipv4/ipmr_base.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c index 1ad9aa62a97b..eab8cd5ec2f5 100644 --- a/net/ipv4/ipmr_base.c +++ b/net/ipv4/ipmr_base.c @@ -296,8 +296,6 @@ int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb, next_entry: e++; } - e = 0; - s_e = 0; spin_lock_bh(lock); list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) { From 6b839b6cf9eada30b086effb51e5d6076bafc761 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 18 Oct 2018 19:56:01 +0200 Subject: [PATCH 45/49] r8169: fix NAPI handling under high load rtl_rx() and rtl_tx() are called only if the respective bits are set in the interrupt status register. Under high load NAPI may not be able to process all data (work_done == budget) and it will schedule subsequent calls to the poll callback. rtl_ack_events() however resets the bits in the interrupt status register, therefore subsequent calls to rtl8169_poll() won't call rtl_rx() and rtl_tx() - chip interrupts are still disabled. Fix this by calling rtl_rx() and rtl_tx() independent of the bits set in the interrupt status register. Both functions will detect if there's nothing to do for them. Fixes: da78dbff2e05 ("r8169: remove work from irq handler.") Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c index 28184b984a44..2c350099b83c 100644 --- a/drivers/net/ethernet/realtek/r8169.c +++ b/drivers/net/ethernet/realtek/r8169.c @@ -6549,17 +6549,15 @@ static int rtl8169_poll(struct napi_struct *napi, int budget) struct rtl8169_private *tp = container_of(napi, struct rtl8169_private, napi); struct net_device *dev = tp->dev; u16 enable_mask = RTL_EVENT_NAPI | tp->event_slow; - int work_done= 0; + int work_done; u16 status; status = rtl_get_events(tp); rtl_ack_events(tp, status & ~tp->event_slow); - if (status & RTL_EVENT_NAPI_RX) - work_done = rtl_rx(dev, tp, (u32) budget); + work_done = rtl_rx(dev, tp, (u32) budget); - if (status & RTL_EVENT_NAPI_TX) - rtl_tx(dev, tp); + rtl_tx(dev, tp); if (status & tp->event_slow) { enable_mask &= ~tp->event_slow; From 3c53ed8fef6881a864f0ee8240ed2793ef73ad0d Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Thu, 18 Oct 2018 10:34:26 +0200 Subject: [PATCH 46/49] net: sched: Fix for duplicate class dump When dumping classes by parent, kernel would return classes twice: | # tc qdisc add dev lo root prio | # tc class show dev lo | class prio 8001:1 parent 8001: | class prio 8001:2 parent 8001: | class prio 8001:3 parent 8001: | # tc class show dev lo parent 8001: | class prio 8001:1 parent 8001: | class prio 8001:2 parent 8001: | class prio 8001:3 parent 8001: | class prio 8001:1 parent 8001: | class prio 8001:2 parent 8001: | class prio 8001:3 parent 8001: This comes from qdisc_match_from_root() potentially returning the root qdisc itself if its handle matched. Though in that case, root's classes were already dumped a few lines above. Fixes: cb395b2010879 ("net: sched: optimize class dumps") Signed-off-by: Phil Sutter Reviewed-by: Jiri Pirko Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_api.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 6684641ea344..3dc0acf54245 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -2059,7 +2059,8 @@ static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb, if (tcm->tcm_parent) { q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent)); - if (q && tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0) + if (q && q != root && + tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0) return -1; return 0; } From b6168562c8ce2bd5a30e213021650422e08764dc Mon Sep 17 00:00:00 2001 From: Wenwen Wang Date: Thu, 18 Oct 2018 09:36:46 -0500 Subject: [PATCH 47/49] net: socket: fix a missing-check bug In ethtool_ioctl(), the ioctl command 'ethcmd' is checked through a switch statement to see whether it is necessary to pre-process the ethtool structure, because, as mentioned in the comment, the structure ethtool_rxnfc is defined with padding. If yes, a user-space buffer 'rxnfc' is allocated through compat_alloc_user_space(). One thing to note here is that, if 'ethcmd' is ETHTOOL_GRXCLSRLALL, the size of the buffer 'rxnfc' is partially determined by 'rule_cnt', which is actually acquired from the user-space buffer 'compat_rxnfc', i.e., 'compat_rxnfc->rule_cnt', through get_user(). After 'rxnfc' is allocated, the data in the original user-space buffer 'compat_rxnfc' is then copied to 'rxnfc' through copy_in_user(), including the 'rule_cnt' field. However, after this copy, no check is re-enforced on 'rxnfc->rule_cnt'. So it is possible that a malicious user race to change the value in the 'compat_rxnfc->rule_cnt' between these two copies. Through this way, the attacker can bypass the previous check on 'rule_cnt' and inject malicious data. This can cause undefined behavior of the kernel and introduce potential security risk. This patch avoids the above issue via copying the value acquired by get_user() to 'rxnfc->rule_cn', if 'ethcmd' is ETHTOOL_GRXCLSRLALL. Signed-off-by: Wenwen Wang Signed-off-by: David S. Miller --- net/socket.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/net/socket.c b/net/socket.c index 01f3f8f32d6f..390a8ecef4bf 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2875,9 +2875,14 @@ static int ethtool_ioctl(struct net *net, struct compat_ifreq __user *ifr32) copy_in_user(&rxnfc->fs.ring_cookie, &compat_rxnfc->fs.ring_cookie, (void __user *)(&rxnfc->fs.location + 1) - - (void __user *)&rxnfc->fs.ring_cookie) || - copy_in_user(&rxnfc->rule_cnt, &compat_rxnfc->rule_cnt, - sizeof(rxnfc->rule_cnt))) + (void __user *)&rxnfc->fs.ring_cookie)) + return -EFAULT; + if (ethcmd == ETHTOOL_GRXCLSRLALL) { + if (put_user(rule_cnt, &rxnfc->rule_cnt)) + return -EFAULT; + } else if (copy_in_user(&rxnfc->rule_cnt, + &compat_rxnfc->rule_cnt, + sizeof(rxnfc->rule_cnt))) return -EFAULT; } From b06f9d9f1a907dd03f203e2ce9e27e318c22ba01 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Thu, 18 Oct 2018 17:38:29 +0200 Subject: [PATCH 48/49] tipc: fix info leak from kernel tipc_event We initialize a struct tipc_event allocated on the kernel stack to zero to avert info leak to user space. Reported-by: syzbot+057458894bc8cada4dee@syzkaller.appspotmail.com Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/group.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/tipc/group.c b/net/tipc/group.c index e82f13cb2dc5..06fee142f09f 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -666,6 +666,7 @@ static void tipc_group_create_event(struct tipc_group *grp, struct sk_buff *skb; struct tipc_msg *hdr; + memset(&evt, 0, sizeof(evt)); evt.event = event; evt.found_lower = m->instance; evt.found_upper = m->instance; From d4d576f5ab7edcb757bb33e6a5600666a0b1232d Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Thu, 18 Oct 2018 21:25:07 +0200 Subject: [PATCH 49/49] ip6_tunnel: Fix encapsulation layout Commit 058214a4d1df ("ip6_tun: Add infrastructure for doing encapsulation") added the ip6_tnl_encap() call in ip6_tnl_xmit(), before the call to ipv6_push_frag_opts() to append the IPv6 Tunnel Encapsulation Limit option (option 4, RFC 2473, par. 5.1) to the outer IPv6 header. As long as the option didn't actually end up in generated packets, this wasn't an issue. Then commit 89a23c8b528b ("ip6_tunnel: Fix missing tunnel encapsulation limit option") fixed sending of this option, and the resulting layout, e.g. for FoU, is: .-------------------.------------.----------.-------------------.----- - - | Outer IPv6 Header | UDP header | Option 4 | Inner IPv6 Header | Payload '-------------------'------------'----------'-------------------'----- - - Needless to say, FoU and GUE (at least) won't work over IPv6. The option is appended by default, and I couldn't find a way to disable it with the current iproute2. Turn this into a more reasonable: .-------------------.----------.------------.-------------------.----- - - | Outer IPv6 Header | Option 4 | UDP header | Inner IPv6 Header | Payload '-------------------'----------'------------'-------------------'----- - - With this, and with 84dad55951b0 ("udp6: fix encap return code for resubmitting"), FoU and GUE work again over IPv6. Fixes: 058214a4d1df ("ip6_tun: Add infrastructure for doing encapsulation") Signed-off-by: Stefano Brivio Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index a0b6932c3afd..a9d06d4dd057 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1184,11 +1184,6 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, } skb_dst_set(skb, dst); - if (encap_limit >= 0) { - init_tel_txopt(&opt, encap_limit); - ipv6_push_frag_opts(skb, &opt.ops, &proto); - } - if (hop_limit == 0) { if (skb->protocol == htons(ETH_P_IP)) hop_limit = ip_hdr(skb)->ttl; @@ -1210,6 +1205,11 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, if (err) return err; + if (encap_limit >= 0) { + init_tel_txopt(&opt, encap_limit); + ipv6_push_frag_opts(skb, &opt.ops, &proto); + } + skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); ipv6h = ipv6_hdr(skb);