From 49b23575943c04b6711107cfd08ad2b3ae4e81f5 Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Wed, 3 Jun 2020 21:03:47 +0200 Subject: [PATCH 01/83] bpf: Fix unused-var without NETDEVICES A recent commit added new variables only used if CONFIG_NETDEVICES is set. A simple fix would be to only declare these variables if the same condition is valid but Alexei suggested an even simpler solution: since CONFIG_NETDEVICES doesn't change anything in .h I think the best is to remove #ifdef CONFIG_NETDEVICES from net/core/filter.c and rely on sock_bindtoindex() returning ENOPROTOOPT in the extreme case of oddly configured kernels. Fixes: 70c58997c1e8 ("bpf: Allow SO_BINDTODEVICE opt in bpf_setsockopt") Suggested-by: Alexei Starovoitov Signed-off-by: Matthieu Baerts Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200603190347.2310320-1-matthieu.baerts@tessares.net --- net/core/filter.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index d01a244b5087..90d2eb77002f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4340,8 +4340,6 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, } break; case SO_BINDTODEVICE: - ret = -ENOPROTOOPT; -#ifdef CONFIG_NETDEVICES optlen = min_t(long, optlen, IFNAMSIZ - 1); strncpy(devname, optval, optlen); devname[optlen] = 0; @@ -4360,7 +4358,6 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, dev_put(dev); } ret = sock_bindtoindex(sk, ifindex, false); -#endif break; default: ret = -EINVAL; From e7ed83d6fa1a00d0f2ad0327e73d3ea9e7ea8de1 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 4 Jun 2020 11:54:36 +0300 Subject: [PATCH 02/83] bpf: Fix an error code in check_btf_func() This code returns success if the "info_aux" allocation fails but it should return -ENOMEM. Fixes: 8c1b6e69dcc1 ("bpf: Compare BTF types of functions arguments with actual types") Signed-off-by: Dan Carpenter Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200604085436.GA943001@mwanda --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5c7bbaac81ef..34cde841ab68 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7552,7 +7552,7 @@ static int check_btf_func(struct bpf_verifier_env *env, const struct btf *btf; void __user *urecord; u32 prev_offset = 0; - int ret = 0; + int ret = -ENOMEM; nfuncs = attr->func_info_cnt; if (!nfuncs) From 79ea1e12c0b8540100e89b32afb9f0e6503fad35 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 4 Jun 2020 12:04:20 +0200 Subject: [PATCH 03/83] cfg80211: fix management registrations deadlock Lockdep reports that we may deadlock because we take the RTNL on the work struct, but flush it under RTNL. Clearly, it's correct. In practice, this can happen when doing rfkill on an active device. Fix this by moving the work struct to the wiphy (registered dev) layer, and iterate over all the wdevs inside there. This then means we need to track which one of them has work to do, so we don't update to the driver for all wdevs all the time. Also fix a locking bug I noticed while working on this - the registrations list is iterated as if it was an RCU list, but it isn't handle that way - and we need to lock now for the update flag anyway, so remove the RCU. Fixes: 6cd536fe62ef ("cfg80211: change internal management frame registration API") Reported-by: Markus Theil Reported-and-tested-by: Kenneth R. 
Crudup Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20200604120420.b1dc540a7e26.I55dcca56bb5bdc5d7ad66a36a0b42afd7034d8be@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 5 +++-- net/wireless/core.c | 6 +++--- net/wireless/core.h | 2 ++ net/wireless/mlme.c | 26 +++++++++++++++++++++----- 4 files changed, 29 insertions(+), 10 deletions(-) diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index b58ad1a3f695..fc7e8807838d 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5075,7 +5075,8 @@ struct cfg80211_cqm_config; * by cfg80211 on change_interface * @mgmt_registrations: list of registrations for management frames * @mgmt_registrations_lock: lock for the list - * @mgmt_registrations_update_wk: update work to defer from atomic context + * @mgmt_registrations_need_update: mgmt registrations were updated, + * need to propagate the update to the driver * @mtx: mutex used to lock data in this struct, may be used by drivers * and some API functions require it held * @beacon_interval: beacon interval used on this device for transmitting @@ -5121,7 +5122,7 @@ struct wireless_dev { struct list_head mgmt_registrations; spinlock_t mgmt_registrations_lock; - struct work_struct mgmt_registrations_update_wk; + u8 mgmt_registrations_need_update:1; struct mutex mtx; diff --git a/net/wireless/core.c b/net/wireless/core.c index f0226ae9561c..c623d9bf5096 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -497,6 +497,8 @@ struct wiphy *wiphy_new_nm(const struct cfg80211_ops *ops, int sizeof_priv, INIT_WORK(&rdev->propagate_radar_detect_wk, cfg80211_propagate_radar_detect_wk); INIT_WORK(&rdev->propagate_cac_done_wk, cfg80211_propagate_cac_done_wk); + INIT_WORK(&rdev->mgmt_registrations_update_wk, + cfg80211_mgmt_registrations_update_wk); #ifdef CONFIG_CFG80211_DEFAULT_PS rdev->wiphy.flags |= WIPHY_FLAG_PS_ON_BY_DEFAULT; @@ -1047,6 +1049,7 @@ void wiphy_unregister(struct wiphy *wiphy) flush_work(&rdev->sched_scan_stop_wk); flush_work(&rdev->propagate_radar_detect_wk); flush_work(&rdev->propagate_cac_done_wk); + flush_work(&rdev->mgmt_registrations_update_wk); #ifdef CONFIG_PM if (rdev->wiphy.wowlan_config && rdev->ops->set_wakeup) @@ -1108,7 +1111,6 @@ static void __cfg80211_unregister_wdev(struct wireless_dev *wdev, bool sync) rdev->devlist_generation++; cfg80211_mlme_purge_registrations(wdev); - flush_work(&wdev->mgmt_registrations_update_wk); switch (wdev->iftype) { case NL80211_IFTYPE_P2P_DEVICE: @@ -1253,8 +1255,6 @@ void cfg80211_init_wdev(struct cfg80211_registered_device *rdev, spin_lock_init(&wdev->event_lock); INIT_LIST_HEAD(&wdev->mgmt_registrations); spin_lock_init(&wdev->mgmt_registrations_lock); - INIT_WORK(&wdev->mgmt_registrations_update_wk, - cfg80211_mgmt_registrations_update_wk); INIT_LIST_HEAD(&wdev->pmsr_list); spin_lock_init(&wdev->pmsr_lock); INIT_WORK(&wdev->pmsr_free_wk, cfg80211_pmsr_free_wk); diff --git a/net/wireless/core.h b/net/wireless/core.h index e0e5b3ee9699..67b0389fca4d 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -99,6 +99,8 @@ struct cfg80211_registered_device { struct cfg80211_chan_def cac_done_chandef; struct work_struct propagate_cac_done_wk; + struct work_struct mgmt_registrations_update_wk; + /* must be last because of the way we do wiphy_priv(), * and it should at least be aligned to NETDEV_ALIGN */ struct wiphy wiphy __aligned(NETDEV_ALIGN); diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 189334314cba..a6c61a2e6569 100644 --- a/net/wireless/mlme.c +++ 
b/net/wireless/mlme.c @@ -440,9 +440,15 @@ static void cfg80211_mgmt_registrations_update(struct wireless_dev *wdev) ASSERT_RTNL(); + spin_lock_bh(&wdev->mgmt_registrations_lock); + if (!wdev->mgmt_registrations_need_update) { + spin_unlock_bh(&wdev->mgmt_registrations_lock); + return; + } + rcu_read_lock(); list_for_each_entry_rcu(tmp, &rdev->wiphy.wdev_list, list) { - list_for_each_entry_rcu(reg, &tmp->mgmt_registrations, list) { + list_for_each_entry(reg, &tmp->mgmt_registrations, list) { u32 mask = BIT(le16_to_cpu(reg->frame_type) >> 4); u32 mcast_mask = 0; @@ -460,16 +466,23 @@ static void cfg80211_mgmt_registrations_update(struct wireless_dev *wdev) } rcu_read_unlock(); + wdev->mgmt_registrations_need_update = 0; + spin_unlock_bh(&wdev->mgmt_registrations_lock); + rdev_update_mgmt_frame_registrations(rdev, wdev, &upd); } void cfg80211_mgmt_registrations_update_wk(struct work_struct *wk) { - struct wireless_dev *wdev = container_of(wk, struct wireless_dev, - mgmt_registrations_update_wk); + struct cfg80211_registered_device *rdev; + struct wireless_dev *wdev; + + rdev = container_of(wk, struct cfg80211_registered_device, + mgmt_registrations_update_wk); rtnl_lock(); - cfg80211_mgmt_registrations_update(wdev); + list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) + cfg80211_mgmt_registrations_update(wdev); rtnl_unlock(); } @@ -557,6 +570,7 @@ int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid, nreg->multicast_rx = multicast_rx; list_add(&nreg->list, &wdev->mgmt_registrations); } + wdev->mgmt_registrations_need_update = 1; spin_unlock_bh(&wdev->mgmt_registrations_lock); cfg80211_mgmt_registrations_update(wdev); @@ -585,7 +599,8 @@ void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlportid) list_del(®->list); kfree(reg); - schedule_work(&wdev->mgmt_registrations_update_wk); + wdev->mgmt_registrations_need_update = 1; + schedule_work(&rdev->mgmt_registrations_update_wk); } spin_unlock_bh(&wdev->mgmt_registrations_lock); @@ -608,6 +623,7 @@ void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev) list_del(®->list); kfree(reg); } + wdev->mgmt_registrations_need_update = 1; spin_unlock_bh(&wdev->mgmt_registrations_lock); cfg80211_mgmt_registrations_update(wdev); From 523f3ec030aa5bf4818ec8dee35b2646abf367fa Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 3 Jun 2020 11:15:03 +0200 Subject: [PATCH 04/83] mac80211: initialize return flags in HE 6 GHz operation parsing Dan points out that if ieee80211_chandef_he_6ghz_oper() succeeds, we don't initialize 'ret'. Initialize it to 0 in this case, since everything went fine and nothing has to be disabled. 
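The mistake is a common C pattern: a status variable that is only assigned on the failure branch is handed back uninitialized on the success path. A minimal standalone sketch of that pattern, not the mac80211 code itself (names are illustrative):

#include <stdio.h>
#include <stdbool.h>

/* 'flags' collects "disable this feature" bits. Before the fix it was
 * only written on the failure branch, so the success path returned
 * whatever happened to be in the variable. */
static unsigned int parse_oper(bool oper_ok)
{
	unsigned int flags;		/* deliberately not initialized */

	if (!oper_ok)
		flags = 0x1 | 0x2;	/* disable HT/VHT on failure */
	else
		flags = 0;		/* the fix: success clears the flags */

	return flags;
}

int main(void)
{
	printf("flags=%#x\n", parse_oper(true));	/* 0 only with the fix */
	return 0;
}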
Reported-by: Dan Carpenter Fixes: 57fa5e85d53c ("mac80211: determine chandef from HE 6 GHz operation") Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20200603111500.bd2a5ff37b83.I2c3f338ce343b581db493eb9a0d988d1b626c8fb@changeid Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 5820ef02a587..b2a9d47cf86d 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -167,6 +167,8 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, ret = IEEE80211_STA_DISABLE_HT | IEEE80211_STA_DISABLE_VHT | IEEE80211_STA_DISABLE_HE; + else + ret = 0; vht_chandef = *chandef; goto out; } From 3067bf8c596d59164f48569a2d362de5b4c42f59 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 3 Jun 2020 22:21:16 +0100 Subject: [PATCH 05/83] rxrpc: Move the call completion handling out of line Move the handling of call completion out of line so that the next patch can add more code in that area. Signed-off-by: David Howells Reviewed-by: Marc Dionne --- net/rxrpc/ar-internal.h | 119 +++++++++------------------------------- net/rxrpc/recvmsg.c | 74 +++++++++++++++++++++++++ net/rxrpc/sendmsg.c | 8 +-- 3 files changed, 103 insertions(+), 98 deletions(-) diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 9fe264bec70c..9a2139ebd67d 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -809,100 +809,6 @@ static inline bool rxrpc_is_client_call(const struct rxrpc_call *call) return !rxrpc_is_service_call(call); } -/* - * Transition a call to the complete state. - */ -static inline bool __rxrpc_set_call_completion(struct rxrpc_call *call, - enum rxrpc_call_completion compl, - u32 abort_code, - int error) -{ - if (call->state < RXRPC_CALL_COMPLETE) { - call->abort_code = abort_code; - call->error = error; - call->completion = compl, - call->state = RXRPC_CALL_COMPLETE; - trace_rxrpc_call_complete(call); - wake_up(&call->waitq); - return true; - } - return false; -} - -static inline bool rxrpc_set_call_completion(struct rxrpc_call *call, - enum rxrpc_call_completion compl, - u32 abort_code, - int error) -{ - bool ret; - - write_lock_bh(&call->state_lock); - ret = __rxrpc_set_call_completion(call, compl, abort_code, error); - write_unlock_bh(&call->state_lock); - return ret; -} - -/* - * Record that a call successfully completed. - */ -static inline bool __rxrpc_call_completed(struct rxrpc_call *call) -{ - return __rxrpc_set_call_completion(call, RXRPC_CALL_SUCCEEDED, 0, 0); -} - -static inline bool rxrpc_call_completed(struct rxrpc_call *call) -{ - bool ret; - - write_lock_bh(&call->state_lock); - ret = __rxrpc_call_completed(call); - write_unlock_bh(&call->state_lock); - return ret; -} - -/* - * Record that a call is locally aborted. - */ -static inline bool __rxrpc_abort_call(const char *why, struct rxrpc_call *call, - rxrpc_seq_t seq, - u32 abort_code, int error) -{ - trace_rxrpc_abort(call->debug_id, why, call->cid, call->call_id, seq, - abort_code, error); - return __rxrpc_set_call_completion(call, RXRPC_CALL_LOCALLY_ABORTED, - abort_code, error); -} - -static inline bool rxrpc_abort_call(const char *why, struct rxrpc_call *call, - rxrpc_seq_t seq, u32 abort_code, int error) -{ - bool ret; - - write_lock_bh(&call->state_lock); - ret = __rxrpc_abort_call(why, call, seq, abort_code, error); - write_unlock_bh(&call->state_lock); - return ret; -} - -/* - * Abort a call due to a protocol error. 
- */ -static inline bool __rxrpc_abort_eproto(struct rxrpc_call *call, - struct sk_buff *skb, - const char *eproto_why, - const char *why, - u32 abort_code) -{ - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - - trace_rxrpc_rx_eproto(call, sp->hdr.serial, eproto_why); - return rxrpc_abort_call(why, call, sp->hdr.seq, abort_code, -EPROTO); -} - -#define rxrpc_abort_eproto(call, skb, eproto_why, abort_why, abort_code) \ - __rxrpc_abort_eproto((call), (skb), tracepoint_string(eproto_why), \ - (abort_why), (abort_code)) - /* * conn_client.c */ @@ -1101,8 +1007,33 @@ extern const struct seq_operations rxrpc_peer_seq_ops; * recvmsg.c */ void rxrpc_notify_socket(struct rxrpc_call *); +bool __rxrpc_set_call_completion(struct rxrpc_call *, enum rxrpc_call_completion, u32, int); +bool rxrpc_set_call_completion(struct rxrpc_call *, enum rxrpc_call_completion, u32, int); +bool __rxrpc_call_completed(struct rxrpc_call *); +bool rxrpc_call_completed(struct rxrpc_call *); +bool __rxrpc_abort_call(const char *, struct rxrpc_call *, rxrpc_seq_t, u32, int); +bool rxrpc_abort_call(const char *, struct rxrpc_call *, rxrpc_seq_t, u32, int); int rxrpc_recvmsg(struct socket *, struct msghdr *, size_t, int); +/* + * Abort a call due to a protocol error. + */ +static inline bool __rxrpc_abort_eproto(struct rxrpc_call *call, + struct sk_buff *skb, + const char *eproto_why, + const char *why, + u32 abort_code) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + + trace_rxrpc_rx_eproto(call, sp->hdr.serial, eproto_why); + return rxrpc_abort_call(why, call, sp->hdr.seq, abort_code, -EPROTO); +} + +#define rxrpc_abort_eproto(call, skb, eproto_why, abort_why, abort_code) \ + __rxrpc_abort_eproto((call), (skb), tracepoint_string(eproto_why), \ + (abort_why), (abort_code)) + /* * rtt.c */ diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 8578c39ec839..6c4ba4224ddc 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -58,6 +58,80 @@ void rxrpc_notify_socket(struct rxrpc_call *call) _leave(""); } +/* + * Transition a call to the complete state. + */ +bool __rxrpc_set_call_completion(struct rxrpc_call *call, + enum rxrpc_call_completion compl, + u32 abort_code, + int error) +{ + if (call->state < RXRPC_CALL_COMPLETE) { + call->abort_code = abort_code; + call->error = error; + call->completion = compl, + call->state = RXRPC_CALL_COMPLETE; + trace_rxrpc_call_complete(call); + wake_up(&call->waitq); + return true; + } + return false; +} + +bool rxrpc_set_call_completion(struct rxrpc_call *call, + enum rxrpc_call_completion compl, + u32 abort_code, + int error) +{ + bool ret; + + write_lock_bh(&call->state_lock); + ret = __rxrpc_set_call_completion(call, compl, abort_code, error); + write_unlock_bh(&call->state_lock); + return ret; +} + +/* + * Record that a call successfully completed. + */ +bool __rxrpc_call_completed(struct rxrpc_call *call) +{ + return __rxrpc_set_call_completion(call, RXRPC_CALL_SUCCEEDED, 0, 0); +} + +bool rxrpc_call_completed(struct rxrpc_call *call) +{ + bool ret; + + write_lock_bh(&call->state_lock); + ret = __rxrpc_call_completed(call); + write_unlock_bh(&call->state_lock); + return ret; +} + +/* + * Record that a call is locally aborted. 
+ */ +bool __rxrpc_abort_call(const char *why, struct rxrpc_call *call, + rxrpc_seq_t seq, u32 abort_code, int error) +{ + trace_rxrpc_abort(call->debug_id, why, call->cid, call->call_id, seq, + abort_code, error); + return __rxrpc_set_call_completion(call, RXRPC_CALL_LOCALLY_ABORTED, + abort_code, error); +} + +bool rxrpc_abort_call(const char *why, struct rxrpc_call *call, + rxrpc_seq_t seq, u32 abort_code, int error) +{ + bool ret; + + write_lock_bh(&call->state_lock); + ret = __rxrpc_abort_call(why, call, seq, abort_code, error); + write_unlock_bh(&call->state_lock); + return ret; +} + /* * Pass a call terminating message to userspace. */ diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 5e9c43d4a314..5dd9ba000c00 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -261,10 +261,10 @@ static int rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, case -ENETUNREACH: case -EHOSTUNREACH: case -ECONNREFUSED: - rxrpc_set_call_completion(call, - RXRPC_CALL_LOCAL_ERROR, - 0, ret); - rxrpc_notify_socket(call); + if (rxrpc_set_call_completion(call, + RXRPC_CALL_LOCAL_ERROR, + 0, ret)) + rxrpc_notify_socket(call); goto out; } _debug("need instant resend %d", ret); From 5ac0d62226a07849b1a5233af8c800a19cecab83 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 3 Jun 2020 22:21:16 +0100 Subject: [PATCH 06/83] rxrpc: Fix missing notification Under some circumstances, rxrpc will fail a transmit a packet through the underlying UDP socket (ie. UDP sendmsg returns an error). This may result in a call getting stuck. In the instance being seen, where AFS tries to send a probe to the Volume Location server, tracepoints show the UDP Tx failure (in this case returing error 99 EADDRNOTAVAIL) and then nothing more: afs_make_vl_call: c=0000015d VL.GetCapabilities rxrpc_call: c=0000015d NWc u=1 sp=rxrpc_kernel_begin_call+0x106/0x170 [rxrpc] a=00000000dd89ee8a rxrpc_call: c=0000015d Gus u=2 sp=rxrpc_new_client_call+0x14f/0x580 [rxrpc] a=00000000e20e4b08 rxrpc_call: c=0000015d SEE u=2 sp=rxrpc_activate_one_channel+0x7b/0x1c0 [rxrpc] a=00000000e20e4b08 rxrpc_call: c=0000015d CON u=2 sp=rxrpc_kernel_begin_call+0x106/0x170 [rxrpc] a=00000000e20e4b08 rxrpc_tx_fail: c=0000015d r=1 ret=-99 CallDataNofrag The problem is that if the initial packet fails and the retransmission timer hasn't been started, the call is set to completed and an error is returned from rxrpc_send_data_packet() to rxrpc_queue_packet(). Though rxrpc_instant_resend() is called, this does nothing because the call is marked completed. So rxrpc_notify_socket() isn't called and the error is passed back up to rxrpc_send_data(), rxrpc_kernel_send_data() and thence to afs_make_call() and afs_vl_get_capabilities() where it is simply ignored because it is assumed that the result of a probe will be collected asynchronously. Fileserver probing is similarly affected via afs_fs_get_capabilities(). Fix this by always issuing a notification in __rxrpc_set_call_completion() if it shifts a call to the completed state, even if an error is also returned to the caller through the function return value. Also put in a little bit of optimisation to avoid taking the call state_lock and disabling softirqs if the call is already in the completed state and remove some now redundant rxrpc_notify_socket() calls. 
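The "little bit of optimisation" above is the usual check-then-lock shape: test the terminal state without the lock, and only take the lock (re-checking under it) when a transition can still happen. A condensed user-space sketch of that shape, with illustrative names and a pthread rwlock standing in for the real rxrpc types and locking:

#include <stdbool.h>
#include <pthread.h>

enum call_state { CALL_ACTIVE, CALL_COMPLETE };

struct call {
	enum call_state state;
	int error;
	pthread_rwlock_t state_lock;	/* init with PTHREAD_RWLOCK_INITIALIZER */
};

/* Locked helper: performs the transition and, as of this patch, also
 * issues the notification itself, so callers no longer need to. */
static bool __set_completion(struct call *call, int error)
{
	if (call->state < CALL_COMPLETE) {
		call->error = error;
		call->state = CALL_COMPLETE;
		/* notify_socket(call); */
		return true;
	}
	return false;
}

bool set_completion(struct call *call, int error)
{
	bool ret = false;

	if (call->state < CALL_COMPLETE) {	/* cheap unlocked fast path */
		pthread_rwlock_wrlock(&call->state_lock);
		ret = __set_completion(call, error);	/* re-checked under the lock */
		pthread_rwlock_unlock(&call->state_lock);
	}
	return ret;
}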
Fixes: f5c17aaeb2ae ("rxrpc: Calls should only have one terminal state") Reported-by: Gerry Seidman Signed-off-by: David Howells Reviewed-by: Marc Dionne --- net/rxrpc/call_event.c | 1 - net/rxrpc/conn_event.c | 7 +++---- net/rxrpc/input.c | 7 ++----- net/rxrpc/peer_event.c | 4 +--- net/rxrpc/recvmsg.c | 21 +++++++++++++-------- net/rxrpc/sendmsg.c | 6 ++---- 6 files changed, 21 insertions(+), 25 deletions(-) diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 2a65ac41055f..61a51c251e1b 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -320,7 +320,6 @@ void rxrpc_process_call(struct work_struct *work) if (call->state == RXRPC_CALL_COMPLETE) { del_timer_sync(&call->timer); - rxrpc_notify_socket(call); goto out_put; } diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 06fcff2ebbba..447f55ca6886 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -173,10 +173,9 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn, else trace_rxrpc_rx_abort(call, serial, conn->abort_code); - if (rxrpc_set_call_completion(call, compl, - conn->abort_code, - conn->error)) - rxrpc_notify_socket(call); + rxrpc_set_call_completion(call, compl, + conn->abort_code, + conn->error); } } diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 3be4177baf70..299ac98e9754 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -275,7 +275,6 @@ static bool rxrpc_end_tx_phase(struct rxrpc_call *call, bool reply_begun, case RXRPC_CALL_SERVER_AWAIT_ACK: __rxrpc_call_completed(call); - rxrpc_notify_socket(call); state = call->state; break; @@ -1013,9 +1012,8 @@ static void rxrpc_input_abort(struct rxrpc_call *call, struct sk_buff *skb) _proto("Rx ABORT %%%u { %x }", sp->hdr.serial, abort_code); - if (rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED, - abort_code, -ECONNABORTED)) - rxrpc_notify_socket(call); + rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED, + abort_code, -ECONNABORTED); } /* @@ -1102,7 +1100,6 @@ static void rxrpc_input_implicit_end_call(struct rxrpc_sock *rx, spin_lock(&rx->incoming_lock); __rxrpc_disconnect_call(conn, call); spin_unlock(&rx->incoming_lock); - rxrpc_notify_socket(call); } /* diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index b1449d971883..4704a8dceced 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -289,9 +289,7 @@ static void rxrpc_distribute_error(struct rxrpc_peer *peer, int error, hlist_for_each_entry_rcu(call, &peer->error_targets, error_link) { rxrpc_see_call(call); - if (call->state < RXRPC_CALL_COMPLETE && - rxrpc_set_call_completion(call, compl, 0, -error)) - rxrpc_notify_socket(call); + rxrpc_set_call_completion(call, compl, 0, -error); } } diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 6c4ba4224ddc..2989742a4aa1 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -73,6 +73,7 @@ bool __rxrpc_set_call_completion(struct rxrpc_call *call, call->state = RXRPC_CALL_COMPLETE; trace_rxrpc_call_complete(call); wake_up(&call->waitq); + rxrpc_notify_socket(call); return true; } return false; @@ -83,11 +84,13 @@ bool rxrpc_set_call_completion(struct rxrpc_call *call, u32 abort_code, int error) { - bool ret; + bool ret = false; - write_lock_bh(&call->state_lock); - ret = __rxrpc_set_call_completion(call, compl, abort_code, error); - write_unlock_bh(&call->state_lock); + if (call->state < RXRPC_CALL_COMPLETE) { + write_lock_bh(&call->state_lock); + ret = __rxrpc_set_call_completion(call, compl, abort_code, error); + 
write_unlock_bh(&call->state_lock); + } return ret; } @@ -101,11 +104,13 @@ bool __rxrpc_call_completed(struct rxrpc_call *call) bool rxrpc_call_completed(struct rxrpc_call *call) { - bool ret; + bool ret = false; - write_lock_bh(&call->state_lock); - ret = __rxrpc_call_completed(call); - write_unlock_bh(&call->state_lock); + if (call->state < RXRPC_CALL_COMPLETE) { + write_lock_bh(&call->state_lock); + ret = __rxrpc_call_completed(call); + write_unlock_bh(&call->state_lock); + } return ret; } diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 5dd9ba000c00..1304b8608f56 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -261,10 +261,8 @@ static int rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, case -ENETUNREACH: case -EHOSTUNREACH: case -ECONNREFUSED: - if (rxrpc_set_call_completion(call, - RXRPC_CALL_LOCAL_ERROR, - 0, ret)) - rxrpc_notify_socket(call); + rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, + 0, ret); goto out; } _debug("need instant resend %d", ret); From 327cdb98fb3eed8373777465759c44f8d123c227 Mon Sep 17 00:00:00 2001 From: Flavio Suligoi Date: Fri, 5 Jun 2020 17:41:04 +0200 Subject: [PATCH 07/83] doc: networking: wireless: fix wiki website url In the files: - regulatory.rst - mac80211-injection.rst the wiki url is still the old "wireless.kernel.org" instead of the new "wireless.wiki.kernel.org" Signed-off-by: Flavio Suligoi Link: https://lore.kernel.org/r/20200605154112.16277-2-f.suligoi@asem.it Signed-off-by: Johannes Berg --- Documentation/networking/mac80211-injection.rst | 2 +- Documentation/networking/regulatory.rst | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/networking/mac80211-injection.rst b/Documentation/networking/mac80211-injection.rst index be65f886ff1f..63ba6611fdff 100644 --- a/Documentation/networking/mac80211-injection.rst +++ b/Documentation/networking/mac80211-injection.rst @@ -101,6 +101,6 @@ interface), along the following lines::: You can also find a link to a complete inject application here: -http://wireless.kernel.org/en/users/Documentation/packetspammer +https://wireless.wiki.kernel.org/en/users/Documentation/packetspammer Andy Green diff --git a/Documentation/networking/regulatory.rst b/Documentation/networking/regulatory.rst index 8701b91e81ee..16782a95b74a 100644 --- a/Documentation/networking/regulatory.rst +++ b/Documentation/networking/regulatory.rst @@ -9,7 +9,7 @@ regulatory infrastructure works. More up to date information can be obtained at the project's web page: -http://wireless.kernel.org/en/developers/Regulatory +https://wireless.wiki.kernel.org/en/developers/Regulatory Keeping regulatory domains in userspace --------------------------------------- @@ -37,7 +37,7 @@ expected regulatory domains will be respected by the kernel. A currently available userspace agent which can accomplish this is CRDA - central regulatory domain agent. Its documented here: -http://wireless.kernel.org/en/developers/Regulatory/CRDA +https://wireless.wiki.kernel.org/en/developers/Regulatory/CRDA Essentially the kernel will send a udev event when it knows it needs a new regulatory domain. A udev rule can be put in place @@ -58,7 +58,7 @@ Who asks for regulatory domains? 
Users can use iw: -http://wireless.kernel.org/en/users/Documentation/iw +https://wireless.wiki.kernel.org/en/users/Documentation/iw An example:: From 97eda66421c44f1449e8d087fd05eab5d466afb7 Mon Sep 17 00:00:00 2001 From: Flavio Suligoi Date: Fri, 5 Jun 2020 17:41:11 +0200 Subject: [PATCH 08/83] include: fix wiki website url in netlink interface header The wiki url is still the old "wireless.kernel.org" instead of the new "wireless.wiki.kernel.org" Signed-off-by: Flavio Suligoi Link: https://lore.kernel.org/r/20200605154112.16277-9-f.suligoi@asem.it Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index dad8c8f8581f..4e6339ab1fce 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -794,7 +794,7 @@ * various triggers. These triggers can be configured through this * command with the %NL80211_ATTR_WOWLAN_TRIGGERS attribute. For * more background information, see - * http://wireless.kernel.org/en/users/Documentation/WoWLAN. + * https://wireless.wiki.kernel.org/en/users/Documentation/WoWLAN. * The @NL80211_CMD_SET_WOWLAN command can also be used as a notification * from the driver reporting the wakeup reason. In this case, the * @NL80211_ATTR_WOWLAN_TRIGGERS attribute will contain the reason From 59d4bfc1e2c09435d91c980b03f7b72ce6e9f24e Mon Sep 17 00:00:00 2001 From: Flavio Suligoi Date: Fri, 5 Jun 2020 17:41:12 +0200 Subject: [PATCH 09/83] net: fix wiki website url mac80211 and wireless files In the files: - net/mac80211/rx.c - net/wireless/Kconfig the wiki url is still the old "wireless.kernel.org" instead of the new "wireless.wiki.kernel.org" Signed-off-by: Flavio Suligoi Link: https://lore.kernel.org/r/20200605154112.16277-10-f.suligoi@asem.it Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 2 +- net/wireless/Kconfig | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 21854a61a2b7..a88ab6fb16f2 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -4694,7 +4694,7 @@ void ieee80211_rx_napi(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, * rate_idx is MCS index, which can be [0-76] * as documented on: * - * http://wireless.kernel.org/en/developers/Documentation/ieee80211/802.11n + * https://wireless.wiki.kernel.org/en/developers/Documentation/ieee80211/802.11n * * Anything else would be some sort of driver or * hardware error. The driver should catch hardware diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig index 813e93644ae7..d69558487041 100644 --- a/net/wireless/Kconfig +++ b/net/wireless/Kconfig @@ -31,7 +31,7 @@ config CFG80211 For more information refer to documentation on the wireless wiki: - http://wireless.kernel.org/en/developers/Documentation/cfg80211 + https://wireless.wiki.kernel.org/en/developers/Documentation/cfg80211 When built as a module it will be called cfg80211. From 1f2436229bf64ac040f2f5018df059c21fc5526a Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 7 Jun 2020 17:36:15 -0700 Subject: [PATCH 10/83] selftests/bpf: Fix ringbuf selftest sample counting undeterminism Fix test race, in which background poll can get either 5 or 6 samples, depending on timing of notification. Prevent this by open-coding sample triggering and forcing notification for the very last sample only. Also switch to using atomic increments and exchanges for more obviously reliable counting and checking. 
Additionally, check expected processed sample counters for single-threaded use cases as well. Fixes: 9a5f25ad30e5 ("selftests/bpf: Fix sample_cnt shared between two threads") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200608003615.3549991-1-andriin@fb.com --- .../selftests/bpf/prog_tests/ringbuf.c | 42 +++++++++++++++---- 1 file changed, 35 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf.c b/tools/testing/selftests/bpf/prog_tests/ringbuf.c index 2bba908dfa63..c1650548433c 100644 --- a/tools/testing/selftests/bpf/prog_tests/ringbuf.c +++ b/tools/testing/selftests/bpf/prog_tests/ringbuf.c @@ -25,13 +25,23 @@ struct sample { char comm[16]; }; -static volatile int sample_cnt; +static int sample_cnt; + +static void atomic_inc(int *cnt) +{ + __atomic_add_fetch(cnt, 1, __ATOMIC_SEQ_CST); +} + +static int atomic_xchg(int *cnt, int val) +{ + return __atomic_exchange_n(cnt, val, __ATOMIC_SEQ_CST); +} static int process_sample(void *ctx, void *data, size_t len) { struct sample *s = data; - sample_cnt++; + atomic_inc(&sample_cnt); switch (s->seq) { case 0: @@ -76,7 +86,7 @@ void test_ringbuf(void) const size_t rec_sz = BPF_RINGBUF_HDR_SZ + sizeof(struct sample); pthread_t thread; long bg_ret = -1; - int err; + int err, cnt; skel = test_ringbuf__open_and_load(); if (CHECK(!skel, "skel_open_load", "skeleton open&load failed\n")) @@ -116,11 +126,15 @@ void test_ringbuf(void) /* -EDONE is used as an indicator that we are done */ if (CHECK(err != -EDONE, "err_done", "done err: %d\n", err)) goto cleanup; + cnt = atomic_xchg(&sample_cnt, 0); + CHECK(cnt != 2, "cnt", "exp %d samples, got %d\n", 2, cnt); /* we expect extra polling to return nothing */ err = ring_buffer__poll(ringbuf, 0); if (CHECK(err != 0, "extra_samples", "poll result: %d\n", err)) goto cleanup; + cnt = atomic_xchg(&sample_cnt, 0); + CHECK(cnt != 0, "cnt", "exp %d samples, got %d\n", 0, cnt); CHECK(skel->bss->dropped != 0, "err_dropped", "exp %ld, got %ld\n", 0L, skel->bss->dropped); @@ -136,6 +150,8 @@ void test_ringbuf(void) 3L * rec_sz, skel->bss->cons_pos); err = ring_buffer__poll(ringbuf, -1); CHECK(err <= 0, "poll_err", "err %d\n", err); + cnt = atomic_xchg(&sample_cnt, 0); + CHECK(cnt != 2, "cnt", "exp %d samples, got %d\n", 2, cnt); /* start poll in background w/ long timeout */ err = pthread_create(&thread, NULL, poll_thread, (void *)(long)10000); @@ -164,6 +180,8 @@ void test_ringbuf(void) 2L, skel->bss->total); CHECK(skel->bss->discarded != 1, "err_discarded", "exp %ld, got %ld\n", 1L, skel->bss->discarded); + cnt = atomic_xchg(&sample_cnt, 0); + CHECK(cnt != 0, "cnt", "exp %d samples, got %d\n", 0, cnt); /* clear flags to return to "adaptive" notification mode */ skel->bss->flags = 0; @@ -178,10 +196,20 @@ void test_ringbuf(void) if (CHECK(err != EBUSY, "try_join", "err %d\n", err)) goto cleanup; + /* still no samples, because consumer is behind */ + cnt = atomic_xchg(&sample_cnt, 0); + CHECK(cnt != 0, "cnt", "exp %d samples, got %d\n", 0, cnt); + + skel->bss->dropped = 0; + skel->bss->total = 0; + skel->bss->discarded = 0; + + skel->bss->value = 333; + syscall(__NR_getpgid); /* now force notifications */ skel->bss->flags = BPF_RB_FORCE_WAKEUP; - sample_cnt = 0; - trigger_samples(); + skel->bss->value = 777; + syscall(__NR_getpgid); /* now we should get a pending notification */ usleep(50000); @@ -193,8 +221,8 @@ void test_ringbuf(void) goto cleanup; /* 3 rounds, 2 samples each */ - CHECK(sample_cnt != 6, "wrong_sample_cnt", - 
"expected to see %d samples, got %d\n", 6, sample_cnt); + cnt = atomic_xchg(&sample_cnt, 0); + CHECK(cnt != 6, "cnt", "exp %d samples, got %d\n", 6, cnt); /* BPF side did everything right */ CHECK(skel->bss->dropped != 0, "err_dropped", "exp %ld, got %ld\n", From 0e6fbe39bdf71b4e665767bcbf53567a3e6d0623 Mon Sep 17 00:00:00 2001 From: Pooja Trivedi Date: Fri, 5 Jun 2020 16:01:18 +0000 Subject: [PATCH 11/83] net/tls(TLS_SW): Add selftest for 'chunked' sendfile test This selftest tests for cases where sendfile's 'count' parameter is provided with a size greater than the intended file size. Motivation: When sendfile is provided with 'count' parameter value that is greater than the size of the file, kTLS example fails to send the file correctly. Last chunk of the file is not sent, and the data integrity is compromised. The reason is that the last chunk has MSG_MORE flag set because of which it gets added to pending records, but is not pushed. Note that if user space were to send SSL_shutdown control message, pending records would get flushed and the issue would not happen. So a shutdown control message following sendfile can mask the issue. Signed-off-by: Pooja Trivedi Signed-off-by: Mallesham Jatharkonda Signed-off-by: Josh Tway Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- tools/testing/selftests/net/tls.c | 58 +++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index c5282e62df75..b599f1fa99b5 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -213,6 +213,64 @@ TEST_F(tls, send_then_sendfile) EXPECT_EQ(recv(self->cfd, buf, st.st_size, MSG_WAITALL), st.st_size); } +static void chunked_sendfile(struct __test_metadata *_metadata, + struct _test_data_tls *self, + uint16_t chunk_size, + uint16_t extra_payload_size) +{ + char buf[TLS_PAYLOAD_MAX_LEN]; + uint16_t test_payload_size; + int size = 0; + int ret; + char filename[] = "/tmp/mytemp.XXXXXX"; + int fd = mkstemp(filename); + off_t offset = 0; + + unlink(filename); + ASSERT_GE(fd, 0); + EXPECT_GE(chunk_size, 1); + test_payload_size = chunk_size + extra_payload_size; + ASSERT_GE(TLS_PAYLOAD_MAX_LEN, test_payload_size); + memset(buf, 1, test_payload_size); + size = write(fd, buf, test_payload_size); + EXPECT_EQ(size, test_payload_size); + fsync(fd); + + while (size > 0) { + ret = sendfile(self->fd, fd, &offset, chunk_size); + EXPECT_GE(ret, 0); + size -= ret; + } + + EXPECT_EQ(recv(self->cfd, buf, test_payload_size, MSG_WAITALL), + test_payload_size); + + close(fd); +} + +TEST_F(tls, multi_chunk_sendfile) +{ + chunked_sendfile(_metadata, self, 4096, 4096); + chunked_sendfile(_metadata, self, 4096, 0); + chunked_sendfile(_metadata, self, 4096, 1); + chunked_sendfile(_metadata, self, 4096, 2048); + chunked_sendfile(_metadata, self, 8192, 2048); + chunked_sendfile(_metadata, self, 4096, 8192); + chunked_sendfile(_metadata, self, 8192, 4096); + chunked_sendfile(_metadata, self, 12288, 1024); + chunked_sendfile(_metadata, self, 12288, 2000); + chunked_sendfile(_metadata, self, 15360, 100); + chunked_sendfile(_metadata, self, 15360, 300); + chunked_sendfile(_metadata, self, 1, 4096); + chunked_sendfile(_metadata, self, 2048, 4096); + chunked_sendfile(_metadata, self, 2048, 8192); + chunked_sendfile(_metadata, self, 4096, 8192); + chunked_sendfile(_metadata, self, 1024, 12288); + chunked_sendfile(_metadata, self, 2000, 12288); + chunked_sendfile(_metadata, self, 100, 15360); + 
chunked_sendfile(_metadata, self, 300, 15360); +} + TEST_F(tls, recv_max) { unsigned int send_len = TLS_PAYLOAD_MAX_LEN; From 3763a24c727ecf236358a81ee749e5fcab1c972a Mon Sep 17 00:00:00 2001 From: Arjun Roy Date: Sun, 7 Jun 2020 18:54:41 -0700 Subject: [PATCH 12/83] net-zerocopy: use vm_insert_pages() for tcp rcv zerocopy Use vm_insert_pages() for tcp receive zerocopy. Spin lock cycles (as reported by perf) drop from a couple of percentage points to a fraction of a percent. This results in a roughly 6% increase in efficiency, measured roughly as zerocopy receive count divided by CPU utilization. The intention of this patchset is to reduce atomic ops for tcp zerocopy receives, which normally hits the same spinlock multiple times consecutively. [akpm@linux-foundation.org: suppress gcc-7.2.0 warning] Link: http://lkml.kernel.org/r/20200128025958.43490-3-arjunroy.kdev@gmail.com Signed-off-by: Arjun Roy Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Cc: David Miller Cc: Matthew Wilcox Cc: Jason Gunthorpe Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 70 +++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 63 insertions(+), 7 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 15d47d5e7951..ecbba0abd3e5 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1742,14 +1742,48 @@ int tcp_mmap(struct file *file, struct socket *sock, } EXPORT_SYMBOL(tcp_mmap); +static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma, + struct page **pages, + unsigned long pages_to_map, + unsigned long *insert_addr, + u32 *length_with_pending, + u32 *seq, + struct tcp_zerocopy_receive *zc) +{ + unsigned long pages_remaining = pages_to_map; + int bytes_mapped; + int ret; + + ret = vm_insert_pages(vma, *insert_addr, pages, &pages_remaining); + bytes_mapped = PAGE_SIZE * (pages_to_map - pages_remaining); + /* Even if vm_insert_pages fails, it may have partially succeeded in + * mapping (some but not all of the pages). + */ + *seq += bytes_mapped; + *insert_addr += bytes_mapped; + if (ret) { + /* But if vm_insert_pages did fail, we have to unroll some state + * we speculatively touched before. + */ + const int bytes_not_mapped = PAGE_SIZE * pages_remaining; + *length_with_pending -= bytes_not_mapped; + zc->recv_skip_hint += bytes_not_mapped; + } + return ret; +} + static int tcp_zerocopy_receive(struct sock *sk, struct tcp_zerocopy_receive *zc) { unsigned long address = (unsigned long)zc->address; u32 length = 0, seq, offset, zap_len; + #define PAGE_BATCH_SIZE 8 + struct page *pages[PAGE_BATCH_SIZE]; const skb_frag_t *frags = NULL; struct vm_area_struct *vma; struct sk_buff *skb = NULL; + unsigned long pg_idx = 0; + unsigned long curr_addr; struct tcp_sock *tp; int inq; int ret; @@ -1762,6 +1796,8 @@ static int tcp_zerocopy_receive(struct sock *sk, sock_rps_record_flow(sk); + tp = tcp_sk(sk); + down_read(¤t->mm->mmap_sem); vma = find_vma(current->mm, address); @@ -1771,7 +1807,6 @@ static int tcp_zerocopy_receive(struct sock *sk, } zc->length = min_t(unsigned long, zc->length, vma->vm_end - address); - tp = tcp_sk(sk); seq = tp->copied_seq; inq = tcp_inq(sk); zc->length = min_t(u32, zc->length, inq); @@ -1783,8 +1818,20 @@ static int tcp_zerocopy_receive(struct sock *sk, zc->recv_skip_hint = zc->length; } ret = 0; + curr_addr = address; while (length + PAGE_SIZE <= zc->length) { if (zc->recv_skip_hint < PAGE_SIZE) { + /* If we're here, finish the current batch. 
*/ + if (pg_idx) { + ret = tcp_zerocopy_vm_insert_batch(vma, pages, + pg_idx, + &curr_addr, + &length, + &seq, zc); + if (ret) + goto out; + pg_idx = 0; + } if (skb) { if (zc->recv_skip_hint > 0) break; @@ -1793,7 +1840,6 @@ static int tcp_zerocopy_receive(struct sock *sk, } else { skb = tcp_recv_skb(sk, seq, &offset); } - zc->recv_skip_hint = skb->len - offset; offset -= skb_headlen(skb); if ((int)offset < 0 || skb_has_frag_list(skb)) @@ -1817,14 +1863,24 @@ static int tcp_zerocopy_receive(struct sock *sk, zc->recv_skip_hint -= remaining; break; } - ret = vm_insert_page(vma, address + length, - skb_frag_page(frags)); - if (ret) - break; + pages[pg_idx] = skb_frag_page(frags); + pg_idx++; length += PAGE_SIZE; - seq += PAGE_SIZE; zc->recv_skip_hint -= PAGE_SIZE; frags++; + if (pg_idx == PAGE_BATCH_SIZE) { + ret = tcp_zerocopy_vm_insert_batch(vma, pages, pg_idx, + &curr_addr, &length, + &seq, zc); + if (ret) + goto out; + pg_idx = 0; + } + } + if (pg_idx) { + ret = tcp_zerocopy_vm_insert_batch(vma, pages, pg_idx, + &curr_addr, &length, &seq, + zc); } out: up_read(¤t->mm->mmap_sem); From 8e60eed6b38e464e8c9d68f9caecafaa554dffe0 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Mon, 8 Jun 2020 18:47:54 +0800 Subject: [PATCH 13/83] mptcp: bugfix for RM_ADDR option parsing In MPTCPOPT_RM_ADDR option parsing, the pointer "ptr" pointed to the "Subtype" octet, the pointer "ptr+1" pointed to the "Address ID" octet: +-------+-------+---------------+ |Subtype|(resvd)| Address ID | +-------+-------+---------------+ | | ptr ptr+1 We should set mp_opt->rm_id to the value of "ptr+1", not "ptr". This patch will fix this bug. Fixes: 3df523ab582c ("mptcp: Add ADD_ADDR handling") Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts Signed-off-by: David S. Miller --- net/mptcp/options.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 01f1f4cf4902..490b92534afc 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -273,6 +273,8 @@ static void mptcp_parse_option(const struct sk_buff *skb, if (opsize != TCPOLEN_MPTCP_RM_ADDR_BASE) break; + ptr++; + mp_opt->rm_addr = 1; mp_opt->rm_id = *ptr++; pr_debug("RM_ADDR: id=%d", mp_opt->rm_id); From 8027bc0307ce59759b90679fa5d8b22949586d20 Mon Sep 17 00:00:00 2001 From: tannerlove Date: Mon, 8 Jun 2020 15:37:15 -0400 Subject: [PATCH 14/83] selftests/net: in timestamping, strncpy needs to preserve null byte MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If user passed an interface option longer than 15 characters, then device.ifr_name and hwtstamp.ifr_name became non-null-terminated strings. The compiler warned about this: timestamping.c:353:2: warning: ‘strncpy’ specified bound 16 equals \ destination size [-Wstringop-truncation] 353 | strncpy(device.ifr_name, interface, sizeof(device.ifr_name)); Fixes: cb9eff097831 ("net: new user space API for time stamping of incoming and outgoing packets") Signed-off-by: Tanner Love Acked-by: Willem de Bruijn Signed-off-by: David S. 
Miller --- tools/testing/selftests/net/timestamping.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/timestamping.c b/tools/testing/selftests/net/timestamping.c index aca3491174a1..f4bb4fef0f39 100644 --- a/tools/testing/selftests/net/timestamping.c +++ b/tools/testing/selftests/net/timestamping.c @@ -313,10 +313,16 @@ int main(int argc, char **argv) int val; socklen_t len; struct timeval next; + size_t if_len; if (argc < 2) usage(0); interface = argv[1]; + if_len = strlen(interface); + if (if_len >= IFNAMSIZ) { + printf("interface name exceeds IFNAMSIZ\n"); + exit(1); + } for (i = 2; i < argc; i++) { if (!strcasecmp(argv[i], "SO_TIMESTAMP")) @@ -350,12 +356,12 @@ int main(int argc, char **argv) bail("socket"); memset(&device, 0, sizeof(device)); - strncpy(device.ifr_name, interface, sizeof(device.ifr_name)); + memcpy(device.ifr_name, interface, if_len + 1); if (ioctl(sock, SIOCGIFADDR, &device) < 0) bail("getting interface IP address"); memset(&hwtstamp, 0, sizeof(hwtstamp)); - strncpy(hwtstamp.ifr_name, interface, sizeof(hwtstamp.ifr_name)); + memcpy(hwtstamp.ifr_name, interface, if_len + 1); hwtstamp.ifr_data = (void *)&hwconfig; memset(&hwconfig, 0, sizeof(hwconfig)); hwconfig.tx_type = From 487082fb7bd2a32b66927d2b22e3a81b072b44f0 Mon Sep 17 00:00:00 2001 From: dihu Date: Fri, 5 Jun 2020 16:46:25 +0800 Subject: [PATCH 15/83] bpf/sockmap: Fix kernel panic at __tcp_bpf_recvmsg When user application calls read() with MSG_PEEK flag to read data of bpf sockmap socket, kernel panic happens at __tcp_bpf_recvmsg+0x12c/0x350. sk_msg is not removed from ingress_msg queue after read out under MSG_PEEK flag is set. Because it's not judged whether sk_msg is the last msg of ingress_msg queue, the next sk_msg may be the head of ingress_msg queue, whose memory address of sg page is invalid. So it's necessary to add check codes to prevent this problem. [20759.125457] BUG: kernel NULL pointer dereference, address: 0000000000000008 [20759.132118] CPU: 53 PID: 51378 Comm: envoy Tainted: G E 5.4.32 #1 [20759.140890] Hardware name: Inspur SA5212M4/YZMB-00370-109, BIOS 4.1.12 06/18/2017 [20759.149734] RIP: 0010:copy_page_to_iter+0xad/0x300 [20759.270877] __tcp_bpf_recvmsg+0x12c/0x350 [20759.276099] tcp_bpf_recvmsg+0x113/0x370 [20759.281137] inet_recvmsg+0x55/0xc0 [20759.285734] __sys_recvfrom+0xc8/0x130 [20759.290566] ? __audit_syscall_entry+0x103/0x130 [20759.296227] ? syscall_trace_enter+0x1d2/0x2d0 [20759.301700] ? 
__audit_syscall_exit+0x1e4/0x290 [20759.307235] __x64_sys_recvfrom+0x24/0x30 [20759.312226] do_syscall_64+0x55/0x1b0 [20759.316852] entry_SYSCALL_64_after_hwframe+0x44/0xa9 Signed-off-by: dihu Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20200605084625.9783-1-anny.hu@linux.alibaba.com --- net/ipv4/tcp_bpf.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 629aaa9a1eb9..2b915aafda42 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -64,6 +64,9 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, } while (i != msg_rx->sg.end); if (unlikely(peek)) { + if (msg_rx == list_last_entry(&psock->ingress_msg, + struct sk_msg, list)) + break; msg_rx = list_next_entry(msg_rx, list); continue; } From 33a7c831565c43a7ee2f38c7df4c4a40e1dfdfed Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sun, 7 Jun 2020 22:52:28 +0200 Subject: [PATCH 16/83] bpf, sockhash: Fix memory leak when unlinking sockets in sock_hash_free When sockhash gets destroyed while sockets are still linked to it, we will walk the bucket lists and delete the links. However, we are not freeing the list elements after processing them, leaking the memory. The leak can be triggered by close()'ing a sockhash map when it still contains sockets, and observed with kmemleak: unreferenced object 0xffff888116e86f00 (size 64): comm "race_sock_unlin", pid 223, jiffies 4294731063 (age 217.404s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 81 de e8 41 00 00 00 00 c0 69 2f 15 81 88 ff ff ...A.....i/..... backtrace: [<00000000dd089ebb>] sock_hash_update_common+0x4ca/0x760 [<00000000b8219bd5>] sock_hash_update_elem+0x1d2/0x200 [<000000005e2c23de>] __do_sys_bpf+0x2046/0x2990 [<00000000d0084618>] do_syscall_64+0xad/0x9a0 [<000000000d96f263>] entry_SYSCALL_64_after_hwframe+0x49/0xb3 Fix it by freeing the list element when we're done with it. Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200607205229.2389672-2-jakub@cloudflare.com --- net/core/sock_map.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 00a26cf2cfe9..ea46f07a22d8 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1031,6 +1031,7 @@ static void sock_hash_free(struct bpf_map *map) sock_map_unref(elem->sk, elem); rcu_read_unlock(); release_sock(elem->sk); + sock_hash_free_elem(htab, elem); } } From 75e68e5bf2c7fa9d3e874099139df03d5952a3e1 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sun, 7 Jun 2020 22:52:29 +0200 Subject: [PATCH 17/83] bpf, sockhash: Synchronize delete from bucket list on map free We can end up modifying the sockhash bucket list from two CPUs when a sockhash is being destroyed (sock_hash_free) on one CPU, while a socket that is in the sockhash is unlinking itself from it on another CPU it (sock_hash_delete_from_link). 
This results in accessing a list element that is in an undefined state as reported by KASAN: | ================================================================== | BUG: KASAN: wild-memory-access in sock_hash_free+0x13c/0x280 | Write of size 8 at addr dead000000000122 by task kworker/2:1/95 | | CPU: 2 PID: 95 Comm: kworker/2:1 Not tainted 5.7.0-rc7-02961-ge22c35ab0038-dirty #691 | Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-buildvm-ppc64le-16.ppc.fedoraproject.org-3.fc31 04/01/2014 | Workqueue: events bpf_map_free_deferred | Call Trace: | dump_stack+0x97/0xe0 | ? sock_hash_free+0x13c/0x280 | __kasan_report.cold+0x5/0x40 | ? mark_lock+0xbc1/0xc00 | ? sock_hash_free+0x13c/0x280 | kasan_report+0x38/0x50 | ? sock_hash_free+0x152/0x280 | sock_hash_free+0x13c/0x280 | bpf_map_free_deferred+0xb2/0xd0 | ? bpf_map_charge_finish+0x50/0x50 | ? rcu_read_lock_sched_held+0x81/0xb0 | ? rcu_read_lock_bh_held+0x90/0x90 | process_one_work+0x59a/0xac0 | ? lock_release+0x3b0/0x3b0 | ? pwq_dec_nr_in_flight+0x110/0x110 | ? rwlock_bug.part.0+0x60/0x60 | worker_thread+0x7a/0x680 | ? _raw_spin_unlock_irqrestore+0x4c/0x60 | kthread+0x1cc/0x220 | ? process_one_work+0xac0/0xac0 | ? kthread_create_on_node+0xa0/0xa0 | ret_from_fork+0x24/0x30 | ================================================================== Fix it by reintroducing spin-lock protected critical section around the code that removes the elements from the bucket on sockhash free. To do that we also need to defer processing of removed elements, until out of atomic context so that we can unlink the socket from the map when holding the sock lock. Fixes: 90db6d772f74 ("bpf, sockmap: Remove bucket->lock from sock_{hash|map}_free") Reported-by: Eric Dumazet Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200607205229.2389672-3-jakub@cloudflare.com --- net/core/sock_map.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index ea46f07a22d8..17a40a947546 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1013,6 +1013,7 @@ static void sock_hash_free(struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct bpf_htab_bucket *bucket; + struct hlist_head unlink_list; struct bpf_htab_elem *elem; struct hlist_node *node; int i; @@ -1024,13 +1025,31 @@ static void sock_hash_free(struct bpf_map *map) synchronize_rcu(); for (i = 0; i < htab->buckets_num; i++) { bucket = sock_hash_select_bucket(htab, i); - hlist_for_each_entry_safe(elem, node, &bucket->head, node) { - hlist_del_rcu(&elem->node); + + /* We are racing with sock_hash_delete_from_link to + * enter the spin-lock critical section. Every socket on + * the list is still linked to sockhash. Since link + * exists, psock exists and holds a ref to socket. That + * lets us to grab a socket ref too. + */ + raw_spin_lock_bh(&bucket->lock); + hlist_for_each_entry(elem, &bucket->head, node) + sock_hold(elem->sk); + hlist_move_list(&bucket->head, &unlink_list); + raw_spin_unlock_bh(&bucket->lock); + + /* Process removed entries out of atomic context to + * block for socket lock before deleting the psock's + * link to sockhash. 
+ */ + hlist_for_each_entry_safe(elem, node, &unlink_list, node) { + hlist_del(&elem->node); lock_sock(elem->sk); rcu_read_lock(); sock_map_unref(elem->sk, elem); rcu_read_unlock(); release_sock(elem->sk); + sock_put(elem->sk); sock_hash_free_elem(htab, elem); } } From 21a85bd601ee50f2796d52c542c46d04e21cedac Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 8 Jun 2020 10:42:57 +0100 Subject: [PATCH 18/83] scripts: Require pahole v1.16 when generating BTF bpf_iter requires the kernel BTF to be generated with pahole >= 1.16, since otherwise the function definitions that the iterator attaches to are not included. This failure mode is indistiguishable from trying to attach to an iterator that really doesn't exist. Since it's really easy to miss this requirement, bump the pahole version check used at build time to at least 1.16. Fixes: 15d83c4d7cef ("bpf: Allow loading of a bpf_iter program") Suggested-by: Ivan Babrou Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200608094257.47366-1-lmb@cloudflare.com --- scripts/link-vmlinux.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index 3adef49250af..a37875904ca6 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -143,8 +143,8 @@ gen_btf() fi pahole_ver=$(${PAHOLE} --version | sed -E 's/v([0-9]+)\.([0-9]+)/\1\2/') - if [ "${pahole_ver}" -lt "113" ]; then - echo >&2 "BTF: ${1}: pahole version $(${PAHOLE} --version) is too old, need at least v1.13" + if [ "${pahole_ver}" -lt "116" ]; then + echo >&2 "BTF: ${1}: pahole version $(${PAHOLE} --version) is too old, need at least v1.16" return 1 fi From 22d5bd6867364b41576a712755271a7d6161abd6 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Mon, 8 Jun 2020 14:45:32 +0200 Subject: [PATCH 19/83] tracing/probe: Fix bpf_task_fd_query() for kprobes and uprobes Commit 60d53e2c3b75 ("tracing/probe: Split trace_event related data from trace_probe") removed the trace_[ku]probe structure from the trace_event_call->data pointer. As bpf_get_[ku]probe_info() were forgotten in that change, fix them now. These functions are currently only used by the bpf_task_fd_query() syscall handler to collect information about a perf event. 
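For context, these helpers sit behind the BPF_TASK_FD_QUERY command: given a pid and a perf event fd, user space can ask which kprobe/uprobe (or tracepoint) that fd is attached to. A sketch of such a query through libbpf's bpf_task_fd_query() wrapper as exported in this era; the surrounding setup (creating the perf event, loading a program) is omitted and the buffer size is arbitrary:

#include <stdio.h>
#include <bpf/bpf.h>

/* Ask the kernel what 'perf_fd' in task 'pid' is attached to. For
 * kprobe/uprobe perf events this goes through bpf_get_kprobe_info()/
 * bpf_get_uprobe_info(), which this patch fixes. */
void query_perf_event(int pid, int perf_fd)
{
	char buf[256];
	__u32 buf_len = sizeof(buf), prog_id = 0, fd_type = 0;
	__u64 probe_offset = 0, probe_addr = 0;

	if (bpf_task_fd_query(pid, perf_fd, 0, buf, &buf_len, &prog_id,
			      &fd_type, &probe_offset, &probe_addr)) {
		perror("bpf_task_fd_query");
		return;
	}
	printf("prog_id=%u fd_type=%u name=%s offset=%llu addr=%llu\n",
	       prog_id, fd_type, buf,
	       (unsigned long long)probe_offset,
	       (unsigned long long)probe_addr);
}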
Fixes: 60d53e2c3b75 ("tracing/probe: Split trace_event related data from trace_probe") Signed-off-by: Jean-Philippe Brucker Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Acked-by: Masami Hiramatsu Link: https://lore.kernel.org/bpf/20200608124531.819838-1-jean-philippe@linaro.org --- kernel/trace/trace_kprobe.c | 2 +- kernel/trace/trace_uprobe.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 35989383ae11..8eeb95e04bf5 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1629,7 +1629,7 @@ int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type, if (perf_type_tracepoint) tk = find_trace_kprobe(pevent, group); else - tk = event->tp_event->data; + tk = trace_kprobe_primary_from_call(event->tp_event); if (!tk) return -EINVAL; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 2a8e8e9c1c75..fdd47f99b18f 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1412,7 +1412,7 @@ int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type, if (perf_type_tracepoint) tu = find_probe_event(pevent, group); else - tu = event->tp_event->data; + tu = trace_uprobe_primary_from_call(event->tp_event); if (!tu) return -EINVAL; From 26afa0a4eb3fd87757f9de56ec5db5a03b14e120 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 8 Jun 2020 09:17:23 -0600 Subject: [PATCH 20/83] bpf: Reset data_meta before running programs attached to devmap entry This is a new context that does not handle metadata at the moment, so mark data_meta invalid. Fixes: fbee97feed9b ("bpf: Add support to attach bpf program to a devmap entry") Signed-off-by: David Ahern Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200608151723.9539-1-dsahern@kernel.org --- kernel/bpf/devmap.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 854b09beb16b..bfdff2faf5cb 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -479,6 +479,7 @@ static struct xdp_buff *dev_map_run_prog(struct net_device *dev, struct xdp_txq_info txq = { .dev = dev }; u32 act; + xdp_set_data_meta_invalid(xdp); xdp->txq = &txq; act = bpf_prog_run_xdp(xdp_prog, xdp); From 248e00ac47d64e153b9c50f45aad73cd61894a73 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 8 Jun 2020 17:22:01 +0100 Subject: [PATCH 21/83] bpf: cgroup: Allow multi-attach program to replace itself When using BPF_PROG_ATTACH to attach a program to a cgroup in BPF_F_ALLOW_MULTI mode, it is not possible to replace a program with itself. This is because the check for duplicate programs doesn't take the replacement program into account. Replacing a program with itself might seem weird, but it has some uses: first, it allows resetting the associated cgroup storage. Second, it makes the API consistent with the non-ALLOW_MULTI usage, where it is possible to replace a program with itself. Third, it aligns BPF_PROG_ATTACH with bpf_link, where replacing itself is also supported. Sice this code has been refactored a few times this change will only apply to v5.7 and later. 
Adjustments could be made to commit 1020c1f24a94 ("bpf: Simplify __cgroup_bpf_attach") and commit d7bf2c10af05 ("bpf: allocate cgroup storage entries on attaching bpf programs") as well as commit 324bda9e6c5a ("bpf: multi program support for cgroup+bpf") Fixes: af6eea57437a ("bpf: Implement bpf_link-based cgroup BPF program attachment") Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200608162202.94002-1-lmb@cloudflare.com --- kernel/bpf/cgroup.c | 2 +- .../testing/selftests/bpf/prog_tests/cgroup_attach_multi.c | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index fdf7836750a3..4d76f16524cc 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -378,7 +378,7 @@ static struct bpf_prog_list *find_attach_entry(struct list_head *progs, } list_for_each_entry(pl, progs, node) { - if (prog && pl->prog == prog) + if (prog && pl->prog == prog && prog != replace_prog) /* disallow attaching the same prog twice */ return ERR_PTR(-EINVAL); if (link && pl->link == link) diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c index 139f8e82c7c6..b549fcfacc0b 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c @@ -230,6 +230,13 @@ void test_cgroup_attach_multi(void) "prog_replace", "errno=%d\n", errno)) goto err; + /* replace program with itself */ + attach_opts.replace_prog_fd = allow_prog[6]; + if (CHECK(bpf_prog_attach_xattr(allow_prog[6], cg1, + BPF_CGROUP_INET_EGRESS, &attach_opts), + "prog_replace", "errno=%d\n", errno)) + goto err; + value = 0; CHECK_FAIL(bpf_map_update_elem(map_fd, &key, &value, 0)); CHECK_FAIL(system(PING_CMD)); From 281920b7e0b31e0a7706433ff58e7d52ac97c327 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 9 Jun 2020 15:31:46 +0200 Subject: [PATCH 22/83] bpf: Devmap adjust uapi for attach bpf program V2: - Defer changing BPF-syscall to start at file-descriptor 1 - Use {} to zero initialise struct. The recent commit fbee97feed9b ("bpf: Add support to attach bpf program to a devmap entry"), introduced ability to attach (and run) a separate XDP bpf_prog for each devmap entry. A bpf_prog is added via a file-descriptor. As zero were a valid FD, not using the feature requires using value minus-1. The UAPI is extended via tail-extending struct bpf_devmap_val and using map->value_size to determine the feature set. This will break older userspace applications not using the bpf_prog feature. Consider an old userspace app that is compiled against newer kernel uapi/bpf.h, it will not know that it need to initialise the member bpf_prog.fd to minus-1. Thus, users will be forced to update source code to get program running on newer kernels. This patch remove the minus-1 checks, and have zero mean feature isn't used. Followup patches either for kernel or libbpf should handle and avoid returning file-descriptor zero in the first place. 
Fixes: fbee97feed9b ("bpf: Add support to attach bpf program to a devmap entry") Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/159170950687.2102545.7235914718298050113.stgit@firesoul --- include/uapi/linux/bpf.h | 13 +++++++++++++ kernel/bpf/devmap.c | 17 ++++------------- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c65b374a5090..19684813faae 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3761,6 +3761,19 @@ struct xdp_md { __u32 egress_ifindex; /* txq->dev->ifindex */ }; +/* DEVMAP map-value layout + * + * The struct data-layout of map-value is a configuration interface. + * New members can only be added to the end of this structure. + */ +struct bpf_devmap_val { + __u32 ifindex; /* device index */ + union { + int fd; /* prog fd on map write */ + __u32 id; /* prog id on map read */ + } bpf_prog; +}; + enum sk_action { SK_DROP = 0, SK_PASS, diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index bfdff2faf5cb..0cbb72cdaf63 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -60,15 +60,6 @@ struct xdp_dev_bulk_queue { unsigned int count; }; -/* DEVMAP values */ -struct bpf_devmap_val { - u32 ifindex; /* device index */ - union { - int fd; /* prog fd on map write */ - u32 id; /* prog id on map read */ - } bpf_prog; -}; - struct bpf_dtab_netdev { struct net_device *dev; /* must be first member, due to tracepoint */ struct hlist_node index_hlist; @@ -619,7 +610,7 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, if (!dev->dev) goto err_out; - if (val->bpf_prog.fd >= 0) { + if (val->bpf_prog.fd > 0) { prog = bpf_prog_get_type_dev(val->bpf_prog.fd, BPF_PROG_TYPE_XDP, false); if (IS_ERR(prog)) @@ -653,8 +644,8 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - struct bpf_devmap_val val = { .bpf_prog.fd = -1 }; struct bpf_dtab_netdev *dev, *old_dev; + struct bpf_devmap_val val = {}; u32 i = *(u32 *)key; if (unlikely(map_flags > BPF_EXIST)) @@ -670,7 +661,7 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map, if (!val.ifindex) { dev = NULL; /* can not specify fd if ifindex is 0 */ - if (val.bpf_prog.fd != -1) + if (val.bpf_prog.fd > 0) return -EINVAL; } else { dev = __dev_map_alloc_node(net, dtab, &val, i); @@ -700,8 +691,8 @@ static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - struct bpf_devmap_val val = { .bpf_prog.fd = -1 }; struct bpf_dtab_netdev *dev, *old_dev; + struct bpf_devmap_val val = {}; u32 idx = *(u32 *)key; unsigned long flags; int err = -EEXIST; From 042b1545fe47788e734b0f074a8ae65856015cdf Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 9 Jun 2020 15:31:52 +0200 Subject: [PATCH 23/83] bpf: Selftests and tools use struct bpf_devmap_val from uapi Sync tools uapi bpf.h header file and update selftests that use struct bpf_devmap_val. 
Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/159170951195.2102545.1833108712124273987.stgit@firesoul --- tools/include/uapi/linux/bpf.h | 13 +++++++++++++ .../selftests/bpf/prog_tests/xdp_devmap_attach.c | 8 -------- .../selftests/bpf/progs/test_xdp_devmap_helpers.c | 2 +- .../bpf/progs/test_xdp_with_devmap_helpers.c | 3 +-- 4 files changed, 15 insertions(+), 11 deletions(-) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index c65b374a5090..19684813faae 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3761,6 +3761,19 @@ struct xdp_md { __u32 egress_ifindex; /* txq->dev->ifindex */ }; +/* DEVMAP map-value layout + * + * The struct data-layout of map-value is a configuration interface. + * New members can only be added to the end of this structure. + */ +struct bpf_devmap_val { + __u32 ifindex; /* device index */ + union { + int fd; /* prog fd on map write */ + __u32 id; /* prog id on map read */ + } bpf_prog; +}; + enum sk_action { SK_DROP = 0, SK_PASS, diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c b/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c index d19dbd668f6a..88ef3ec8ac4c 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c @@ -8,14 +8,6 @@ #define IFINDEX_LO 1 -struct bpf_devmap_val { - u32 ifindex; /* device index */ - union { - int fd; /* prog fd on map write */ - u32 id; /* prog id on map read */ - } bpf_prog; -}; - void test_xdp_with_devmap_helpers(void) { struct test_xdp_with_devmap_helpers *skel; diff --git a/tools/testing/selftests/bpf/progs/test_xdp_devmap_helpers.c b/tools/testing/selftests/bpf/progs/test_xdp_devmap_helpers.c index e5c0f131c8a7..b360ba2bd441 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp_devmap_helpers.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_devmap_helpers.c @@ -2,7 +2,7 @@ /* fails to load without expected_attach_type = BPF_XDP_DEVMAP * because of access to egress_ifindex */ -#include "vmlinux.h" +#include #include SEC("xdp_dm_log") diff --git a/tools/testing/selftests/bpf/progs/test_xdp_with_devmap_helpers.c b/tools/testing/selftests/bpf/progs/test_xdp_with_devmap_helpers.c index deef0e050863..330811260123 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp_with_devmap_helpers.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_with_devmap_helpers.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 - -#include "vmlinux.h" +#include #include struct { From 845e0ebb4408d4473cf60d21224a897037e9a77a Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 8 Jun 2020 14:53:01 -0700 Subject: [PATCH 24/83] net: change addr_list_lock back to static key The dynamic key update for addr_list_lock still causes troubles, for example the following race condition still exists: CPU 0: CPU 1: (RCU read lock) (RTNL lock) dev_mc_seq_show() netdev_update_lockdep_key() -> lockdep_unregister_key() -> netif_addr_lock_bh() because lockdep doesn't provide an API to update it atomically. Therefore, we have to move it back to static keys and use subclass for nest locking like before. In commit 1a33e10e4a95 ("net: partially revert dynamic lockdep key changes"), I already reverted most parts of commit ab92d68fc22f ("net: core: add generic lockdep keys"). This patch reverts the rest and also part of commit f3b0a18bb6cb ("net: remove unnecessary variables and callback"). 
After this patch, addr_list_lock changes back to using static keys and subclasses to satisfy lockdep. Thanks to dev->lower_level, we do not have to change back to ->ndo_get_lock_subclass(). And hopefully this reduces some syzbot lockdep noises too. Reported-by: syzbot+f3a0e80c34b3fc28ac5e@syzkaller.appspotmail.com Cc: Taehee Yoo Cc: Dmitry Vyukov Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 2 -- drivers/net/bonding/bond_options.c | 2 -- drivers/net/hamradio/bpqether.c | 2 ++ drivers/net/macsec.c | 5 ++++ drivers/net/macvlan.c | 13 ++++++-- drivers/net/vxlan.c | 4 +-- .../net/wireless/intersil/hostap/hostap_hw.c | 3 ++ include/linux/netdevice.h | 12 +++++--- net/8021q/vlan_dev.c | 8 +++-- net/batman-adv/soft-interface.c | 2 ++ net/bridge/br_device.c | 8 +++++ net/core/dev.c | 30 ++++++++++--------- net/core/dev_addr_lists.c | 12 ++++---- net/core/rtnetlink.c | 1 - net/dsa/master.c | 4 +++ net/netrom/af_netrom.c | 2 ++ net/rose/af_rose.c | 2 ++ 17 files changed, 76 insertions(+), 36 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index a25c65d4af71..004919aea5fb 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -3687,8 +3687,6 @@ static int bond_do_ioctl(struct net_device *bond_dev, struct ifreq *ifr, int cmd case BOND_RELEASE_OLD: case SIOCBONDRELEASE: res = bond_release(bond_dev, slave_dev); - if (!res) - netdev_update_lockdep_key(slave_dev); break; case BOND_SETHWADDR_OLD: case SIOCBONDSETHWADDR: diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c index 215c10923289..ddb3916d3506 100644 --- a/drivers/net/bonding/bond_options.c +++ b/drivers/net/bonding/bond_options.c @@ -1398,8 +1398,6 @@ static int bond_option_slaves_set(struct bonding *bond, case '-': slave_dbg(bond->dev, dev, "Releasing interface\n"); ret = bond_release(bond->dev, dev); - if (!ret) - netdev_update_lockdep_key(dev); break; default: diff --git a/drivers/net/hamradio/bpqether.c b/drivers/net/hamradio/bpqether.c index 60dcaf2a04a9..1ad6085994b1 100644 --- a/drivers/net/hamradio/bpqether.c +++ b/drivers/net/hamradio/bpqether.c @@ -113,6 +113,7 @@ static LIST_HEAD(bpq_devices); * off into a separate class since they always nest. 
*/ static struct lock_class_key bpq_netdev_xmit_lock_key; +static struct lock_class_key bpq_netdev_addr_lock_key; static void bpq_set_lockdep_class_one(struct net_device *dev, struct netdev_queue *txq, @@ -123,6 +124,7 @@ static void bpq_set_lockdep_class_one(struct net_device *dev, static void bpq_set_lockdep_class(struct net_device *dev) { + lockdep_set_class(&dev->addr_list_lock, &bpq_netdev_addr_lock_key); netdev_for_each_tx_queue(dev, bpq_set_lockdep_class_one, NULL); } diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index 20b53e255f68..e56547bfdac9 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -3999,6 +3999,8 @@ static int macsec_add_dev(struct net_device *dev, sci_t sci, u8 icv_len) return 0; } +static struct lock_class_key macsec_netdev_addr_lock_key; + static int macsec_newlink(struct net *net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) @@ -4050,6 +4052,9 @@ static int macsec_newlink(struct net *net, struct net_device *dev, return err; netdev_lockdep_set_classes(dev); + lockdep_set_class_and_subclass(&dev->addr_list_lock, + &macsec_netdev_addr_lock_key, + dev->lower_level); err = netdev_upper_dev_link(real_dev, dev, extack); if (err < 0) diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 563aed5b3d9f..6a6cc9f75307 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -860,6 +860,8 @@ static int macvlan_do_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) * "super class" of normal network devices; split their locks off into a * separate class since they always nest. */ +static struct lock_class_key macvlan_netdev_addr_lock_key; + #define ALWAYS_ON_OFFLOADS \ (NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE | \ NETIF_F_GSO_ROBUST | NETIF_F_GSO_ENCAP_ALL) @@ -875,6 +877,14 @@ static int macvlan_do_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) #define MACVLAN_STATE_MASK \ ((1<<__LINK_STATE_NOCARRIER) | (1<<__LINK_STATE_DORMANT)) +static void macvlan_set_lockdep_class(struct net_device *dev) +{ + netdev_lockdep_set_classes(dev); + lockdep_set_class_and_subclass(&dev->addr_list_lock, + &macvlan_netdev_addr_lock_key, + dev->lower_level); +} + static int macvlan_init(struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); @@ -892,8 +902,7 @@ static int macvlan_init(struct net_device *dev) dev->gso_max_size = lowerdev->gso_max_size; dev->gso_max_segs = lowerdev->gso_max_segs; dev->hard_header_len = lowerdev->hard_header_len; - - netdev_lockdep_set_classes(dev); + macvlan_set_lockdep_class(dev); vlan->pcpu_stats = netdev_alloc_pcpu_stats(struct vlan_pcpu_stats); if (!vlan->pcpu_stats) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 5bb448ae6c9c..47424b2da643 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -4245,10 +4245,8 @@ static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[], mod_timer(&vxlan->age_timer, jiffies); netdev_adjacent_change_commit(dst->remote_dev, lowerdev, dev); - if (lowerdev && lowerdev != dst->remote_dev) { + if (lowerdev && lowerdev != dst->remote_dev) dst->remote_dev = lowerdev; - netdev_update_lockdep_key(lowerdev); - } vxlan_config_apply(dev, &conf, lowerdev, vxlan->net, true); return 0; } diff --git a/drivers/net/wireless/intersil/hostap/hostap_hw.c b/drivers/net/wireless/intersil/hostap/hostap_hw.c index aadf3dec5bf3..2ab34cf74ecc 100644 --- a/drivers/net/wireless/intersil/hostap/hostap_hw.c +++ b/drivers/net/wireless/intersil/hostap/hostap_hw.c @@ -3048,6 +3048,7 @@ static 
void prism2_clear_set_tim_queue(local_info_t *local) * This is a natural nesting, which needs a split lock type. */ static struct lock_class_key hostap_netdev_xmit_lock_key; +static struct lock_class_key hostap_netdev_addr_lock_key; static void prism2_set_lockdep_class_one(struct net_device *dev, struct netdev_queue *txq, @@ -3059,6 +3060,8 @@ static void prism2_set_lockdep_class_one(struct net_device *dev, static void prism2_set_lockdep_class(struct net_device *dev) { + lockdep_set_class(&dev->addr_list_lock, + &hostap_netdev_addr_lock_key); netdev_for_each_tx_queue(dev, prism2_set_lockdep_class_one, NULL); } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1a96e9c4ec36..e2825e27ef89 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1821,8 +1821,6 @@ enum netdev_priv_flags { * for hardware timestamping * @sfp_bus: attached &struct sfp_bus structure. * - * @addr_list_lock_key: lockdep class annotating - * net_device->addr_list_lock spinlock * @qdisc_tx_busylock: lockdep class annotating Qdisc->busylock spinlock * @qdisc_running_key: lockdep class annotating Qdisc->running seqcount * @@ -2125,7 +2123,6 @@ struct net_device { #endif struct phy_device *phydev; struct sfp_bus *sfp_bus; - struct lock_class_key addr_list_lock_key; struct lock_class_key *qdisc_tx_busylock; struct lock_class_key *qdisc_running_key; bool proto_down; @@ -2217,10 +2214,13 @@ static inline void netdev_for_each_tx_queue(struct net_device *dev, static struct lock_class_key qdisc_tx_busylock_key; \ static struct lock_class_key qdisc_running_key; \ static struct lock_class_key qdisc_xmit_lock_key; \ + static struct lock_class_key dev_addr_list_lock_key; \ unsigned int i; \ \ (dev)->qdisc_tx_busylock = &qdisc_tx_busylock_key; \ (dev)->qdisc_running_key = &qdisc_running_key; \ + lockdep_set_class(&(dev)->addr_list_lock, \ + &dev_addr_list_lock_key); \ for (i = 0; i < (dev)->num_tx_queues; i++) \ lockdep_set_class(&(dev)->_tx[i]._xmit_lock, \ &qdisc_xmit_lock_key); \ @@ -3253,7 +3253,6 @@ static inline void netif_stop_queue(struct net_device *dev) } void netif_tx_stop_all_queues(struct net_device *dev); -void netdev_update_lockdep_key(struct net_device *dev); static inline bool netif_tx_queue_stopped(const struct netdev_queue *dev_queue) { @@ -4239,6 +4238,11 @@ static inline void netif_addr_lock(struct net_device *dev) spin_lock(&dev->addr_list_lock); } +static inline void netif_addr_lock_nested(struct net_device *dev) +{ + spin_lock_nested(&dev->addr_list_lock, dev->lower_level); +} + static inline void netif_addr_lock_bh(struct net_device *dev) { spin_lock_bh(&dev->addr_list_lock); diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index f00bb57f0f60..c8d6a07e23c5 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -494,6 +494,7 @@ static void vlan_dev_set_rx_mode(struct net_device *vlan_dev) * separate class since they always nest. 
*/ static struct lock_class_key vlan_netdev_xmit_lock_key; +static struct lock_class_key vlan_netdev_addr_lock_key; static void vlan_dev_set_lockdep_one(struct net_device *dev, struct netdev_queue *txq, @@ -502,8 +503,11 @@ static void vlan_dev_set_lockdep_one(struct net_device *dev, lockdep_set_class(&txq->_xmit_lock, &vlan_netdev_xmit_lock_key); } -static void vlan_dev_set_lockdep_class(struct net_device *dev) +static void vlan_dev_set_lockdep_class(struct net_device *dev, int subclass) { + lockdep_set_class_and_subclass(&dev->addr_list_lock, + &vlan_netdev_addr_lock_key, + subclass); netdev_for_each_tx_queue(dev, vlan_dev_set_lockdep_one, NULL); } @@ -597,7 +601,7 @@ static int vlan_dev_init(struct net_device *dev) SET_NETDEV_DEVTYPE(dev, &vlan_type); - vlan_dev_set_lockdep_class(dev); + vlan_dev_set_lockdep_class(dev, dev->lower_level); vlan->vlan_pcpu_stats = netdev_alloc_pcpu_stats(struct vlan_pcpu_stats); if (!vlan->vlan_pcpu_stats) diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 0ddd80130ea3..f1f1c86f3419 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -745,6 +745,7 @@ static int batadv_interface_kill_vid(struct net_device *dev, __be16 proto, * separate class since they always nest. */ static struct lock_class_key batadv_netdev_xmit_lock_key; +static struct lock_class_key batadv_netdev_addr_lock_key; /** * batadv_set_lockdep_class_one() - Set lockdep class for a single tx queue @@ -765,6 +766,7 @@ static void batadv_set_lockdep_class_one(struct net_device *dev, */ static void batadv_set_lockdep_class(struct net_device *dev) { + lockdep_set_class(&dev->addr_list_lock, &batadv_netdev_addr_lock_key); netdev_for_each_tx_queue(dev, batadv_set_lockdep_class_one, NULL); } diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 8ec1362588af..8c7b78f8bc23 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -105,6 +105,13 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; } +static struct lock_class_key bridge_netdev_addr_lock_key; + +static void br_set_lockdep_class(struct net_device *dev) +{ + lockdep_set_class(&dev->addr_list_lock, &bridge_netdev_addr_lock_key); +} + static int br_dev_init(struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); @@ -143,6 +150,7 @@ static int br_dev_init(struct net_device *dev) br_fdb_hash_fini(br); } + br_set_lockdep_class(dev); return err; } diff --git a/net/core/dev.c b/net/core/dev.c index 061496a1f640..6bc2388141f6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -439,6 +439,7 @@ static const char *const netdev_lock_name[] = { "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"}; static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)]; +static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)]; static inline unsigned short netdev_lock_pos(unsigned short dev_type) { @@ -460,11 +461,25 @@ static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i], netdev_lock_name[i]); } + +static inline void netdev_set_addr_lockdep_class(struct net_device *dev) +{ + int i; + + i = netdev_lock_pos(dev->type); + lockdep_set_class_and_name(&dev->addr_list_lock, + &netdev_addr_lock_key[i], + netdev_lock_name[i]); +} #else static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, unsigned short dev_type) { } + +static inline void netdev_set_addr_lockdep_class(struct net_device *dev) +{ +} #endif 
/******************************************************************************* @@ -9373,15 +9388,6 @@ void netif_tx_stop_all_queues(struct net_device *dev) } EXPORT_SYMBOL(netif_tx_stop_all_queues); -void netdev_update_lockdep_key(struct net_device *dev) -{ - lockdep_unregister_key(&dev->addr_list_lock_key); - lockdep_register_key(&dev->addr_list_lock_key); - - lockdep_set_class(&dev->addr_list_lock, &dev->addr_list_lock_key); -} -EXPORT_SYMBOL(netdev_update_lockdep_key); - /** * register_netdevice - register a network device * @dev: device to register @@ -9420,7 +9426,7 @@ int register_netdevice(struct net_device *dev) return ret; spin_lock_init(&dev->addr_list_lock); - lockdep_set_class(&dev->addr_list_lock, &dev->addr_list_lock_key); + netdev_set_addr_lockdep_class(dev); ret = dev_get_valid_name(net, dev, dev->name); if (ret < 0) @@ -9939,8 +9945,6 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev_net_set(dev, &init_net); - lockdep_register_key(&dev->addr_list_lock_key); - dev->gso_max_size = GSO_MAX_SIZE; dev->gso_max_segs = GSO_MAX_SEGS; dev->upper_level = 1; @@ -10028,8 +10032,6 @@ void free_netdev(struct net_device *dev) free_percpu(dev->xdp_bulkq); dev->xdp_bulkq = NULL; - lockdep_unregister_key(&dev->addr_list_lock_key); - /* Compatibility with error handling in drivers */ if (dev->reg_state == NETREG_UNINITIALIZED) { netdev_freemem(dev); diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index 2f949b5a1eb9..6393ba930097 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -637,7 +637,7 @@ int dev_uc_sync(struct net_device *to, struct net_device *from) if (to->addr_len != from->addr_len) return -EINVAL; - netif_addr_lock(to); + netif_addr_lock_nested(to); err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len); if (!err) __dev_set_rx_mode(to); @@ -667,7 +667,7 @@ int dev_uc_sync_multiple(struct net_device *to, struct net_device *from) if (to->addr_len != from->addr_len) return -EINVAL; - netif_addr_lock(to); + netif_addr_lock_nested(to); err = __hw_addr_sync_multiple(&to->uc, &from->uc, to->addr_len); if (!err) __dev_set_rx_mode(to); @@ -691,7 +691,7 @@ void dev_uc_unsync(struct net_device *to, struct net_device *from) return; netif_addr_lock_bh(from); - netif_addr_lock(to); + netif_addr_lock_nested(to); __hw_addr_unsync(&to->uc, &from->uc, to->addr_len); __dev_set_rx_mode(to); netif_addr_unlock(to); @@ -858,7 +858,7 @@ int dev_mc_sync(struct net_device *to, struct net_device *from) if (to->addr_len != from->addr_len) return -EINVAL; - netif_addr_lock(to); + netif_addr_lock_nested(to); err = __hw_addr_sync(&to->mc, &from->mc, to->addr_len); if (!err) __dev_set_rx_mode(to); @@ -888,7 +888,7 @@ int dev_mc_sync_multiple(struct net_device *to, struct net_device *from) if (to->addr_len != from->addr_len) return -EINVAL; - netif_addr_lock(to); + netif_addr_lock_nested(to); err = __hw_addr_sync_multiple(&to->mc, &from->mc, to->addr_len); if (!err) __dev_set_rx_mode(to); @@ -912,7 +912,7 @@ void dev_mc_unsync(struct net_device *to, struct net_device *from) return; netif_addr_lock_bh(from); - netif_addr_lock(to); + netif_addr_lock_nested(to); __hw_addr_unsync(&to->mc, &from->mc, to->addr_len); __dev_set_rx_mode(to); netif_addr_unlock(to); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 2269199c5891..9aedc15736ad 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2462,7 +2462,6 @@ static int do_set_master(struct net_device *dev, int ifindex, err = ops->ndo_del_slave(upper_dev, dev); if (err) 
return err; - netdev_update_lockdep_key(dev); } else { return -EOPNOTSUPP; } diff --git a/net/dsa/master.c b/net/dsa/master.c index a621367c6e8c..480a61460c23 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -327,6 +327,8 @@ static void dsa_master_reset_mtu(struct net_device *dev) rtnl_unlock(); } +static struct lock_class_key dsa_master_addr_list_lock_key; + int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp) { int ret; @@ -345,6 +347,8 @@ int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp) wmb(); dev->dsa_ptr = cpu_dp; + lockdep_set_class(&dev->addr_list_lock, + &dsa_master_addr_list_lock_key); ret = dsa_master_ethtool_setup(dev); if (ret) return ret; diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index eccc7d366e17..f90ef6934b8f 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -70,6 +70,7 @@ static const struct proto_ops nr_proto_ops; * separate class since they always nest. */ static struct lock_class_key nr_netdev_xmit_lock_key; +static struct lock_class_key nr_netdev_addr_lock_key; static void nr_set_lockdep_one(struct net_device *dev, struct netdev_queue *txq, @@ -80,6 +81,7 @@ static void nr_set_lockdep_one(struct net_device *dev, static void nr_set_lockdep_key(struct net_device *dev) { + lockdep_set_class(&dev->addr_list_lock, &nr_netdev_addr_lock_key); netdev_for_each_tx_queue(dev, nr_set_lockdep_one, NULL); } diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index e7a872207b46..ce85656ac9c1 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -71,6 +71,7 @@ ax25_address rose_callsign; * separate class since they always nest. */ static struct lock_class_key rose_netdev_xmit_lock_key; +static struct lock_class_key rose_netdev_addr_lock_key; static void rose_set_lockdep_one(struct net_device *dev, struct netdev_queue *txq, @@ -81,6 +82,7 @@ static void rose_set_lockdep_one(struct net_device *dev, static void rose_set_lockdep_key(struct net_device *dev) { + lockdep_set_class(&dev->addr_list_lock, &rose_netdev_addr_lock_key); netdev_for_each_tx_queue(dev, rose_set_lockdep_one, NULL); } From 1a3db27ad9a72d033235b9673653962c02e3486e Mon Sep 17 00:00:00 2001 From: Valentin Longchamp Date: Tue, 9 Jun 2020 22:11:54 +0200 Subject: [PATCH 25/83] net: sched: export __netdev_watchdog_up() Since the quiesce/activate rework, __netdev_watchdog_up() is directly called in the ucc_geth driver. Unfortunately, this function is not available for modules and thus ucc_geth cannot be built as a module anymore. Fix it by exporting __netdev_watchdog_up(). Since the commit introducing the regression was backported to stable branches, this one should ideally be as well. Fixes: 79dde73cf9bc ("net/ethernet/freescale: rework quiesce/activate for ucc_geth") Signed-off-by: Valentin Longchamp Signed-off-by: David S. 
Miller --- net/sched/sch_generic.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index b19a0021a0bd..265a61d011df 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -464,6 +464,7 @@ void __netdev_watchdog_up(struct net_device *dev) dev_hold(dev); } } +EXPORT_SYMBOL_GPL(__netdev_watchdog_up); static void dev_watchdog_up(struct net_device *dev) { From 976ee3b21119dcf5c6d96233d688a1453f29fa83 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Mon, 8 Jun 2020 20:41:43 -0700 Subject: [PATCH 26/83] ionic: wait on queue start until after IFF_UP The netif_running() test looks at __LINK_STATE_START which gets set before ndo_open() is called, there is a window of time between that and when the queues are actually ready to be run. If ionic_check_link_status() notices that the link is up very soon after netif_running() becomes true, it might try to run the queues before they are ready, causing all manner of potential issues. Since the netdev->flags IFF_UP isn't set until after ndo_open() returns, we can wait for that before we allow ionic_check_link_status() to start the queues. On the way back to close, __LINK_STATE_START is cleared before calling ndo_stop(), and IFF_UP is cleared after. Both of these need to be true in order to safely stop the queues from ionic_check_link_status(). Fixes: 49d3b493673a ("ionic: disable the queues on link down") Signed-off-by: Shannon Nelson Signed-off-by: David S. Miller --- drivers/net/ethernet/pensando/ionic/ionic_lif.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_lif.c b/drivers/net/ethernet/pensando/ionic/ionic_lif.c index 7321a92f8395..fbc36e9e4729 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_lif.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_lif.c @@ -116,7 +116,7 @@ static void ionic_link_status_check(struct ionic_lif *lif) netif_carrier_on(netdev); } - if (netif_running(lif->netdev)) + if (lif->netdev->flags & IFF_UP && netif_running(lif->netdev)) ionic_start_queues(lif); } else { if (netif_carrier_ok(netdev)) { @@ -124,7 +124,7 @@ static void ionic_link_status_check(struct ionic_lif *lif) netif_carrier_off(netdev); } - if (netif_running(lif->netdev)) + if (lif->netdev->flags & IFF_UP && netif_running(lif->netdev)) ionic_stop_queues(lif); } From c96b6acc8f89a4a7f6258dfe1d077654c11415be Mon Sep 17 00:00:00 2001 From: Wang Hai Date: Tue, 9 Jun 2020 22:18:16 +0800 Subject: [PATCH 27/83] dccp: Fix possible memleak in dccp_init and dccp_fini There are some memory leaks in dccp_init() and dccp_fini(). In dccp_fini() and the error handling path in dccp_init(), free lhash2 is missing. Add inet_hashinfo2_free_mod() to do it. If inet_hashinfo2_init_mod() failed in dccp_init(), percpu_counter_destroy() should be called to destroy dccp_orphan_count. It need to goto out_free_percpu when inet_hashinfo2_init_mod() failed. Fixes: c92c81df93df ("net: dccp: fix kernel crash on module load") Reported-by: Hulk Robot Signed-off-by: Wang Hai Signed-off-by: David S. 
Miller --- include/net/inet_hashtables.h | 6 ++++++ net/dccp/proto.c | 7 +++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index ad64ba6a057f..92560974ea67 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -185,6 +185,12 @@ static inline spinlock_t *inet_ehash_lockp( int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo); +static inline void inet_hashinfo2_free_mod(struct inet_hashinfo *h) +{ + kfree(h->lhash2); + h->lhash2 = NULL; +} + static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo) { kvfree(hashinfo->ehash_locks); diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 4af8a98fe784..c13b6609474b 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -1139,14 +1139,14 @@ static int __init dccp_init(void) inet_hashinfo_init(&dccp_hashinfo); rc = inet_hashinfo2_init_mod(&dccp_hashinfo); if (rc) - goto out_fail; + goto out_free_percpu; rc = -ENOBUFS; dccp_hashinfo.bind_bucket_cachep = kmem_cache_create("dccp_bind_bucket", sizeof(struct inet_bind_bucket), 0, SLAB_HWCACHE_ALIGN, NULL); if (!dccp_hashinfo.bind_bucket_cachep) - goto out_free_percpu; + goto out_free_hashinfo2; /* * Size and allocate the main established and bind bucket @@ -1242,6 +1242,8 @@ static int __init dccp_init(void) free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order); out_free_bind_bucket_cachep: kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); +out_free_hashinfo2: + inet_hashinfo2_free_mod(&dccp_hashinfo); out_free_percpu: percpu_counter_destroy(&dccp_orphan_count); out_fail: @@ -1265,6 +1267,7 @@ static void __exit dccp_fini(void) kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); dccp_ackvec_exit(); dccp_sysctl_exit(); + inet_hashinfo2_free_mod(&dccp_hashinfo); percpu_counter_destroy(&dccp_orphan_count); } From 62a502cc91f97e3ffd312d9b42e8d01a137c63ff Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 9 Jun 2020 00:02:39 +0200 Subject: [PATCH 28/83] net: mvneta: do not redirect frames during reconfiguration Disable frames injection in mvneta_xdp_xmit routine during hw re-configuration in order to avoid hardware hangs Fixes: b0a43db9087a ("net: mvneta: add XDP_TX support") Signed-off-by: Lorenzo Bianconi Signed-off-by: David S. 
Miller --- drivers/net/ethernet/marvell/mvneta.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index 4cc9abd61c43..946925bbcb2d 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -452,11 +452,17 @@ struct mvneta_pcpu_port { u32 cause_rx_tx; }; +enum { + __MVNETA_DOWN, +}; + struct mvneta_port { u8 id; struct mvneta_pcpu_port __percpu *ports; struct mvneta_pcpu_stats __percpu *stats; + unsigned long state; + int pkt_size; void __iomem *base; struct mvneta_rx_queue *rxqs; @@ -2113,6 +2119,9 @@ mvneta_xdp_xmit(struct net_device *dev, int num_frame, struct netdev_queue *nq; u32 ret; + if (unlikely(test_bit(__MVNETA_DOWN, &pp->state))) + return -ENETDOWN; + if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) return -EINVAL; @@ -3568,12 +3577,16 @@ static void mvneta_start_dev(struct mvneta_port *pp) phylink_start(pp->phylink); netif_tx_start_all_queues(pp->dev); + + clear_bit(__MVNETA_DOWN, &pp->state); } static void mvneta_stop_dev(struct mvneta_port *pp) { unsigned int cpu; + set_bit(__MVNETA_DOWN, &pp->state); + phylink_stop(pp->phylink); if (!pp->neta_armada3700) { From 865a6cbb2288f8af7f9dc3b153c61b7014fdcf1e Mon Sep 17 00:00:00 2001 From: tannerlove Date: Tue, 9 Jun 2020 17:21:32 -0400 Subject: [PATCH 29/83] selftests/net: in rxtimestamp getopt_long needs terminating null entry getopt_long requires the last element to be filled with zeros. Otherwise, passing an unrecognized option can cause a segfault. Fixes: 16e781224198 ("selftests/net: Add a test to validate behavior of rx timestamps") Signed-off-by: Tanner Love Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- tools/testing/selftests/net/rxtimestamp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/net/rxtimestamp.c b/tools/testing/selftests/net/rxtimestamp.c index 6dee9e636a95..422e7761254d 100644 --- a/tools/testing/selftests/net/rxtimestamp.c +++ b/tools/testing/selftests/net/rxtimestamp.c @@ -115,6 +115,7 @@ static struct option long_options[] = { { "tcp", no_argument, 0, 't' }, { "udp", no_argument, 0, 'u' }, { "ip", no_argument, 0, 'i' }, + { NULL, 0, NULL, 0 }, }; static int next_port = 19999; From 89dc68533b190117e1a2fb4298d88b96b3580abf Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 9 Jun 2020 23:47:44 +0200 Subject: [PATCH 30/83] net: flow_offload: remove indirect flow_block declarations leftover Remove function declarations that are not available in the tree anymore. Fixes: 709ffbe19b77 ("net: remove indirect block netdev event registration") Reported-by: Jacob Keller Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. 
Miller --- include/net/flow_offload.h | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 69e13c8b6b3a..f2c8311a0433 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -542,28 +542,4 @@ int flow_indr_dev_setup_offload(struct net_device *dev, struct flow_block_offload *bo, void (*cleanup)(struct flow_block_cb *block_cb)); -typedef void flow_indr_block_cmd_t(struct net_device *dev, - flow_indr_block_bind_cb_t *cb, void *cb_priv, - enum flow_block_command command); - -int __flow_indr_block_cb_register(struct net_device *dev, void *cb_priv, - flow_indr_block_bind_cb_t *cb, - void *cb_ident); - -void __flow_indr_block_cb_unregister(struct net_device *dev, - flow_indr_block_bind_cb_t *cb, - void *cb_ident); - -int flow_indr_block_cb_register(struct net_device *dev, void *cb_priv, - flow_indr_block_bind_cb_t *cb, void *cb_ident); - -void flow_indr_block_cb_unregister(struct net_device *dev, - flow_indr_block_bind_cb_t *cb, - void *cb_ident); - -void flow_indr_block_call(struct net_device *dev, - struct flow_block_offload *bo, - enum flow_block_command command, - enum tc_setup_type type); - #endif /* _NET_FLOW_OFFLOAD_H */ From 8ca8d4a841730c02e77bf3c87bf658cc44f364b9 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Jun 2020 18:16:53 -0300 Subject: [PATCH 31/83] libbpf: Define __WORDSIZE if not available Some systems, such as Android, don't have a define for __WORDSIZE, do it in terms of __SIZEOF_LONG__, as done in perf since 2012: http://git.kernel.org/torvalds/c/3f34f6c0233ae055b5 For reference: https://gcc.gnu.org/onlinedocs/cpp/Common-Predefined-Macros.html I build tested it here and Andrii did some Travis CI build tests too. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200608161150.GA3073@kernel.org --- tools/lib/bpf/hashmap.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tools/lib/bpf/hashmap.h b/tools/lib/bpf/hashmap.h index e823b35e7371..df59fd4fc95b 100644 --- a/tools/lib/bpf/hashmap.h +++ b/tools/lib/bpf/hashmap.h @@ -10,10 +10,9 @@ #include #include -#ifdef __GLIBC__ -#include -#else -#include +#include +#ifndef __WORDSIZE +#define __WORDSIZE (__SIZEOF_LONG__ * 8) #endif static inline size_t hash_bits(size_t h, int bits) From 32022fd97ed34f6812802bf1288db27c313576f4 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 9 Jun 2020 22:23:35 -0700 Subject: [PATCH 32/83] libbpf: Handle GCC noreturn-turned-volatile quirk Handle a GCC quirk of emitting extra volatile modifier in DWARF (and subsequently preserved in BTF by pahole) for function pointers marked as __attribute__((noreturn)). This was the way to mark such functions before GCC 2.5 added noreturn attribute. Drop such func_proto modifiers, similarly to how it's done for array (also to handle GCC quirk/bug). Such volatile attribute is emitted by GCC only, so existing selftests can't express such test. 
Simple repro is like this (compiled with GCC + BTF generated by pahole): struct my_struct { void __attribute__((noreturn)) (*fn)(int); }; struct my_struct a; Without this fix, output will be: struct my_struct { voidvolatile (*fn)(int); }; With the fix: struct my_struct { void (*fn)(int); }; Fixes: 351131b51c7a ("libbpf: add btf_dump API for BTF-to-C conversion") Reported-by: Jean-Philippe Brucker Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Tested-by: Jean-Philippe Brucker Link: https://lore.kernel.org/bpf/20200610052335.2862559-1-andriin@fb.com --- tools/lib/bpf/btf_dump.c | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index de07e559a11d..bbb430317260 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -1137,6 +1137,20 @@ static void btf_dump_emit_mods(struct btf_dump *d, struct id_stack *decl_stack) } } +static void btf_dump_drop_mods(struct btf_dump *d, struct id_stack *decl_stack) +{ + const struct btf_type *t; + __u32 id; + + while (decl_stack->cnt) { + id = decl_stack->ids[decl_stack->cnt - 1]; + t = btf__type_by_id(d->btf, id); + if (!btf_is_mod(t)) + return; + decl_stack->cnt--; + } +} + static void btf_dump_emit_name(const struct btf_dump *d, const char *name, bool last_was_ptr) { @@ -1235,14 +1249,7 @@ static void btf_dump_emit_type_chain(struct btf_dump *d, * a const/volatile modifier for array, so we are * going to silently skip them here. */ - while (decls->cnt) { - next_id = decls->ids[decls->cnt - 1]; - next_t = btf__type_by_id(d->btf, next_id); - if (btf_is_mod(next_t)) - decls->cnt--; - else - break; - } + btf_dump_drop_mods(d, decls); if (decls->cnt == 0) { btf_dump_emit_name(d, fname, last_was_ptr); @@ -1270,7 +1277,15 @@ static void btf_dump_emit_type_chain(struct btf_dump *d, __u16 vlen = btf_vlen(t); int i; - btf_dump_emit_mods(d, decls); + /* + * GCC emits extra volatile qualifier for + * __attribute__((noreturn)) function pointers. Clang + * doesn't do it. It's a GCC quirk for backwards + * compatibility with code written for GCC <2.5. So, + * similarly to extra qualifiers for array, just drop + * them, instead of handling them. + */ + btf_dump_drop_mods(d, decls); if (decls->cnt) { btf_dump_printf(d, " ("); btf_dump_emit_type_chain(d, decls, fname, lvl); From 47f6bc4ce1ff70d7ba0924c2f1c218c96cd585fb Mon Sep 17 00:00:00 2001 From: Brett Mastbergen Date: Tue, 9 Jun 2020 17:35:06 -0400 Subject: [PATCH 33/83] tools, bpf: Do not force gcc as CC This allows transparent cross-compilation with CROSS_COMPILE by relying on 7ed1c1901fe5 ("tools: fix cross-compile var clobbering"). Same change was applied to tools/bpf/bpftool/Makefile in 9e88b9312acb ("tools: bpftool: do not force gcc as CC"). Signed-off-by: Brett Mastbergen Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200609213506.3299-1-brett.mastbergen@gmail.com --- tools/bpf/Makefile | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/bpf/Makefile b/tools/bpf/Makefile index 77472e28c8fd..6df1850f8353 100644 --- a/tools/bpf/Makefile +++ b/tools/bpf/Makefile @@ -3,7 +3,6 @@ include ../scripts/Makefile.include prefix ?= /usr/local -CC = gcc LEX = flex YACC = bison MAKE = make From ce9ac056d9cd15630dfca352ff6d3051ba3ba8f6 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 8 Jun 2020 20:54:43 -0600 Subject: [PATCH 34/83] nexthop: Fix fdb labeling for groups fdb nexthops are marked with a flag. For standalone nexthops, a flag was added to the nh_info struct. 
For groups that flag was added to struct nexthop when it should have been added to the group information. Fix by removing the flag from the nexthop struct and adding a flag to nh_group that mirrors nh_info and is really only a caching of the individual types. Add a helper, nexthop_is_fdb, for use by the vxlan code and fixup the internal code to use the flag from either nh_info or nh_group. v2 - propagate fdb_nh in remove_nh_grp_entry Fixes: 38428d68719c ("nexthop: support for fdb ecmp nexthops") Cc: Roopa Prabhu Signed-off-by: David Ahern Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 2 +- include/net/nexthop.h | 17 ++++++++- net/ipv4/nexthop.c | 82 ++++++++++++++++++++++++++----------------- 3 files changed, 66 insertions(+), 35 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 47424b2da643..8a39e8047f14 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -876,7 +876,7 @@ static int vxlan_fdb_nh_update(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb, nh = NULL; goto err_inval; } - if (!nh->is_fdb_nh) { + if (!nexthop_is_fdb(nh)) { NL_SET_ERR_MSG(extack, "Nexthop is not a fdb nexthop"); goto err_inval; } diff --git a/include/net/nexthop.h b/include/net/nexthop.h index e4b55b43e907..3f9e0ca2dc4d 100644 --- a/include/net/nexthop.h +++ b/include/net/nexthop.h @@ -76,6 +76,7 @@ struct nh_group { struct nh_group *spare; /* spare group for removals */ u16 num_nh; bool mpath; + bool fdb_nh; bool has_v4; struct nh_grp_entry nh_entries[]; }; @@ -93,7 +94,6 @@ struct nexthop { u8 protocol; /* app managing this nh */ u8 nh_flags; bool is_group; - bool is_fdb_nh; refcount_t refcnt; struct rcu_head rcu; @@ -136,6 +136,21 @@ static inline bool nexthop_cmp(const struct nexthop *nh1, return nh1 == nh2; } +static inline bool nexthop_is_fdb(const struct nexthop *nh) +{ + if (nh->is_group) { + const struct nh_group *nh_grp; + + nh_grp = rcu_dereference_rtnl(nh->nh_grp); + return nh_grp->fdb_nh; + } else { + const struct nh_info *nhi; + + nhi = rcu_dereference_rtnl(nh->nh_info); + return nhi->fdb_nh; + } +} + static inline bool nexthop_is_multipath(const struct nexthop *nh) { if (nh->is_group) { diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 400a9f89ebdb..cc8049b100b2 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -247,12 +247,11 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh, if (nla_put_u32(skb, NHA_ID, nh->id)) goto nla_put_failure; - if (nh->is_fdb_nh && nla_put_flag(skb, NHA_FDB)) - goto nla_put_failure; - if (nh->is_group) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); + if (nhg->fdb_nh && nla_put_flag(skb, NHA_FDB)) + goto nla_put_failure; if (nla_put_nh_group(skb, nhg)) goto nla_put_failure; goto out; @@ -264,7 +263,10 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh, if (nla_put_flag(skb, NHA_BLACKHOLE)) goto nla_put_failure; goto out; - } else if (!nh->is_fdb_nh) { + } else if (nhi->fdb_nh) { + if (nla_put_flag(skb, NHA_FDB)) + goto nla_put_failure; + } else { const struct net_device *dev; dev = nhi->fib_nhc.nhc_dev; @@ -385,7 +387,7 @@ static void nexthop_notify(int event, struct nexthop *nh, struct nl_info *info) } static bool valid_group_nh(struct nexthop *nh, unsigned int npaths, - struct netlink_ext_ack *extack) + bool *is_fdb, struct netlink_ext_ack *extack) { if (nh->is_group) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); @@ -398,6 +400,7 @@ static bool valid_group_nh(struct nexthop *nh, unsigned int npaths, "Multipath group can not be a nexthop within a group"); return 
false; } + *is_fdb = nhg->fdb_nh; } else { struct nh_info *nhi = rtnl_dereference(nh->nh_info); @@ -406,6 +409,7 @@ static bool valid_group_nh(struct nexthop *nh, unsigned int npaths, "Blackhole nexthop can not be used in a group with more than 1 path"); return false; } + *is_fdb = nhi->fdb_nh; } return true; @@ -416,12 +420,13 @@ static int nh_check_attr_fdb_group(struct nexthop *nh, u8 *nh_family, { struct nh_info *nhi; - if (!nh->is_fdb_nh) { + nhi = rtnl_dereference(nh->nh_info); + + if (!nhi->fdb_nh) { NL_SET_ERR_MSG(extack, "FDB nexthop group can only have fdb nexthops"); return -EINVAL; } - nhi = rtnl_dereference(nh->nh_info); if (*nh_family == AF_UNSPEC) { *nh_family = nhi->family; } else if (*nh_family != nhi->family) { @@ -473,19 +478,20 @@ static int nh_check_attr_group(struct net *net, struct nlattr *tb[], nhg = nla_data(tb[NHA_GROUP]); for (i = 0; i < len; ++i) { struct nexthop *nh; + bool is_fdb_nh; nh = nexthop_find_by_id(net, nhg[i].id); if (!nh) { NL_SET_ERR_MSG(extack, "Invalid nexthop id"); return -EINVAL; } - if (!valid_group_nh(nh, len, extack)) + if (!valid_group_nh(nh, len, &is_fdb_nh, extack)) return -EINVAL; if (nhg_fdb && nh_check_attr_fdb_group(nh, &nh_family, extack)) return -EINVAL; - if (!nhg_fdb && nh->is_fdb_nh) { + if (!nhg_fdb && is_fdb_nh) { NL_SET_ERR_MSG(extack, "Non FDB nexthop group cannot have fdb nexthops"); return -EINVAL; } @@ -553,13 +559,13 @@ struct nexthop *nexthop_select_path(struct nexthop *nh, int hash) if (hash > atomic_read(&nhge->upper_bound)) continue; - if (nhge->nh->is_fdb_nh) + nhi = rcu_dereference(nhge->nh->nh_info); + if (nhi->fdb_nh) return nhge->nh; /* nexthops always check if it is good and does * not rely on a sysctl for this behavior */ - nhi = rcu_dereference(nhge->nh->nh_info); switch (nhi->family) { case AF_INET: if (ipv4_good_nh(&nhi->fib_nh)) @@ -624,11 +630,7 @@ int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg, struct netlink_ext_ack *extack) { struct nh_info *nhi; - - if (nh->is_fdb_nh) { - NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop"); - return -EINVAL; - } + bool is_fdb_nh; /* fib6_src is unique to a fib6_info and limits the ability to cache * routes in fib6_nh within a nexthop that is potentially shared @@ -645,10 +647,17 @@ int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg, nhg = rtnl_dereference(nh->nh_grp); if (nhg->has_v4) goto no_v4_nh; + is_fdb_nh = nhg->fdb_nh; } else { nhi = rtnl_dereference(nh->nh_info); if (nhi->family == AF_INET) goto no_v4_nh; + is_fdb_nh = nhi->fdb_nh; + } + + if (is_fdb_nh) { + NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop"); + return -EINVAL; } return 0; @@ -677,12 +686,9 @@ static int fib6_check_nh_list(struct nexthop *old, struct nexthop *new, return fib6_check_nexthop(new, NULL, extack); } -static int nexthop_check_scope(struct nexthop *nh, u8 scope, +static int nexthop_check_scope(struct nh_info *nhi, u8 scope, struct netlink_ext_ack *extack) { - struct nh_info *nhi; - - nhi = rtnl_dereference(nh->nh_info); if (scope == RT_SCOPE_HOST && nhi->fib_nhc.nhc_gw_family) { NL_SET_ERR_MSG(extack, "Route with host scope can not have a gateway"); @@ -704,29 +710,38 @@ static int nexthop_check_scope(struct nexthop *nh, u8 scope, int fib_check_nexthop(struct nexthop *nh, u8 scope, struct netlink_ext_ack *extack) { + struct nh_info *nhi; int err = 0; - if (nh->is_fdb_nh) { - NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop"); - err = -EINVAL; - goto out; - } - if (nh->is_group) { struct nh_group *nhg; + nhg = 
rtnl_dereference(nh->nh_grp); + if (nhg->fdb_nh) { + NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop"); + err = -EINVAL; + goto out; + } + if (scope == RT_SCOPE_HOST) { NL_SET_ERR_MSG(extack, "Route with host scope can not have multiple nexthops"); err = -EINVAL; goto out; } - nhg = rtnl_dereference(nh->nh_grp); /* all nexthops in a group have the same scope */ - err = nexthop_check_scope(nhg->nh_entries[0].nh, scope, extack); + nhi = rtnl_dereference(nhg->nh_entries[0].nh->nh_info); + err = nexthop_check_scope(nhi, scope, extack); } else { - err = nexthop_check_scope(nh, scope, extack); + nhi = rtnl_dereference(nh->nh_info); + if (nhi->fdb_nh) { + NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop"); + err = -EINVAL; + goto out; + } + err = nexthop_check_scope(nhi, scope, extack); } + out: return err; } @@ -787,6 +802,7 @@ static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge, newg->has_v4 = nhg->has_v4; newg->mpath = nhg->mpath; + newg->fdb_nh = nhg->fdb_nh; newg->num_nh = nhg->num_nh; /* copy old entries to new except the one getting removed */ @@ -1216,7 +1232,7 @@ static struct nexthop *nexthop_create_group(struct net *net, } if (cfg->nh_fdb) - nh->is_fdb_nh = 1; + nhg->fdb_nh = 1; rcu_assign_pointer(nh->nh_grp, nhg); @@ -1255,7 +1271,7 @@ static int nh_create_ipv4(struct net *net, struct nexthop *nh, goto out; } - if (nh->is_fdb_nh) + if (nhi->fdb_nh) goto out; /* sets nh_dev if successful */ @@ -1326,7 +1342,7 @@ static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg, nhi->fib_nhc.nhc_scope = RT_SCOPE_LINK; if (cfg->nh_fdb) - nh->is_fdb_nh = 1; + nhi->fdb_nh = 1; if (cfg->nh_blackhole) { nhi->reject_nh = 1; @@ -1349,7 +1365,7 @@ static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg, } /* add the entry to the device based hash */ - if (!nh->is_fdb_nh) + if (!nhi->fdb_nh) nexthop_devhash_add(net, nhi); rcu_assign_pointer(nh->nh_info, nhi); From 50cb8769f2c1c657a470bda192b79ff679d0ecfc Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 9 Jun 2020 17:27:28 -0600 Subject: [PATCH 35/83] vxlan: Remove access to nexthop group struct vxlan driver should be using helpers to access nexthop struct internals. Remove open check if whether nexthop is multipath in favor of the existing nexthop_is_multipath helper. Add a new helper, nexthop_has_v4, to cover the need to check has_v4 in a group. Fixes: 1274e1cc4226 ("vxlan: ecmp support for mac fdb entries") Cc: Roopa Prabhu Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- drivers/net/vxlan.c | 8 +++----- include/net/nexthop.h | 11 +++++++++++ 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 8a39e8047f14..e8085ab6d484 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -857,7 +857,6 @@ static int vxlan_fdb_nh_update(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb, u32 nhid, struct netlink_ext_ack *extack) { struct nexthop *old_nh = rtnl_dereference(fdb->nh); - struct nh_group *nhg; struct nexthop *nh; int err = -EINVAL; @@ -881,8 +880,7 @@ static int vxlan_fdb_nh_update(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb, goto err_inval; } - nhg = rtnl_dereference(nh->nh_grp); - if (!nh->is_group || !nhg->mpath) { + if (!nexthop_is_multipath(nh)) { NL_SET_ERR_MSG(extack, "Nexthop is not a multipath group"); goto err_inval; } @@ -890,14 +888,14 @@ static int vxlan_fdb_nh_update(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb, /* check nexthop group family */ switch (vxlan->default_dst.remote_ip.sa.sa_family) { case AF_INET: - if (!nhg->has_v4) { + if (!nexthop_has_v4(nh)) { err = -EAFNOSUPPORT; NL_SET_ERR_MSG(extack, "Nexthop group family not supported"); goto err_inval; } break; case AF_INET6: - if (nhg->has_v4) { + if (nexthop_has_v4(nh)) { err = -EAFNOSUPPORT; NL_SET_ERR_MSG(extack, "Nexthop group family not supported"); goto err_inval; diff --git a/include/net/nexthop.h b/include/net/nexthop.h index 3f9e0ca2dc4d..3a4f9e3b91a5 100644 --- a/include/net/nexthop.h +++ b/include/net/nexthop.h @@ -151,6 +151,17 @@ static inline bool nexthop_is_fdb(const struct nexthop *nh) } } +static inline bool nexthop_has_v4(const struct nexthop *nh) +{ + if (nh->is_group) { + struct nh_group *nh_grp; + + nh_grp = rcu_dereference_rtnl(nh->nh_grp); + return nh_grp->has_v4; + } + return false; +} + static inline bool nexthop_is_multipath(const struct nexthop *nh) { if (nh->is_group) { From 5969856ae8ce29c9d523a1a6145cbd9e87f7046c Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 10 Jun 2020 10:47:41 +0200 Subject: [PATCH 36/83] mptcp: fix races between shutdown and recvmsg The msk sk_shutdown flag is set by a workqueue, possibly introducing some delay in user-space notification. If the last subflow carries some data with the fin packet, the user space can wake-up before RCV_SHUTDOWN is set. If it executes unblocking recvmsg(), it may return with an error instead of eof. Address the issue explicitly checking for eof in recvmsg(), when no data is found. Fixes: 59832e246515 ("mptcp: subflow: check parent mptcp socket on subflow state change") Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts Signed-off-by: David S. 
Miller --- net/mptcp/protocol.c | 45 +++++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 14b253d10ccf..3980fbb6f31e 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -374,6 +374,27 @@ void mptcp_subflow_eof(struct sock *sk) sock_hold(sk); } +static void mptcp_check_for_eof(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + int receivers = 0; + + mptcp_for_each_subflow(msk, subflow) + receivers += !subflow->rx_eof; + + if (!receivers && !(sk->sk_shutdown & RCV_SHUTDOWN)) { + /* hopefully temporary hack: propagate shutdown status + * to msk, when all subflows agree on it + */ + sk->sk_shutdown |= RCV_SHUTDOWN; + + smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ + set_bit(MPTCP_DATA_READY, &msk->flags); + sk->sk_data_ready(sk); + } +} + static void mptcp_stop_timer(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -1011,6 +1032,9 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, break; } + if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags)) + mptcp_check_for_eof(msk); + if (sk->sk_shutdown & RCV_SHUTDOWN) break; @@ -1148,27 +1172,6 @@ static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu) return 0; } -static void mptcp_check_for_eof(struct mptcp_sock *msk) -{ - struct mptcp_subflow_context *subflow; - struct sock *sk = (struct sock *)msk; - int receivers = 0; - - mptcp_for_each_subflow(msk, subflow) - receivers += !subflow->rx_eof; - - if (!receivers && !(sk->sk_shutdown & RCV_SHUTDOWN)) { - /* hopefully temporary hack: propagate shutdown status - * to msk, when all subflows agree on it - */ - sk->sk_shutdown |= RCV_SHUTDOWN; - - smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ - set_bit(MPTCP_DATA_READY, &msk->flags); - sk->sk_data_ready(sk); - } -} - static void mptcp_worker(struct work_struct *work) { struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work); From 4b5af44129d0653a4df44e5511c7d480c61c8f3c Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 10 Jun 2020 10:49:00 +0200 Subject: [PATCH 37/83] mptcp: don't leak msk in token container If a listening MPTCP socket has unaccepted sockets at close time, the related msks are freed via mptcp_sock_destruct(), which in turn does not invoke the proto->destroy() method nor the mptcp_token_destroy() function. Due to the above, the child msk socket is not removed from the token container, leading to later UaF. Address the issue explicitly removing the token even in the above error path. Fixes: 79c0949e9a09 ("mptcp: Add key generation and token tree") Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts Signed-off-by: David S. Miller --- net/mptcp/subflow.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 493b98a0825c..bf132575040d 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -393,6 +393,7 @@ static void mptcp_sock_destruct(struct sock *sk) sock_orphan(sk); } + mptcp_token_destroy(mptcp_sk(sk)->token); inet_sock_destruct(sk); } From 014406babc1f5f887a08737566b5b356c7018242 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Wed, 10 Jun 2020 09:53:44 +0000 Subject: [PATCH 38/83] net: cadence: macb: disable NAPI on error When the PHY is not working, the macb driver crash on a second try to setup it. 
[ 78.545994] macb e000b000.ethernet eth0: Could not attach PHY (-19) ifconfig: SIOCSIFFLAGS: No such device [ 78.655457] ------------[ cut here ]------------ [ 78.656014] kernel BUG at /linux-next/include/linux/netdevice.h:521! [ 78.656504] Internal error: Oops - BUG: 0 [#1] SMP ARM [ 78.657079] Modules linked in: [ 78.657795] CPU: 0 PID: 122 Comm: ifconfig Not tainted 5.7.0-next-20200609 #1 [ 78.658202] Hardware name: Xilinx Zynq Platform [ 78.659632] PC is at macb_open+0x220/0x294 [ 78.660160] LR is at 0x0 [ 78.660373] pc : [] lr : [<00000000>] psr: 60000013 [ 78.660716] sp : c89ffd70 ip : c8a28800 fp : c199bac0 [ 78.661040] r10: 00000000 r9 : c8838540 r8 : c8838568 [ 78.661362] r7 : 00000001 r6 : c8838000 r5 : c883c000 r4 : 00000000 [ 78.661724] r3 : 00000010 r2 : 00000000 r1 : 00000000 r0 : 00000000 [ 78.662187] Flags: nZCv IRQs on FIQs on Mode SVC_32 ISA ARM Segment none [ 78.662635] Control: 10c5387d Table: 08b64059 DAC: 00000051 [ 78.663035] Process ifconfig (pid: 122, stack limit = 0x(ptrval)) [ 78.663476] Stack: (0xc89ffd70 to 0xc8a00000) [ 78.664121] fd60: 00000000 c89fe000 c8838000 c89fe000 [ 78.664866] fd80: 00000000 c11ff9ac c8838028 00000000 00000000 c0de6f2c 00000001 c1804eec [ 78.665579] fda0: c19b8178 c8838000 00000000 ca760866 c8838000 00000001 00001043 c89fe000 [ 78.666355] fdc0: 00001002 c0de72f4 c89fe000 c0de8dc0 00008914 c89fe000 c199bac0 ca760866 [ 78.667111] fde0: c89ffddc c8838000 00001002 00000000 c8838138 c881010c 00008914 c0de7364 [ 78.667862] fe00: 00000000 c89ffe70 c89fe000 ffffffff c881010c c0e8bd48 00000003 00000000 [ 78.668601] fe20: c8838000 c8810100 39c1118f 00039c11 c89a0960 00001043 00000000 000a26d0 [ 78.669343] fe40: b6f43000 ca760866 c89a0960 00000051 befe6c50 00008914 c8b2a3c0 befe6c50 [ 78.670086] fe60: 00000003 ee610500 00000000 c0e8ef58 30687465 00000000 00000000 00000000 [ 78.670865] fe80: 00001043 00000000 000a26d0 b6f43000 c89a0600 ee40ae7c c8870d00 c0ddabf4 [ 78.671593] fea0: c89ffeec c0ddabf4 c89ffeec c199bac0 00008913 c0ddac48 c89ffeec c89fe000 [ 78.672324] fec0: befe6c50 ca760866 befe6c50 00008914 c89fe000 befe6c50 c8b2a3c0 c0dc00e4 [ 78.673088] fee0: c89a0480 00000201 00000cc0 30687465 00000000 00000000 00000000 00001002 [ 78.673822] ff00: 00000000 000a26d0 b6f43000 ca760866 00008914 c8b2a3c0 000a0ec4 c8b2a3c0 [ 78.674576] ff20: befe6c50 c04b21bc 000d5004 00000817 c89a0480 c0315f94 00000000 00000003 [ 78.675415] ff40: c19a2bc8 c8a3cc00 c89fe000 00000255 00000000 00000000 00000000 000d5000 [ 78.676182] ff60: 000f6000 c180b2a0 00000817 c0315e64 000d5004 c89fffb0 b6ec0c30 ca760866 [ 78.676928] ff80: 00000000 000b609b befe6c50 000a0ec4 00000036 c03002c4 c89fe000 00000036 [ 78.677673] ffa0: 00000000 c03000c0 000b609b befe6c50 00000003 00008914 befe6c50 000b609b [ 78.678415] ffc0: 000b609b befe6c50 000a0ec4 00000036 befe6e0c befe6f1a 000d5150 00000000 [ 78.679154] ffe0: 000d41e4 befe6bf4 00019648 b6e4509c 20000010 00000003 00000000 00000000 [ 78.681059] [] (macb_open) from [] (__dev_open+0xd0/0x154) [ 78.681571] [] (__dev_open) from [] (__dev_change_flags+0x16c/0x1c4) [ 78.682015] [] (__dev_change_flags) from [] (dev_change_flags+0x18/0x48) [ 78.682493] [] (dev_change_flags) from [] (devinet_ioctl+0x5e4/0x75c) [ 78.682945] [] (devinet_ioctl) from [] (inet_ioctl+0x1f0/0x3b4) [ 78.683381] [] (inet_ioctl) from [] (sock_ioctl+0x39c/0x664) [ 78.683818] [] (sock_ioctl) from [] (ksys_ioctl+0x2d8/0x9c0) [ 78.684343] [] (ksys_ioctl) from [] (ret_fast_syscall+0x0/0x54) [ 78.684789] Exception stack(0xc89fffa8 to 0xc89ffff0) [ 78.685346] ffa0: 000b609b 
befe6c50 00000003 00008914 befe6c50 000b609b [ 78.686106] ffc0: 000b609b befe6c50 000a0ec4 00000036 befe6e0c befe6f1a 000d5150 00000000 [ 78.686710] ffe0: 000d41e4 befe6bf4 00019648 b6e4509c [ 78.687582] Code: 9a000003 e5983078 e3130001 1affffef (e7f001f2) [ 78.688788] ---[ end trace e3f2f6ab69754eae ]--- This is due to NAPI left enabled if macb_phylink_connect() fail. Fixes: 7897b071ac3b ("net: macb: convert to phylink") Signed-off-by: Corentin Labbe Signed-off-by: David S. Miller --- drivers/net/ethernet/cadence/macb_main.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index 36290a8e2a84..5b9d7c60eebc 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -2558,13 +2558,16 @@ static int macb_open(struct net_device *dev) err = macb_phylink_connect(bp); if (err) - goto pm_exit; + goto napi_exit; netif_tx_start_all_queues(dev); if (bp->ptp_info) bp->ptp_info->ptp_init(dev); +napi_exit: + for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) + napi_disable(&queue->napi); pm_exit: if (err) { pm_runtime_put_sync(&bp->pdev->dev); From 58e898a07b9aa2de5eb2fdb9bfe1d0017682d340 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 10 Jun 2020 15:36:48 -0700 Subject: [PATCH 39/83] docs: networking: fix extra spaces in ethtool-netlink Sphinx appears to get upset at extra spaces at the end of a literal: Documentation/networking/ethtool-netlink.rst:1032: WARNING: Inline literal start-string without end-string. Documentation/networking/ethtool-netlink.rst:1034: WARNING: Inline literal start-string without end-string. Documentation/networking/ethtool-netlink.rst:1036: WARNING: Inline literal start-string without end-string. Documentation/networking/ethtool-netlink.rst:1089: WARNING: Inline literal start-string without end-string. Documentation/networking/ethtool-netlink.rst:1091: WARNING: Inline literal start-string without end-string. Documentation/networking/ethtool-netlink.rst:1093: WARNING: Inline literal start-string without end-string. Fixes: f2bc8ad31a7f ("net: ethtool: Allow PHY cable test TDR data to configured") Fixes: a331172b156b ("net: ethtool: Add attributes for cable test TDR data") Signed-off-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- Documentation/networking/ethtool-netlink.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index d42661b91128..82470c36c27a 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -1028,11 +1028,11 @@ Request contents: +--------------------------------------------+--------+-----------------------+ | ``ETHTOOL_A_CABLE_TEST_TDR_CFG`` | nested | test configuration | +-+------------------------------------------+--------+-----------------------+ - | | ``ETHTOOL_A_CABLE_STEP_FIRST_DISTANCE `` | u32 | first data distance | + | | ``ETHTOOL_A_CABLE_STEP_FIRST_DISTANCE`` | u32 | first data distance | +-+-+----------------------------------------+--------+-----------------------+ - | | ``ETHTOOL_A_CABLE_STEP_LAST_DISTANCE `` | u32 | last data distance | + | | ``ETHTOOL_A_CABLE_STEP_LAST_DISTANCE`` | u32 | last data distance | +-+-+----------------------------------------+--------+-----------------------+ - | | ``ETHTOOL_A_CABLE_STEP_STEP_DISTANCE `` | u32 | distance of each step | + | | ``ETHTOOL_A_CABLE_STEP_STEP_DISTANCE`` | u32 | distance of each step | +-+-+----------------------------------------+--------+-----------------------+ | | ``ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR`` | u8 | pair to test | +-+-+----------------------------------------+--------+-----------------------+ @@ -1085,11 +1085,11 @@ used to report the amplitude of the reflection for a given pair. +-+-+-----------------------------------------+--------+----------------------+ | | ``ETHTOOL_A_CABLE_NEST_STEP`` | nested | TDR step info | +-+-+-----------------------------------------+--------+----------------------+ - | | | ``ETHTOOL_A_CABLE_STEP_FIRST_DISTANCE ``| u32 | First data distance | + | | | ``ETHTOOL_A_CABLE_STEP_FIRST_DISTANCE`` | u32 | First data distance | +-+-+-----------------------------------------+--------+----------------------+ - | | | ``ETHTOOL_A_CABLE_STEP_LAST_DISTANCE `` | u32 | Last data distance | + | | | ``ETHTOOL_A_CABLE_STEP_LAST_DISTANCE`` | u32 | Last data distance | +-+-+-----------------------------------------+--------+----------------------+ - | | | ``ETHTOOL_A_CABLE_STEP_STEP_DISTANCE `` | u32 | distance of each step| + | | | ``ETHTOOL_A_CABLE_STEP_STEP_DISTANCE`` | u32 | distance of each step| +-+-+-----------------------------------------+--------+----------------------+ | | ``ETHTOOL_A_CABLE_TDR_NEST_AMPLITUDE`` | nested | Reflection amplitude | +-+-+-----------------------------------------+--------+----------------------+ From 934e36ec5e81b8bc079f3d7acd12beb16cad9531 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 10 Jun 2020 16:09:06 -0700 Subject: [PATCH 40/83] docs: networkng: fix lists and table in sja1105 We need an empty line before list stats, otherwise first point will be smooshed into the paragraph. Inside tables text must start at the same offset in the cell, otherwise sphinx thinks it's a new indented block. Documentation/networking/dsa/sja1105.rst:108: WARNING: Block quote ends without a blank line; unexpected unindent. Documentation/networking/dsa/sja1105.rst:112: WARNING: Definition list ends without a blank line; unexpected unindent. Documentation/networking/dsa/sja1105.rst:245: WARNING: Unexpected indentation. Documentation/networking/dsa/sja1105.rst:246: WARNING: Block quote ends without a blank line; unexpected unindent. 
Documentation/networking/dsa/sja1105.rst:253: WARNING: Unexpected indentation. Documentation/networking/dsa/sja1105.rst:254: WARNING: Block quote ends without a blank line; unexpected unindent. Fixes: a20bc43bfb2e ("docs: net: dsa: sja1105: document the best_effort_vlan_filtering option") Signed-off-by: Jakub Kicinski Acked-by: Vladimir Oltean Signed-off-by: David S. Miller --- Documentation/networking/dsa/sja1105.rst | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Documentation/networking/dsa/sja1105.rst b/Documentation/networking/dsa/sja1105.rst index b6bbc17814fb..7395a33baaf9 100644 --- a/Documentation/networking/dsa/sja1105.rst +++ b/Documentation/networking/dsa/sja1105.rst @@ -103,11 +103,11 @@ the switch net devices: +-------------+-----------+--------------+------------+ | | Mode 1 | Mode 2 | Mode 3 | +=============+===========+==============+============+ -| Regular | Yes | No | Yes | +| Regular | Yes | No | Yes | | traffic | | (use master) | | +-------------+-----------+--------------+------------+ | Management | Yes | Yes | Yes | -| traffic | | | | +| traffic | | | | | (BPDU, PTP) | | | | +-------------+-----------+--------------+------------+ @@ -241,6 +241,7 @@ switch. In this case, SJA1105 switch 1 consumes a total of 11 retagging entries, as follows: + - 8 retagging entries for VLANs 1 and 100 installed on its user ports (``sw1p0`` - ``sw1p3``) - 3 retagging entries for VLAN 100 installed on the user ports of SJA1105 @@ -249,6 +250,7 @@ follows: reverse retagging. SJA1105 switch 2 also consumes 11 retagging entries, but organized as follows: + - 7 retagging entries for the bridge VLANs on its user ports (``sw2p0`` - ``sw2p3``). - 4 retagging entries for VLAN 100 installed on the user ports of SJA1105 From fb90a1c85d8f08c85d9fd5729bfdeb786119f219 Mon Sep 17 00:00:00 2001 From: Ayush Sawal Date: Wed, 10 Jun 2020 02:54:31 +0530 Subject: [PATCH 41/83] Crypto/chcr: Calculate src and dst sg lengths separately for dma map This patch calculates src and dst sg lengths separately for dma mapping in case of aead operation. This fixes a panic which occurs due to the accessing of a zero length sg. Panic: [ 138.173225] kernel BUG at drivers/iommu/intel-iommu.c:1184! Signed-off-by: Ayush Sawal Signed-off-by: David S. Miller --- drivers/crypto/chelsio/chcr_algo.c | 63 +++++++++++++++++++++--------- 1 file changed, 45 insertions(+), 18 deletions(-) diff --git a/drivers/crypto/chelsio/chcr_algo.c b/drivers/crypto/chelsio/chcr_algo.c index f26a7a15551a..f8b55137cf7d 100644 --- a/drivers/crypto/chelsio/chcr_algo.c +++ b/drivers/crypto/chelsio/chcr_algo.c @@ -2590,11 +2590,22 @@ int chcr_aead_dma_map(struct device *dev, struct chcr_aead_reqctx *reqctx = aead_request_ctx(req); struct crypto_aead *tfm = crypto_aead_reqtfm(req); unsigned int authsize = crypto_aead_authsize(tfm); - int dst_size; + int src_len, dst_len; - dst_size = req->assoclen + req->cryptlen + (op_type ? - 0 : authsize); - if (!req->cryptlen || !dst_size) + /* calculate and handle src and dst sg length separately + * for inplace and out-of place operations + */ + if (req->src == req->dst) { + src_len = req->assoclen + req->cryptlen + (op_type ? + 0 : authsize); + dst_len = src_len; + } else { + src_len = req->assoclen + req->cryptlen; + dst_len = req->assoclen + req->cryptlen + (op_type ? 
+ -authsize : authsize); + } + + if (!req->cryptlen || !src_len || !dst_len) return 0; reqctx->iv_dma = dma_map_single(dev, reqctx->iv, (IV + reqctx->b0_len), DMA_BIDIRECTIONAL); @@ -2606,20 +2617,23 @@ int chcr_aead_dma_map(struct device *dev, reqctx->b0_dma = 0; if (req->src == req->dst) { error = dma_map_sg(dev, req->src, - sg_nents_for_len(req->src, dst_size), + sg_nents_for_len(req->src, src_len), DMA_BIDIRECTIONAL); if (!error) goto err; } else { - error = dma_map_sg(dev, req->src, sg_nents(req->src), + error = dma_map_sg(dev, req->src, + sg_nents_for_len(req->src, src_len), DMA_TO_DEVICE); if (!error) goto err; - error = dma_map_sg(dev, req->dst, sg_nents(req->dst), + error = dma_map_sg(dev, req->dst, + sg_nents_for_len(req->dst, dst_len), DMA_FROM_DEVICE); if (!error) { - dma_unmap_sg(dev, req->src, sg_nents(req->src), - DMA_TO_DEVICE); + dma_unmap_sg(dev, req->src, + sg_nents_for_len(req->src, src_len), + DMA_TO_DEVICE); goto err; } } @@ -2637,24 +2651,37 @@ void chcr_aead_dma_unmap(struct device *dev, struct chcr_aead_reqctx *reqctx = aead_request_ctx(req); struct crypto_aead *tfm = crypto_aead_reqtfm(req); unsigned int authsize = crypto_aead_authsize(tfm); - int dst_size; + int src_len, dst_len; - dst_size = req->assoclen + req->cryptlen + (op_type ? - 0 : authsize); - if (!req->cryptlen || !dst_size) + /* calculate and handle src and dst sg length separately + * for inplace and out-of place operations + */ + if (req->src == req->dst) { + src_len = req->assoclen + req->cryptlen + (op_type ? + 0 : authsize); + dst_len = src_len; + } else { + src_len = req->assoclen + req->cryptlen; + dst_len = req->assoclen + req->cryptlen + (op_type ? + -authsize : authsize); + } + + if (!req->cryptlen || !src_len || !dst_len) return; dma_unmap_single(dev, reqctx->iv_dma, (IV + reqctx->b0_len), DMA_BIDIRECTIONAL); if (req->src == req->dst) { dma_unmap_sg(dev, req->src, - sg_nents_for_len(req->src, dst_size), + sg_nents_for_len(req->src, src_len), DMA_BIDIRECTIONAL); } else { - dma_unmap_sg(dev, req->src, sg_nents(req->src), - DMA_TO_DEVICE); - dma_unmap_sg(dev, req->dst, sg_nents(req->dst), - DMA_FROM_DEVICE); + dma_unmap_sg(dev, req->src, + sg_nents_for_len(req->src, src_len), + DMA_TO_DEVICE); + dma_unmap_sg(dev, req->dst, + sg_nents_for_len(req->dst, dst_len), + DMA_FROM_DEVICE); } } From 8b9914cd723bfce8dbff65bd135563f887dcb19d Mon Sep 17 00:00:00 2001 From: Ayush Sawal Date: Wed, 10 Jun 2020 02:54:32 +0530 Subject: [PATCH 42/83] Crypto/chcr: Checking cra_refcnt before unregistering the algorithms This patch puts a check for algorithm unregister, to avoid removal of driver if the algorithm is under use. Signed-off-by: Ayush Sawal Signed-off-by: David S. 
Miller --- drivers/crypto/chelsio/chcr_algo.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/crypto/chelsio/chcr_algo.c b/drivers/crypto/chelsio/chcr_algo.c index f8b55137cf7d..4c2553672b6f 100644 --- a/drivers/crypto/chelsio/chcr_algo.c +++ b/drivers/crypto/chelsio/chcr_algo.c @@ -4391,22 +4391,32 @@ static int chcr_unregister_alg(void) for (i = 0; i < ARRAY_SIZE(driver_algs); i++) { switch (driver_algs[i].type & CRYPTO_ALG_TYPE_MASK) { case CRYPTO_ALG_TYPE_SKCIPHER: - if (driver_algs[i].is_registered) + if (driver_algs[i].is_registered && refcount_read( + &driver_algs[i].alg.skcipher.base.cra_refcnt) + == 1) { crypto_unregister_skcipher( &driver_algs[i].alg.skcipher); + driver_algs[i].is_registered = 0; + } break; case CRYPTO_ALG_TYPE_AEAD: - if (driver_algs[i].is_registered) + if (driver_algs[i].is_registered && refcount_read( + &driver_algs[i].alg.aead.base.cra_refcnt) == 1) { crypto_unregister_aead( &driver_algs[i].alg.aead); + driver_algs[i].is_registered = 0; + } break; case CRYPTO_ALG_TYPE_AHASH: - if (driver_algs[i].is_registered) + if (driver_algs[i].is_registered && refcount_read( + &driver_algs[i].alg.hash.halg.base.cra_refcnt) + == 1) { crypto_unregister_ahash( &driver_algs[i].alg.hash); + driver_algs[i].is_registered = 0; + } break; } - driver_algs[i].is_registered = 0; } return 0; } From ae0b829df7120c78692e49c88453007e531c0e00 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 10 Jun 2020 16:59:11 -0700 Subject: [PATCH 43/83] docs: networkng: convert sja1105's devlink info to RTS A new file snuck into the tree after all existing documentation was converted to RST. Convert sja1105's devlink info and move it where the rest of the drivers are documented. Signed-off-by: Jakub Kicinski Acked-by: Vladimir Oltean Tested-by: Vladimir Oltean Signed-off-by: David S. Miller --- .../networking/devlink-params-sja1105.txt | 27 ---------- Documentation/networking/devlink/index.rst | 1 + Documentation/networking/devlink/sja1105.rst | 49 +++++++++++++++++++ 3 files changed, 50 insertions(+), 27 deletions(-) delete mode 100644 Documentation/networking/devlink-params-sja1105.txt create mode 100644 Documentation/networking/devlink/sja1105.rst diff --git a/Documentation/networking/devlink-params-sja1105.txt b/Documentation/networking/devlink-params-sja1105.txt deleted file mode 100644 index 1d71742e270a..000000000000 --- a/Documentation/networking/devlink-params-sja1105.txt +++ /dev/null @@ -1,27 +0,0 @@ -best_effort_vlan_filtering - [DEVICE, DRIVER-SPECIFIC] - Allow plain ETH_P_8021Q headers to be used as DSA tags. - Benefits: - - Can terminate untagged traffic over switch net - devices even when enslaved to a bridge with - vlan_filtering=1. - - Can terminate VLAN-tagged traffic over switch net - devices even when enslaved to a bridge with - vlan_filtering=1, with some constraints (no more than - 7 non-pvid VLANs per user port). - - Can do QoS based on VLAN PCP and VLAN membership - admission control for autonomously forwarded frames - (regardless of whether they can be terminated on the - CPU or not). - Drawbacks: - - User cannot use VLANs in range 1024-3071. If the - switch receives frames with such VIDs, it will - misinterpret them as DSA tags. - - Switch uses Shared VLAN Learning (FDB lookup uses - only DMAC as key). - - When VLANs span cross-chip topologies, the total - number of permitted VLANs may be less than 7 per - port, due to a maximum number of 32 VLAN retagging - rules per switch. - Configuration mode: runtime - Type: bool. 
diff --git a/Documentation/networking/devlink/index.rst b/Documentation/networking/devlink/index.rst index c536db2cc0f9..7684ae5c4a4a 100644 --- a/Documentation/networking/devlink/index.rst +++ b/Documentation/networking/devlink/index.rst @@ -40,5 +40,6 @@ parameters, info versions, and other features it supports. mv88e6xxx netdevsim nfp + sja1105 qed ti-cpsw-switch diff --git a/Documentation/networking/devlink/sja1105.rst b/Documentation/networking/devlink/sja1105.rst new file mode 100644 index 000000000000..e2679c274085 --- /dev/null +++ b/Documentation/networking/devlink/sja1105.rst @@ -0,0 +1,49 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======================= +sja1105 devlink support +======================= + +This document describes the devlink features implemented +by the ``sja1105`` device driver. + +Parameters +========== + +.. list-table:: Driver-specific parameters implemented + :widths: 5 5 5 85 + + * - Name + - Type + - Mode + - Description + * - ``best_effort_vlan_filtering`` + - Boolean + - runtime + - Allow plain ETH_P_8021Q headers to be used as DSA tags. + + Benefits: + + - Can terminate untagged traffic over switch net + devices even when enslaved to a bridge with + vlan_filtering=1. + - Can terminate VLAN-tagged traffic over switch net + devices even when enslaved to a bridge with + vlan_filtering=1, with some constraints (no more than + 7 non-pvid VLANs per user port). + - Can do QoS based on VLAN PCP and VLAN membership + admission control for autonomously forwarded frames + (regardless of whether they can be terminated on the + CPU or not). + + Drawbacks: + + - User cannot use VLANs in range 1024-3071. If the + switch receives frames with such VIDs, it will + misinterpret them as DSA tags. + - Switch uses Shared VLAN Learning (FDB lookup uses + only DMAC as key). + - When VLANs span cross-chip topologies, the total + number of permitted VLANs may be less than 7 per + port, due to a maximum number of 32 VLAN retagging + rules per switch. From 0f5d82f187e1beda3fe7295dfc500af266a5bd80 Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Wed, 10 Jun 2020 13:41:39 -0500 Subject: [PATCH 44/83] net/filter: Permit reading NET in load_bytes_relative when MAC not set Added a check in the switch case on start_header that checks for the existence of the header, and in the case that MAC is not set and the caller requests for MAC, -EFAULT. If the caller requests for NET then MAC's existence is completely ignored. There is no function to check NET header's existence and as far as cgroup_skb/egress is concerned it should always be set. Removed for ptr >= the start of header, considering offset is bounded unsigned and should always be true. len <= end - mac is redundant to ptr + len <= end. 
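To make the resulting bounds logic easier to follow, here is a reduced user-space model of the check described above. It is an illustration only, not the kernel code: plain pointers and a boolean stand in for the skb header accessors, and the helper and variable names are invented for this sketch.

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Reduced model of the relative-load bounds check: 'mac_set' stands in for
 * skb_mac_header_was_set(), 'mac'/'net'/'end' for the skb header pointers.
 */
static int load_rel(const unsigned char *mac, bool mac_set,
                    const unsigned char *net, const unsigned char *end,
                    size_t offset, void *to, size_t len, bool from_mac)
{
        const unsigned char *start, *ptr;

        if (offset > 0xffff)
                return -EFAULT;

        if (from_mac) {
                if (!mac_set)           /* MAC header may be absent on egress */
                        return -EFAULT;
                start = mac;
        } else {
                start = net;            /* NET header treated as always set */
        }

        ptr = start + offset;
        if (ptr + len <= end) {         /* one check covers the old two */
                memcpy(to, ptr, len);
                return 0;
        }
        return -EFAULT;
}

int main(void)
{
        unsigned char pkt[64] = { 0 };
        unsigned char buf[16];

        /* MAC header not set: MAC-relative read fails, NET-relative succeeds */
        printf("%d\n", load_rel(NULL, false, pkt, pkt + 64, 0, buf, 16, true));
        printf("%d\n", load_rel(NULL, false, pkt, pkt + 64, 0, buf, 16, false));
        return 0;
}

Compiled stand-alone this prints -14 (-EFAULT) for the MAC-relative read and 0 for the NET-relative one, matching the behaviour described above.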
Fixes: 3eee1f75f2b9 ("bpf: fix bpf_skb_load_bytes_relative pkt length check") Signed-off-by: YiFei Zhu Signed-off-by: Daniel Borkmann Reviewed-by: Stanislav Fomichev Link: https://lore.kernel.org/bpf/76bb820ddb6a95f59a772ecbd8c8a336f646b362.1591812755.git.zhuyifei@google.com --- net/core/filter.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 90d2eb77002f..1b7d1180931c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1755,25 +1755,27 @@ BPF_CALL_5(bpf_skb_load_bytes_relative, const struct sk_buff *, skb, u32, offset, void *, to, u32, len, u32, start_header) { u8 *end = skb_tail_pointer(skb); - u8 *net = skb_network_header(skb); - u8 *mac = skb_mac_header(skb); - u8 *ptr; + u8 *start, *ptr; - if (unlikely(offset > 0xffff || len > (end - mac))) + if (unlikely(offset > 0xffff)) goto err_clear; switch (start_header) { case BPF_HDR_START_MAC: - ptr = mac + offset; + if (unlikely(!skb_mac_header_was_set(skb))) + goto err_clear; + start = skb_mac_header(skb); break; case BPF_HDR_START_NET: - ptr = net + offset; + start = skb_network_header(skb); break; default: goto err_clear; } - if (likely(ptr >= mac && ptr + len <= end)) { + ptr = start + offset; + + if (likely(ptr + len <= end)) { memcpy(to, ptr, len); return 0; } From bd6fecb9a99cceb949271c1821cfbad2b2db97c6 Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Wed, 10 Jun 2020 13:41:40 -0500 Subject: [PATCH 45/83] selftests/bpf: Add cgroup_skb/egress test for load_bytes_relative When cgroup_skb/egress triggers the MAC header is not set. Added a test that asserts reading MAC header is a -EFAULT but NET header succeeds. The test result from within the eBPF program is stored in an 1-element array map that the userspace then reads and asserts on. Another assertion is added that reading from a large offset, past the end of packet, returns -EFAULT. Signed-off-by: YiFei Zhu Signed-off-by: Daniel Borkmann Reviewed-by: Stanislav Fomichev Link: https://lore.kernel.org/bpf/9028ccbea4385a620e69c0a104f469ffd655c01e.1591812755.git.zhuyifei@google.com --- .../bpf/prog_tests/load_bytes_relative.c | 71 +++++++++++++++++++ .../selftests/bpf/progs/load_bytes_relative.c | 48 +++++++++++++ 2 files changed, 119 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/load_bytes_relative.c create mode 100644 tools/testing/selftests/bpf/progs/load_bytes_relative.c diff --git a/tools/testing/selftests/bpf/prog_tests/load_bytes_relative.c b/tools/testing/selftests/bpf/prog_tests/load_bytes_relative.c new file mode 100644 index 000000000000..c1168e4a9036 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/load_bytes_relative.c @@ -0,0 +1,71 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* + * Copyright 2020 Google LLC. 
+ */ + +#include +#include + +void test_load_bytes_relative(void) +{ + int server_fd, cgroup_fd, prog_fd, map_fd, client_fd; + int err; + struct bpf_object *obj; + struct bpf_program *prog; + struct bpf_map *test_result; + __u32 duration = 0; + + __u32 map_key = 0; + __u32 map_value = 0; + + cgroup_fd = test__join_cgroup("/load_bytes_relative"); + if (CHECK_FAIL(cgroup_fd < 0)) + return; + + server_fd = start_server(AF_INET, SOCK_STREAM); + if (CHECK_FAIL(server_fd < 0)) + goto close_cgroup_fd; + + err = bpf_prog_load("./load_bytes_relative.o", BPF_PROG_TYPE_CGROUP_SKB, + &obj, &prog_fd); + if (CHECK_FAIL(err)) + goto close_server_fd; + + test_result = bpf_object__find_map_by_name(obj, "test_result"); + if (CHECK_FAIL(!test_result)) + goto close_bpf_object; + + map_fd = bpf_map__fd(test_result); + if (map_fd < 0) + goto close_bpf_object; + + prog = bpf_object__find_program_by_name(obj, "load_bytes_relative"); + if (CHECK_FAIL(!prog)) + goto close_bpf_object; + + err = bpf_prog_attach(prog_fd, cgroup_fd, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_MULTI); + if (CHECK_FAIL(err)) + goto close_bpf_object; + + client_fd = connect_to_fd(AF_INET, SOCK_STREAM, server_fd); + if (CHECK_FAIL(client_fd < 0)) + goto close_bpf_object; + close(client_fd); + + err = bpf_map_lookup_elem(map_fd, &map_key, &map_value); + if (CHECK_FAIL(err)) + goto close_bpf_object; + + CHECK(map_value != 1, "bpf", "bpf program returned failure"); + +close_bpf_object: + bpf_object__close(obj); + +close_server_fd: + close(server_fd); + +close_cgroup_fd: + close(cgroup_fd); +} diff --git a/tools/testing/selftests/bpf/progs/load_bytes_relative.c b/tools/testing/selftests/bpf/progs/load_bytes_relative.c new file mode 100644 index 000000000000..dc1d04a7a3d6 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/load_bytes_relative.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* + * Copyright 2020 Google LLC. + */ + +#include +#include +#include +#include +#include + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, __u32); +} test_result SEC(".maps"); + +SEC("cgroup_skb/egress") +int load_bytes_relative(struct __sk_buff *skb) +{ + struct ethhdr eth; + struct iphdr iph; + + __u32 map_key = 0; + __u32 test_passed = 0; + + /* MAC header is not set by the time cgroup_skb/egress triggers */ + if (bpf_skb_load_bytes_relative(skb, 0, ð, sizeof(eth), + BPF_HDR_START_MAC) != -EFAULT) + goto fail; + + if (bpf_skb_load_bytes_relative(skb, 0, &iph, sizeof(iph), + BPF_HDR_START_NET)) + goto fail; + + if (bpf_skb_load_bytes_relative(skb, 0xffff, &iph, sizeof(iph), + BPF_HDR_START_NET) != -EFAULT) + goto fail; + + test_passed = 1; + +fail: + bpf_map_update_elem(&test_result, &map_key, &test_passed, BPF_ANY); + + return 1; +} From d4060ac969563113101c79433f2ae005feca1c29 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Wed, 10 Jun 2020 15:08:04 +0200 Subject: [PATCH 46/83] tools, bpftool: Fix memory leak in codegen error cases Free the memory allocated for the template on error paths in function codegen. Signed-off-by: Tobias Klauser Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200610130804.21423-1-tklauser@distanz.ch --- tools/bpf/bpftool/gen.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index a3c4bb86c05a..ecbae47e66b8 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -224,6 +224,7 @@ static int codegen(const char *template, ...) 
} else { p_err("unrecognized character at pos %td in template '%s'", src - template - 1, template); + free(s); return -EINVAL; } } @@ -234,6 +235,7 @@ static int codegen(const char *template, ...) if (*src != '\t') { p_err("not enough tabs at pos %td in template '%s'", src - template - 1, template); + free(s); return -EINVAL; } } From 9334d5ba32c052fc925e0c817dc398c98b093221 Mon Sep 17 00:00:00 2001 From: Xu Wang Date: Thu, 11 Jun 2020 02:45:20 +0000 Subject: [PATCH 47/83] drivers: dpaa2: Use devm_kcalloc() in setup_dpni() A multiplication for the size determination of a memory allocation indicated that an array data structure should be processed. Thus use the corresponding function "devm_kcalloc". Signed-off-by: Xu Wang Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index 8fb48de5d18c..f150cd454fa4 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -2907,8 +2907,9 @@ static int setup_dpni(struct fsl_mc_device *ls_dev) if (err && err != -EOPNOTSUPP) goto close; - priv->cls_rules = devm_kzalloc(dev, sizeof(struct dpaa2_eth_cls_rule) * - dpaa2_eth_fs_count(priv), GFP_KERNEL); + priv->cls_rules = devm_kcalloc(dev, dpaa2_eth_fs_count(priv), + sizeof(struct dpaa2_eth_cls_rule), + GFP_KERNEL); if (!priv->cls_rules) { err = -ENOMEM; goto close; From 77f972a7077d06d565243ecc192f45e1e5813cf1 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Wed, 10 Jun 2020 21:07:39 -0700 Subject: [PATCH 48/83] ionic: remove support for mgmt device We no longer support the mgmt device in the ionic driver, so remove the device id and related code. Fixes: b3f064e9746d ("ionic: add support for device id 0x1004") Signed-off-by: Shannon Nelson Signed-off-by: David S. 
Miller --- drivers/net/ethernet/pensando/ionic/ionic.h | 2 -- drivers/net/ethernet/pensando/ionic/ionic_bus_pci.c | 6 ------ drivers/net/ethernet/pensando/ionic/ionic_devlink.c | 4 ---- drivers/net/ethernet/pensando/ionic/ionic_lif.c | 13 ------------- 4 files changed, 25 deletions(-) diff --git a/drivers/net/ethernet/pensando/ionic/ionic.h b/drivers/net/ethernet/pensando/ionic/ionic.h index 23ccc0da2341..f5a910c458ba 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic.h +++ b/drivers/net/ethernet/pensando/ionic/ionic.h @@ -17,7 +17,6 @@ struct ionic_lif; #define PCI_DEVICE_ID_PENSANDO_IONIC_ETH_PF 0x1002 #define PCI_DEVICE_ID_PENSANDO_IONIC_ETH_VF 0x1003 -#define PCI_DEVICE_ID_PENSANDO_IONIC_ETH_MGMT 0x1004 #define DEVCMD_TIMEOUT 10 @@ -42,7 +41,6 @@ struct ionic { struct dentry *dentry; struct ionic_dev_bar bars[IONIC_BARS_MAX]; unsigned int num_bars; - bool is_mgmt_nic; struct ionic_identity ident; struct list_head lifs; struct ionic_lif *master_lif; diff --git a/drivers/net/ethernet/pensando/ionic/ionic_bus_pci.c b/drivers/net/ethernet/pensando/ionic/ionic_bus_pci.c index 60fc191a35e5..0ac6acbc5f31 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_bus_pci.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_bus_pci.c @@ -15,7 +15,6 @@ static const struct pci_device_id ionic_id_table[] = { { PCI_VDEVICE(PENSANDO, PCI_DEVICE_ID_PENSANDO_IONIC_ETH_PF) }, { PCI_VDEVICE(PENSANDO, PCI_DEVICE_ID_PENSANDO_IONIC_ETH_VF) }, - { PCI_VDEVICE(PENSANDO, PCI_DEVICE_ID_PENSANDO_IONIC_ETH_MGMT) }, { 0, } /* end of table */ }; MODULE_DEVICE_TABLE(pci, ionic_id_table); @@ -225,9 +224,6 @@ static int ionic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) pci_set_drvdata(pdev, ionic); mutex_init(&ionic->dev_cmd_lock); - ionic->is_mgmt_nic = - ent->device == PCI_DEVICE_ID_PENSANDO_IONIC_ETH_MGMT; - /* Query system for DMA addressing limitation for the device. 
*/ err = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(IONIC_ADDR_LEN)); if (err) { @@ -252,8 +248,6 @@ static int ionic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) } pci_set_master(pdev); - if (!ionic->is_mgmt_nic) - pcie_print_link_status(pdev); err = ionic_map_bars(ionic); if (err) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_devlink.c b/drivers/net/ethernet/pensando/ionic/ionic_devlink.c index 273c889faaad..2d590e571133 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_devlink.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_devlink.c @@ -77,10 +77,6 @@ int ionic_devlink_register(struct ionic *ionic) return err; } - /* don't register the mgmt_nic as a port */ - if (ionic->is_mgmt_nic) - return 0; - devlink_port_attrs_set(&ionic->dl_port, DEVLINK_PORT_FLAVOUR_PHYSICAL, 0, false, 0, NULL, 0); err = devlink_port_register(dl, &ionic->dl_port, 0); diff --git a/drivers/net/ethernet/pensando/ionic/ionic_lif.c b/drivers/net/ethernet/pensando/ionic/ionic_lif.c index fbc36e9e4729..9d8c969f21cb 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_lif.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_lif.c @@ -99,9 +99,6 @@ static void ionic_link_status_check(struct ionic_lif *lif) if (!test_bit(IONIC_LIF_F_LINK_CHECK_REQUESTED, lif->state)) return; - if (lif->ionic->is_mgmt_nic) - return; - link_status = le16_to_cpu(lif->info->status.link_status); link_up = link_status == IONIC_PORT_OPER_STATUS_UP; @@ -1193,10 +1190,6 @@ static int ionic_init_nic_features(struct ionic_lif *lif) netdev_features_t features; int err; - /* no netdev features on the management device */ - if (lif->ionic->is_mgmt_nic) - return 0; - /* set up what we expect to support by default */ features = NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | @@ -2594,12 +2587,6 @@ int ionic_lifs_register(struct ionic *ionic) { int err; - /* the netdev is not registered on the management device, it is - * only used as a vehicle for napi operations on the adminq - */ - if (ionic->is_mgmt_nic) - return 0; - INIT_WORK(&ionic->nb_work, ionic_lif_notify_work); ionic->nb.notifier_call = ionic_lif_notify; From c9aa81faf19115fc2e732e7f210b37bb316987ff Mon Sep 17 00:00:00 2001 From: Tuong Lien Date: Thu, 11 Jun 2020 17:07:35 +0700 Subject: [PATCH 49/83] tipc: fix kernel WARNING in tipc_msg_append() syzbot found the following issue: WARNING: CPU: 0 PID: 6808 at include/linux/thread_info.h:150 check_copy_size include/linux/thread_info.h:150 [inline] WARNING: CPU: 0 PID: 6808 at include/linux/thread_info.h:150 copy_from_iter include/linux/uio.h:144 [inline] WARNING: CPU: 0 PID: 6808 at include/linux/thread_info.h:150 tipc_msg_append+0x49a/0x5e0 net/tipc/msg.c:242 Kernel panic - not syncing: panic_on_warn set ... This happens after commit 5e9eeccc58f3 ("tipc: fix NULL pointer dereference in streaming") that tried to build at least one buffer even when the message data length is zero... However, it now exposes another bug that the 'mss' can be zero and the 'cpy' will be negative, thus the above kernel WARNING will appear! The zero value of 'mss' is never expected because it means Nagle is not enabled for the socket (actually the socket type was 'SOCK_SEQPACKET'), so the function 'tipc_msg_append()' must not be called at all. But that was in this particular case since the message data length was zero, and the 'send <= maxnagle' check became true. We resolve the issue by explicitly checking if Nagle is enabled for the socket, i.e. 'maxnagle != 0' before calling the 'tipc_msg_append()'. 
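To illustrate the negative 'cpy' scenario described above, a small stand-alone example; all values are hypothetical and only model the old min_t(int, ...) computation, this is not the TIPC code:

#include <stdio.h>

int main(void)
{
        int mss = 0;      /* Nagle not enabled: no usable message size */
        int mlen = 24;    /* hypothetical size of the current message */
        int rem = 0;      /* zero-length send */
        int cpy;

        /* the old computation: min_t(int, rem, mss - mlen) */
        cpy = (rem < mss - mlen) ? rem : mss - mlen;

        printf("cpy = %d\n", cpy);                  /* prints -24 */
        printf("as size_t: %zu\n", (size_t)cpy);    /* a huge copy length */
        return 0;
}

With 'mss' equal to zero the signed minimum goes negative, and once that value is used as an unsigned copy length it becomes enormous, which is what trips the check_copy_size() warning quoted above.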
We also reinforce the function to against such a negative values if any. Reported-by: syzbot+75139a7d2605236b0b7f@syzkaller.appspotmail.com Fixes: c0bceb97db9e ("tipc: add smart nagle feature") Acked-by: Jon Maloy Signed-off-by: Tuong Lien Signed-off-by: David S. Miller --- net/tipc/msg.c | 4 ++-- net/tipc/socket.c | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 046e4cb3acea..01b64869a173 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -238,14 +238,14 @@ int tipc_msg_append(struct tipc_msg *_hdr, struct msghdr *m, int dlen, hdr = buf_msg(skb); curr = msg_blocks(hdr); mlen = msg_size(hdr); - cpy = min_t(int, rem, mss - mlen); + cpy = min_t(size_t, rem, mss - mlen); if (cpy != copy_from_iter(skb->data + mlen, cpy, &m->msg_iter)) return -EFAULT; msg_set_size(hdr, mlen + cpy); skb_put(skb, cpy); rem -= cpy; total += msg_blocks(hdr) - curr; - } while (rem); + } while (rem > 0); return total - accounted; } diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 26123f4177fd..a94f38333698 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1574,7 +1574,8 @@ static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen) break; send = min_t(size_t, dlen - sent, TIPC_MAX_USER_MSG_SIZE); blocks = tsk->snd_backlog; - if (tsk->oneway++ >= tsk->nagle_start && send <= maxnagle) { + if (tsk->oneway++ >= tsk->nagle_start && maxnagle && + send <= maxnagle) { rc = tipc_msg_append(hdr, m, send, maxnagle, txq); if (unlikely(rc < 0)) break; From 9798278260e8f61d04415342544a8f701bc5ace7 Mon Sep 17 00:00:00 2001 From: Tuong Lien Date: Thu, 11 Jun 2020 17:08:08 +0700 Subject: [PATCH 50/83] tipc: fix NULL pointer dereference in tipc_disc_rcv() When a bearer is enabled, we create a 'tipc_discoverer' object to store the bearer related data along with a timer and a preformatted discovery message buffer for later probing... However, this is only carried after the bearer was set 'up', that left a race condition resulting in kernel panic. It occurs when a discovery message from a peer node is received and processed in bottom half (since the bearer is 'up' already) just before the discoverer object is created but is now accessed in order to update the preformatted buffer (with a new trial address, ...) so leads to the NULL pointer dereference. We solve the problem by simply moving the bearer 'up' setting to later, so make sure everything is ready prior to any message receiving. Acked-by: Jon Maloy Signed-off-by: Tuong Lien Signed-off-by: David S. 
Miller --- net/tipc/bearer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 34ca7b789eba..e366ec9a7e4d 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -316,7 +316,6 @@ static int tipc_enable_bearer(struct net *net, const char *name, b->domain = disc_domain; b->net_plane = bearer_id + 'A'; b->priority = prio; - test_and_set_bit_lock(0, &b->up); refcount_set(&b->refcnt, 1); res = tipc_disc_create(net, b, &b->bcast_addr, &skb); @@ -326,6 +325,7 @@ static int tipc_enable_bearer(struct net *net, const char *name, goto rejected; } + test_and_set_bit_lock(0, &b->up); rcu_assign_pointer(tn->bearer_list[bearer_id], b); if (skb) tipc_bearer_xmit_skb(net, bearer_id, skb, &b->bcast_addr); From aa2cad0600ed2ca6a0ab39948d4db1666b6c962b Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Thu, 11 Jun 2020 13:11:06 +0800 Subject: [PATCH 51/83] xdp: Fix xsk_generic_xmit errno MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Propagate sock_alloc_send_skb error code, not set it to EAGAIN unconditionally, when fail to allocate skb, which might cause that user space unnecessary loops. Fixes: 35fcde7f8deb ("xsk: support for Tx") Signed-off-by: Li RongQing Signed-off-by: Daniel Borkmann Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/1591852266-24017-1-git-send-email-lirongqing@baidu.com --- net/xdp/xsk.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index b6c0f08bd80d..3700266229f6 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -352,10 +352,8 @@ static int xsk_generic_xmit(struct sock *sk) len = desc.len; skb = sock_alloc_send_skb(sk, len, 1, &err); - if (unlikely(!skb)) { - err = -EAGAIN; + if (unlikely(!skb)) goto out; - } skb_put(skb, len); addr = desc.addr; From 2c4779eff837f1035f6f9650d246905daadd9528 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Thu, 11 Jun 2020 12:33:41 +0200 Subject: [PATCH 52/83] tools, bpftool: Exit on error in function codegen Currently, the codegen function might fail and return an error. But its callers continue without checking its return value. Since codegen can fail only in the unlikely case of the system running out of memory or the static template being malformed, just exit(-1) directly from codegen and make it void-returning. Suggested-by: Andrii Nakryiko Signed-off-by: Tobias Klauser Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200611103341.21532-1-tklauser@distanz.ch --- tools/bpf/bpftool/gen.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index ecbae47e66b8..7443879e87af 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -200,7 +200,7 @@ static int codegen_datasecs(struct bpf_object *obj, const char *obj_name) return err; } -static int codegen(const char *template, ...) +static void codegen(const char *template, ...) { const char *src, *end; int skip_tabs = 0, n; @@ -211,7 +211,7 @@ static int codegen(const char *template, ...) n = strlen(template); s = malloc(n + 1); if (!s) - return -ENOMEM; + exit(-1); src = template; dst = s; @@ -225,7 +225,7 @@ static int codegen(const char *template, ...) p_err("unrecognized character at pos %td in template '%s'", src - template - 1, template); free(s); - return -EINVAL; + exit(-1); } } @@ -236,7 +236,7 @@ static int codegen(const char *template, ...) 
p_err("not enough tabs at pos %td in template '%s'", src - template - 1, template); free(s); - return -EINVAL; + exit(-1); } } /* trim trailing whitespace */ @@ -257,7 +257,8 @@ static int codegen(const char *template, ...) va_end(args); free(s); - return n; + if (n) + exit(-1); } static int do_skeleton(int argc, char **argv) From e0ef26fbe2b0c62f42ba7667076dc38b693b6fb8 Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Fri, 5 Jun 2020 10:09:43 -0700 Subject: [PATCH 53/83] iavf: fix speed reporting over virtchnl Link speeds are communicated over virtchnl using an enum virtchnl_link_speed. Currently, the highest link speed is 40Gbps which leaves us unable to reflect some speeds that an ice VF is capable of. This causes link speed to be misreported on the iavf driver. Allow for communicating link speeds using Mbps so that the proper speed can be reported for an ice VF. Moving away from the enum allows us to communicate future speed changes without requiring a new enum to be added. In order to support communicating link speeds over virtchnl in Mbps the following functionality was added: - Added u32 link_speed_mbps in the iavf_adapter structure. - Added the macro ADV_LINK_SUPPORT(_a) to determine if the VF driver supports communicating link speeds in Mbps. - Added the function iavf_get_vpe_link_status() to fill the correct link_status in the event_data union based on the ADV_LINK_SUPPORT(_a) macro. - Added the function iavf_set_adapter_link_speed_from_vpe() to determine whether or not to fill the u32 link_speed_mbps or enum virtchnl_link_speed link_speed field in the iavf_adapter structure based on the ADV_LINK_SUPPORT(_a) macro. - Do not free vf_res in iavf_init_get_resources() as vf_res will be accessed in iavf_get_link_ksettings(); memset to 0 instead. This memory is subsequently freed in iavf_remove(). Fixes: 7c710869d64e ("ice: Add handlers for VF netdevice operations") Signed-off-by: Brett Creeley Signed-off-by: Sergey Nemov Signed-off-by: Paul Greenwalt Signed-off-by: Tony Nguyen Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/iavf/iavf.h | 14 +++ .../net/ethernet/intel/iavf/iavf_ethtool.c | 14 ++- drivers/net/ethernet/intel/iavf/iavf_main.c | 25 ++++-- .../net/ethernet/intel/iavf/iavf_virtchnl.c | 88 ++++++++++++++++--- 4 files changed, 120 insertions(+), 21 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf.h b/drivers/net/ethernet/intel/iavf/iavf.h index bcd11b4b29df..2d4ce6fdba1a 100644 --- a/drivers/net/ethernet/intel/iavf/iavf.h +++ b/drivers/net/ethernet/intel/iavf/iavf.h @@ -87,6 +87,10 @@ struct iavf_vsi { #define IAVF_HLUT_ARRAY_SIZE ((IAVF_VFQF_HLUT_MAX_INDEX + 1) * 4) #define IAVF_MBPS_DIVISOR 125000 /* divisor to convert to Mbps */ +#define IAVF_VIRTCHNL_VF_RESOURCE_SIZE (sizeof(struct virtchnl_vf_resource) + \ + (IAVF_MAX_VF_VSI * \ + sizeof(struct virtchnl_vsi_resource))) + /* MAX_MSIX_Q_VECTORS of these are allocated, * but we only use one per queue-specific vector. */ @@ -306,6 +310,14 @@ struct iavf_adapter { bool netdev_registered; bool link_up; enum virtchnl_link_speed link_speed; + /* This is only populated if the VIRTCHNL_VF_CAP_ADV_LINK_SPEED is set + * in vf_res->vf_cap_flags. Use ADV_LINK_SUPPORT macro to determine if + * this field is valid. This field should be used going forward and the + * enum virtchnl_link_speed above should be considered the legacy way of + * storing/communicating link speeds. + */ + u32 link_speed_mbps; + enum virtchnl_ops current_op; #define CLIENT_ALLOWED(_a) ((_a)->vf_res ? 
\ (_a)->vf_res->vf_cap_flags & \ @@ -322,6 +334,8 @@ struct iavf_adapter { VIRTCHNL_VF_OFFLOAD_RSS_PF))) #define VLAN_ALLOWED(_a) ((_a)->vf_res->vf_cap_flags & \ VIRTCHNL_VF_OFFLOAD_VLAN) +#define ADV_LINK_SUPPORT(_a) ((_a)->vf_res->vf_cap_flags & \ + VIRTCHNL_VF_CAP_ADV_LINK_SPEED) struct virtchnl_vf_resource *vf_res; /* incl. all VSIs */ struct virtchnl_vsi_resource *vsi_res; /* our LAN VSI */ struct virtchnl_version_info pf_version; diff --git a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c index 2c39d46b6138..40a3fc7c5ea5 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c +++ b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c @@ -278,7 +278,18 @@ static int iavf_get_link_ksettings(struct net_device *netdev, ethtool_link_ksettings_zero_link_mode(cmd, supported); cmd->base.autoneg = AUTONEG_DISABLE; cmd->base.port = PORT_NONE; - /* Set speed and duplex */ + cmd->base.duplex = DUPLEX_FULL; + + if (ADV_LINK_SUPPORT(adapter)) { + if (adapter->link_speed_mbps && + adapter->link_speed_mbps < U32_MAX) + cmd->base.speed = adapter->link_speed_mbps; + else + cmd->base.speed = SPEED_UNKNOWN; + + return 0; + } + switch (adapter->link_speed) { case IAVF_LINK_SPEED_40GB: cmd->base.speed = SPEED_40000; @@ -306,7 +317,6 @@ static int iavf_get_link_ksettings(struct net_device *netdev, default: break; } - cmd->base.duplex = DUPLEX_FULL; return 0; } diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 2050649848ba..a21ae74bcd1b 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -1756,17 +1756,17 @@ static int iavf_init_get_resources(struct iavf_adapter *adapter) struct net_device *netdev = adapter->netdev; struct pci_dev *pdev = adapter->pdev; struct iavf_hw *hw = &adapter->hw; - int err = 0, bufsz; + int err; WARN_ON(adapter->state != __IAVF_INIT_GET_RESOURCES); /* aq msg sent, awaiting reply */ if (!adapter->vf_res) { - bufsz = sizeof(struct virtchnl_vf_resource) + - (IAVF_MAX_VF_VSI * - sizeof(struct virtchnl_vsi_resource)); - adapter->vf_res = kzalloc(bufsz, GFP_KERNEL); - if (!adapter->vf_res) + adapter->vf_res = kzalloc(IAVF_VIRTCHNL_VF_RESOURCE_SIZE, + GFP_KERNEL); + if (!adapter->vf_res) { + err = -ENOMEM; goto err; + } } err = iavf_get_vf_config(adapter); if (err == IAVF_ERR_ADMIN_QUEUE_NO_WORK) { @@ -2036,7 +2036,7 @@ static void iavf_disable_vf(struct iavf_adapter *adapter) iavf_reset_interrupt_capability(adapter); iavf_free_queues(adapter); iavf_free_q_vectors(adapter); - kfree(adapter->vf_res); + memset(adapter->vf_res, 0, IAVF_VIRTCHNL_VF_RESOURCE_SIZE); iavf_shutdown_adminq(&adapter->hw); adapter->netdev->flags &= ~IFF_UP; clear_bit(__IAVF_IN_CRITICAL_TASK, &adapter->crit_section); @@ -2487,6 +2487,16 @@ static int iavf_validate_tx_bandwidth(struct iavf_adapter *adapter, { int speed = 0, ret = 0; + if (ADV_LINK_SUPPORT(adapter)) { + if (adapter->link_speed_mbps < U32_MAX) { + speed = adapter->link_speed_mbps; + goto validate_bw; + } else { + dev_err(&adapter->pdev->dev, "Unknown link speed\n"); + return -EINVAL; + } + } + switch (adapter->link_speed) { case IAVF_LINK_SPEED_40GB: speed = 40000; @@ -2510,6 +2520,7 @@ static int iavf_validate_tx_bandwidth(struct iavf_adapter *adapter, break; } +validate_bw: if (max_tx_rate > speed) { dev_err(&adapter->pdev->dev, "Invalid tx rate specified\n"); diff --git a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c index 
d58374c2c33d..ca79bec4ebd9 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c +++ b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c @@ -139,7 +139,8 @@ int iavf_send_vf_config_msg(struct iavf_adapter *adapter) VIRTCHNL_VF_OFFLOAD_ENCAP | VIRTCHNL_VF_OFFLOAD_ENCAP_CSUM | VIRTCHNL_VF_OFFLOAD_REQ_QUEUES | - VIRTCHNL_VF_OFFLOAD_ADQ; + VIRTCHNL_VF_OFFLOAD_ADQ | + VIRTCHNL_VF_CAP_ADV_LINK_SPEED; adapter->current_op = VIRTCHNL_OP_GET_VF_RESOURCES; adapter->aq_required &= ~IAVF_FLAG_AQ_GET_CONFIG; @@ -891,6 +892,8 @@ void iavf_disable_vlan_stripping(struct iavf_adapter *adapter) iavf_send_pf_msg(adapter, VIRTCHNL_OP_DISABLE_VLAN_STRIPPING, NULL, 0); } +#define IAVF_MAX_SPEED_STRLEN 13 + /** * iavf_print_link_message - print link up or down * @adapter: adapter structure @@ -900,37 +903,99 @@ void iavf_disable_vlan_stripping(struct iavf_adapter *adapter) static void iavf_print_link_message(struct iavf_adapter *adapter) { struct net_device *netdev = adapter->netdev; - char *speed = "Unknown "; + int link_speed_mbps; + char *speed; if (!adapter->link_up) { netdev_info(netdev, "NIC Link is Down\n"); return; } + speed = kcalloc(1, IAVF_MAX_SPEED_STRLEN, GFP_KERNEL); + if (!speed) + return; + + if (ADV_LINK_SUPPORT(adapter)) { + link_speed_mbps = adapter->link_speed_mbps; + goto print_link_msg; + } + switch (adapter->link_speed) { case IAVF_LINK_SPEED_40GB: - speed = "40 G"; + link_speed_mbps = SPEED_40000; break; case IAVF_LINK_SPEED_25GB: - speed = "25 G"; + link_speed_mbps = SPEED_25000; break; case IAVF_LINK_SPEED_20GB: - speed = "20 G"; + link_speed_mbps = SPEED_20000; break; case IAVF_LINK_SPEED_10GB: - speed = "10 G"; + link_speed_mbps = SPEED_10000; break; case IAVF_LINK_SPEED_1GB: - speed = "1000 M"; + link_speed_mbps = SPEED_1000; break; case IAVF_LINK_SPEED_100MB: - speed = "100 M"; + link_speed_mbps = SPEED_100; break; default: + link_speed_mbps = SPEED_UNKNOWN; break; } - netdev_info(netdev, "NIC Link is Up %sbps Full Duplex\n", speed); +print_link_msg: + if (link_speed_mbps > SPEED_1000) { + if (link_speed_mbps == SPEED_2500) + snprintf(speed, IAVF_MAX_SPEED_STRLEN, "2.5 Gbps"); + else + /* convert to Gbps inline */ + snprintf(speed, IAVF_MAX_SPEED_STRLEN, "%d %s", + link_speed_mbps / 1000, "Gbps"); + } else if (link_speed_mbps == SPEED_UNKNOWN) { + snprintf(speed, IAVF_MAX_SPEED_STRLEN, "%s", "Unknown Mbps"); + } else { + snprintf(speed, IAVF_MAX_SPEED_STRLEN, "%u %s", + link_speed_mbps, "Mbps"); + } + + netdev_info(netdev, "NIC Link is Up Speed is %s Full Duplex\n", speed); + kfree(speed); +} + +/** + * iavf_get_vpe_link_status + * @adapter: adapter structure + * @vpe: virtchnl_pf_event structure + * + * Helper function for determining the link status + **/ +static bool +iavf_get_vpe_link_status(struct iavf_adapter *adapter, + struct virtchnl_pf_event *vpe) +{ + if (ADV_LINK_SUPPORT(adapter)) + return vpe->event_data.link_event_adv.link_status; + else + return vpe->event_data.link_event.link_status; +} + +/** + * iavf_set_adapter_link_speed_from_vpe + * @adapter: adapter structure for which we are setting the link speed + * @vpe: virtchnl_pf_event structure that contains the link speed we are setting + * + * Helper function for setting iavf_adapter link speed + **/ +static void +iavf_set_adapter_link_speed_from_vpe(struct iavf_adapter *adapter, + struct virtchnl_pf_event *vpe) +{ + if (ADV_LINK_SUPPORT(adapter)) + adapter->link_speed_mbps = + vpe->event_data.link_event_adv.link_speed; + else + adapter->link_speed = vpe->event_data.link_event.link_speed; } /** @@ -1160,12 
+1225,11 @@ void iavf_virtchnl_completion(struct iavf_adapter *adapter, if (v_opcode == VIRTCHNL_OP_EVENT) { struct virtchnl_pf_event *vpe = (struct virtchnl_pf_event *)msg; - bool link_up = vpe->event_data.link_event.link_status; + bool link_up = iavf_get_vpe_link_status(adapter, vpe); switch (vpe->event) { case VIRTCHNL_EVENT_LINK_CHANGE: - adapter->link_speed = - vpe->event_data.link_event.link_speed; + iavf_set_adapter_link_speed_from_vpe(adapter, vpe); /* we've already got the right link status, bail */ if (adapter->link_up == link_up) From 5071bda2947f61da0b1c271cf0b16be45c9b81e9 Mon Sep 17 00:00:00 2001 From: Aleksandr Loktionov Date: Fri, 5 Jun 2020 10:09:44 -0700 Subject: [PATCH 54/83] iavf: use appropriate enum for comparison adapter->link_speed has type enum virtchnl_link_speed but our comparisons are against enum iavf_aq_link_speed. Though they are, currently, the same values, change the comparison to the matching enum virtchnl_link_speed since that may not always be the case. Signed-off-by: Aleksandr Loktionov Signed-off-by: Piotr Kwapulinski Signed-off-by: Tony Nguyen Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/iavf/iavf_ethtool.c | 17 ++++++----------- drivers/net/ethernet/intel/iavf/iavf_main.c | 12 ++++++------ drivers/net/ethernet/intel/iavf/iavf_txrx.c | 12 ++++++------ drivers/net/ethernet/intel/iavf/iavf_virtchnl.c | 12 ++++++------ 4 files changed, 24 insertions(+), 29 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c index 40a3fc7c5ea5..b29a5979cce2 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c +++ b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c @@ -291,27 +291,22 @@ static int iavf_get_link_ksettings(struct net_device *netdev, } switch (adapter->link_speed) { - case IAVF_LINK_SPEED_40GB: + case VIRTCHNL_LINK_SPEED_40GB: cmd->base.speed = SPEED_40000; break; - case IAVF_LINK_SPEED_25GB: -#ifdef SPEED_25000 + case VIRTCHNL_LINK_SPEED_25GB: cmd->base.speed = SPEED_25000; -#else - netdev_info(netdev, - "Speed is 25G, display not supported by this version of ethtool.\n"); -#endif break; - case IAVF_LINK_SPEED_20GB: + case VIRTCHNL_LINK_SPEED_20GB: cmd->base.speed = SPEED_20000; break; - case IAVF_LINK_SPEED_10GB: + case VIRTCHNL_LINK_SPEED_10GB: cmd->base.speed = SPEED_10000; break; - case IAVF_LINK_SPEED_1GB: + case VIRTCHNL_LINK_SPEED_1GB: cmd->base.speed = SPEED_1000; break; - case IAVF_LINK_SPEED_100MB: + case VIRTCHNL_LINK_SPEED_100MB: cmd->base.speed = SPEED_100; break; default: diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index a21ae74bcd1b..922f20962a29 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -2498,22 +2498,22 @@ static int iavf_validate_tx_bandwidth(struct iavf_adapter *adapter, } switch (adapter->link_speed) { - case IAVF_LINK_SPEED_40GB: + case VIRTCHNL_LINK_SPEED_40GB: speed = 40000; break; - case IAVF_LINK_SPEED_25GB: + case VIRTCHNL_LINK_SPEED_25GB: speed = 25000; break; - case IAVF_LINK_SPEED_20GB: + case VIRTCHNL_LINK_SPEED_20GB: speed = 20000; break; - case IAVF_LINK_SPEED_10GB: + case VIRTCHNL_LINK_SPEED_10GB: speed = 10000; break; - case IAVF_LINK_SPEED_1GB: + case VIRTCHNL_LINK_SPEED_1GB: speed = 1000; break; - case IAVF_LINK_SPEED_100MB: + case VIRTCHNL_LINK_SPEED_100MB: speed = 100; break; default: diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c index 
7a30d5d5ef53..e091bab7e770 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c @@ -379,19 +379,19 @@ static inline unsigned int iavf_itr_divisor(struct iavf_q_vector *q_vector) unsigned int divisor; switch (q_vector->adapter->link_speed) { - case IAVF_LINK_SPEED_40GB: + case VIRTCHNL_LINK_SPEED_40GB: divisor = IAVF_ITR_ADAPTIVE_MIN_INC * 1024; break; - case IAVF_LINK_SPEED_25GB: - case IAVF_LINK_SPEED_20GB: + case VIRTCHNL_LINK_SPEED_25GB: + case VIRTCHNL_LINK_SPEED_20GB: divisor = IAVF_ITR_ADAPTIVE_MIN_INC * 512; break; default: - case IAVF_LINK_SPEED_10GB: + case VIRTCHNL_LINK_SPEED_10GB: divisor = IAVF_ITR_ADAPTIVE_MIN_INC * 256; break; - case IAVF_LINK_SPEED_1GB: - case IAVF_LINK_SPEED_100MB: + case VIRTCHNL_LINK_SPEED_1GB: + case VIRTCHNL_LINK_SPEED_100MB: divisor = IAVF_ITR_ADAPTIVE_MIN_INC * 32; break; } diff --git a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c index ca79bec4ebd9..c4735589a296 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c +++ b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c @@ -921,22 +921,22 @@ static void iavf_print_link_message(struct iavf_adapter *adapter) } switch (adapter->link_speed) { - case IAVF_LINK_SPEED_40GB: + case VIRTCHNL_LINK_SPEED_40GB: link_speed_mbps = SPEED_40000; break; - case IAVF_LINK_SPEED_25GB: + case VIRTCHNL_LINK_SPEED_25GB: link_speed_mbps = SPEED_25000; break; - case IAVF_LINK_SPEED_20GB: + case VIRTCHNL_LINK_SPEED_20GB: link_speed_mbps = SPEED_20000; break; - case IAVF_LINK_SPEED_10GB: + case VIRTCHNL_LINK_SPEED_10GB: link_speed_mbps = SPEED_10000; break; - case IAVF_LINK_SPEED_1GB: + case VIRTCHNL_LINK_SPEED_1GB: link_speed_mbps = SPEED_1000; break; - case IAVF_LINK_SPEED_100MB: + case VIRTCHNL_LINK_SPEED_100MB: link_speed_mbps = SPEED_100; break; default: From 18c012d922620bb35ff2ab6838f1269bc12cf647 Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Fri, 5 Jun 2020 10:09:45 -0700 Subject: [PATCH 55/83] iavf: Fix reporting 2.5 Gb and 5Gb speeds Commit 4ae4916b5643 ("i40e: fix 'Unknown bps' in dmesg for 2.5Gb/5Gb speeds") added the ability for the PF to report 2.5 and 5Gb speeds, however, the iavf driver does not recognize those speeds as the values were not added there. Add the proper enums and values so that iavf can properly deal with those speeds. 
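In sketch form, the mapping this change introduces (simplified from the diff below, showing only the new 2.5G/5G cases plus a fallback; not a complete driver excerpt):

        switch (adapter->link_speed) {
        case VIRTCHNL_LINK_SPEED_5GB:
                cmd->base.speed = SPEED_5000;
                break;
        case VIRTCHNL_LINK_SPEED_2_5GB:
                cmd->base.speed = SPEED_2500;
                break;
        default:
                cmd->base.speed = SPEED_UNKNOWN;
                break;
        }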
Fixes: 4ae4916b5643 ("i40e: fix 'Unknown bps' in dmesg for 2.5Gb/5Gb speeds") Signed-off-by: Brett Creeley Signed-off-by: Witold Fijalkowski Signed-off-by: Tony Nguyen Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/iavf/iavf_ethtool.c | 6 ++++++ drivers/net/ethernet/intel/iavf/iavf_main.c | 18 ++++++++++++------ .../net/ethernet/intel/iavf/iavf_virtchnl.c | 6 ++++++ 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c index b29a5979cce2..181573822942 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c +++ b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c @@ -303,6 +303,12 @@ static int iavf_get_link_ksettings(struct net_device *netdev, case VIRTCHNL_LINK_SPEED_10GB: cmd->base.speed = SPEED_10000; break; + case VIRTCHNL_LINK_SPEED_5GB: + cmd->base.speed = SPEED_5000; + break; + case VIRTCHNL_LINK_SPEED_2_5GB: + cmd->base.speed = SPEED_2500; + break; case VIRTCHNL_LINK_SPEED_1GB: cmd->base.speed = SPEED_1000; break; diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 922f20962a29..06c481e9ac5c 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -2499,22 +2499,28 @@ static int iavf_validate_tx_bandwidth(struct iavf_adapter *adapter, switch (adapter->link_speed) { case VIRTCHNL_LINK_SPEED_40GB: - speed = 40000; + speed = SPEED_40000; break; case VIRTCHNL_LINK_SPEED_25GB: - speed = 25000; + speed = SPEED_25000; break; case VIRTCHNL_LINK_SPEED_20GB: - speed = 20000; + speed = SPEED_20000; break; case VIRTCHNL_LINK_SPEED_10GB: - speed = 10000; + speed = SPEED_10000; + break; + case VIRTCHNL_LINK_SPEED_5GB: + speed = SPEED_5000; + break; + case VIRTCHNL_LINK_SPEED_2_5GB: + speed = SPEED_2500; break; case VIRTCHNL_LINK_SPEED_1GB: - speed = 1000; + speed = SPEED_1000; break; case VIRTCHNL_LINK_SPEED_100MB: - speed = 100; + speed = SPEED_100; break; default: break; diff --git a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c index c4735589a296..ed08ace4f05a 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c +++ b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c @@ -933,6 +933,12 @@ static void iavf_print_link_message(struct iavf_adapter *adapter) case VIRTCHNL_LINK_SPEED_10GB: link_speed_mbps = SPEED_10000; break; + case VIRTCHNL_LINK_SPEED_5GB: + link_speed_mbps = SPEED_5000; + break; + case VIRTCHNL_LINK_SPEED_2_5GB: + link_speed_mbps = SPEED_2500; + break; case VIRTCHNL_LINK_SPEED_1GB: link_speed_mbps = SPEED_1000; break; From 8e3e4b9da7e62680668f6cf71742207758764458 Mon Sep 17 00:00:00 2001 From: Paul Greenwalt Date: Fri, 5 Jun 2020 10:09:46 -0700 Subject: [PATCH 56/83] iavf: increase reset complete wait time With an increased number of VFs, it's possible to encounter the following issue during reset. iavf b8d4:00:02.0: Hardware reset detected iavf b8d4:00:02.0: Reset never finished (0) iavf b8d4:00:02.0: Reset task did not complete, VF disabled Increase the reset complete wait count to allow for 128 VFs to complete reset. 
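For scale: the reset-complete poll loop sleeps msleep(IAVF_RESET_WAIT_MS) per iteration, so with the constants introduced below the maximum wait grows from roughly 500 * 10 ms = 5 seconds to IAVF_RESET_WAIT_COMPLETE_COUNT * IAVF_RESET_WAIT_MS = 2000 * 10 ms = 20 seconds, while reset detection keeps its original 500-iteration budget as IAVF_RESET_WAIT_DETECTED_COUNT.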
Signed-off-by: Paul Greenwalt Signed-off-by: Tony Nguyen Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/iavf/iavf.h | 4 ++++ drivers/net/ethernet/intel/iavf/iavf_main.c | 12 +++++------- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf.h b/drivers/net/ethernet/intel/iavf/iavf.h index 2d4ce6fdba1a..10b805ba03ee 100644 --- a/drivers/net/ethernet/intel/iavf/iavf.h +++ b/drivers/net/ethernet/intel/iavf/iavf.h @@ -219,6 +219,10 @@ struct iavf_cloud_filter { bool add; /* filter needs to be added */ }; +#define IAVF_RESET_WAIT_MS 10 +#define IAVF_RESET_WAIT_DETECTED_COUNT 500 +#define IAVF_RESET_WAIT_COMPLETE_COUNT 2000 + /* board specific private data structure */ struct iavf_adapter { struct work_struct reset_task; diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 06c481e9ac5c..fa82768e5eda 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -2046,8 +2046,6 @@ static void iavf_disable_vf(struct iavf_adapter *adapter) dev_info(&adapter->pdev->dev, "Reset task did not complete, VF disabled\n"); } -#define IAVF_RESET_WAIT_MS 10 -#define IAVF_RESET_WAIT_COUNT 500 /** * iavf_reset_task - Call-back task to handle hardware reset * @work: pointer to work_struct @@ -2101,20 +2099,20 @@ static void iavf_reset_task(struct work_struct *work) adapter->flags |= IAVF_FLAG_RESET_PENDING; /* poll until we see the reset actually happen */ - for (i = 0; i < IAVF_RESET_WAIT_COUNT; i++) { + for (i = 0; i < IAVF_RESET_WAIT_DETECTED_COUNT; i++) { reg_val = rd32(hw, IAVF_VF_ARQLEN1) & IAVF_VF_ARQLEN1_ARQENABLE_MASK; if (!reg_val) break; usleep_range(5000, 10000); } - if (i == IAVF_RESET_WAIT_COUNT) { + if (i == IAVF_RESET_WAIT_DETECTED_COUNT) { dev_info(&adapter->pdev->dev, "Never saw reset\n"); goto continue_reset; /* act like the reset happened */ } /* wait until the reset is complete and the PF is responding to us */ - for (i = 0; i < IAVF_RESET_WAIT_COUNT; i++) { + for (i = 0; i < IAVF_RESET_WAIT_COMPLETE_COUNT; i++) { /* sleep first to make sure a minimum wait time is met */ msleep(IAVF_RESET_WAIT_MS); @@ -2126,7 +2124,7 @@ static void iavf_reset_task(struct work_struct *work) pci_set_master(adapter->pdev); - if (i == IAVF_RESET_WAIT_COUNT) { + if (i == IAVF_RESET_WAIT_COMPLETE_COUNT) { dev_err(&adapter->pdev->dev, "Reset never finished (%x)\n", reg_val); iavf_disable_vf(adapter); @@ -3429,7 +3427,7 @@ static int iavf_check_reset_complete(struct iavf_hw *hw) u32 rstat; int i; - for (i = 0; i < 100; i++) { + for (i = 0; i < IAVF_RESET_WAIT_COMPLETE_COUNT; i++) { rstat = rd32(hw, IAVF_VFGEN_RSTAT) & IAVF_VFGEN_RSTAT_VFR_STATE_MASK; if ((rstat == VIRTCHNL_VFR_VFACTIVE) || From 42ea9f1b5c625fad225d4ac96a7e757dd4199d9c Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Wed, 6 May 2020 15:59:48 +0300 Subject: [PATCH 57/83] net/mlx5: drain health workqueue in case of driver load error In case there is a work in the health WQ when we teardown the driver, in driver load error flow, the health work will try to read dev->iseg, which was already unmap in mlx5_pci_close(). Fix it by draining the health workqueue first thing in mlx5_pci_close(). 
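A simplified sketch of the offending ordering in the load-error path (not the exact call chain):

	mlx5_load_one() fails
	  mlx5_pci_close(dev)
	    iounmap(dev->iseg);                /* BAR mapping torn down */
	  ...
	  mlx5_fw_fatal_reporter_err_work()    /* still queued on the health WQ */
	    reads dev->iseg                    /* faults on the unmapped address */

Draining the health workqueue before iounmap() closes this window.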
Trace of the error: BUG: unable to handle page fault for address: ffffb5b141c18014 PF: supervisor read access in kernel mode PF: error_code(0x0000) - not-present page PGD 1fe95d067 P4D 1fe95d067 PUD 1fe95e067 PMD 1b7823067 PTE 0 Oops: 0000 [#1] SMP PTI CPU: 3 PID: 6755 Comm: kworker/u128:2 Not tainted 5.2.0-net-next-mlx5-hv_stats-over-last-worked-hyperv #1 Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS 090006 04/28/2016 Workqueue: mlx5_healtha050:00:02.0 mlx5_fw_fatal_reporter_err_work [mlx5_core] RIP: 0010:ioread32be+0x30/0x40 Code: 00 77 27 48 81 ff 00 00 01 00 76 07 0f b7 d7 ed 0f c8 c3 55 48 c7 c6 3b ee d5 9f 48 89 e5 e8 67 fc ff ff b8 ff ff ff ff 5d c3 <8b> 07 0f c8 c3 66 66 2e 0f 1f 84 00 00 00 00 00 48 81 fe ff ff 03 RSP: 0018:ffffb5b14c56fd78 EFLAGS: 00010292 RAX: ffffb5b141c18000 RBX: ffff8e9f78a801c0 RCX: 0000000000000000 RDX: 0000000000000001 RSI: ffff8e9f7ecd7628 RDI: ffffb5b141c18014 RBP: ffffb5b14c56fd90 R08: 0000000000000001 R09: 0000000000000000 R10: ffff8e9f372a2c30 R11: ffff8e9f87f4bc40 R12: ffff8e9f372a1fc0 R13: ffff8e9f78a80000 R14: ffffffffc07136a0 R15: ffff8e9f78ae6f20 FS: 0000000000000000(0000) GS:ffff8e9f7ecc0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffb5b141c18014 CR3: 00000001c8f82006 CR4: 00000000003606e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ? mlx5_health_try_recover+0x4d/0x270 [mlx5_core] mlx5_fw_fatal_reporter_recover+0x16/0x20 [mlx5_core] devlink_health_reporter_recover+0x1c/0x50 devlink_health_report+0xfb/0x240 mlx5_fw_fatal_reporter_err_work+0x65/0xd0 [mlx5_core] process_one_work+0x1fb/0x4e0 ? process_one_work+0x16b/0x4e0 worker_thread+0x4f/0x3d0 kthread+0x10d/0x140 ? process_one_work+0x4e0/0x4e0 ? 
kthread_cancel_delayed_work_sync+0x20/0x20 ret_from_fork+0x1f/0x30 Modules linked in: nfsv3 rpcsec_gss_krb5 nfsv4 nfs fscache 8021q garp mrp stp llc ipmi_devintf ipmi_msghandler rpcrdma rdma_ucm ib_iser rdma_cm ib_umad iw_cm ib_ipoib libiscsi scsi_transport_iscsi ib_cm mlx5_ib ib_uverbs ib_core mlx5_core sb_edac crct10dif_pclmul crc32_pclmul ghash_clmulni_intel aesni_intel aes_x86_64 mlxfw crypto_simd cryptd glue_helper input_leds hyperv_fb intel_rapl_perf joydev serio_raw pci_hyperv pci_hyperv_mini mac_hid hv_balloon nfsd auth_rpcgss nfs_acl lockd grace sunrpc sch_fq_codel ip_tables x_tables autofs4 hv_utils hid_generic hv_storvsc ptp hid_hyperv hid hv_netvsc hyperv_keyboard pps_core scsi_transport_fc psmouse hv_vmbus i2c_piix4 floppy pata_acpi CR2: ffffb5b141c18014 ---[ end trace b12c5503157cad24 ]--- RIP: 0010:ioread32be+0x30/0x40 Code: 00 77 27 48 81 ff 00 00 01 00 76 07 0f b7 d7 ed 0f c8 c3 55 48 c7 c6 3b ee d5 9f 48 89 e5 e8 67 fc ff ff b8 ff ff ff ff 5d c3 <8b> 07 0f c8 c3 66 66 2e 0f 1f 84 00 00 00 00 00 48 81 fe ff ff 03 RSP: 0018:ffffb5b14c56fd78 EFLAGS: 00010292 RAX: ffffb5b141c18000 RBX: ffff8e9f78a801c0 RCX: 0000000000000000 RDX: 0000000000000001 RSI: ffff8e9f7ecd7628 RDI: ffffb5b141c18014 RBP: ffffb5b14c56fd90 R08: 0000000000000001 R09: 0000000000000000 R10: ffff8e9f372a2c30 R11: ffff8e9f87f4bc40 R12: ffff8e9f372a1fc0 R13: ffff8e9f78a80000 R14: ffffffffc07136a0 R15: ffff8e9f78ae6f20 FS: 0000000000000000(0000) GS:ffff8e9f7ecc0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffb5b141c18014 CR3: 00000001c8f82006 CR4: 00000000003606e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 BUG: sleeping function called from invalid context at ./include/linux/percpu-rwsem.h:38 in_atomic(): 0, irqs_disabled(): 1, pid: 6755, name: kworker/u128:2 INFO: lockdep is turned off. CPU: 3 PID: 6755 Comm: kworker/u128:2 Tainted: G D 5.2.0-net-next-mlx5-hv_stats-over-last-worked-hyperv #1 Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS 090006 04/28/2016 Workqueue: mlx5_healtha050:00:02.0 mlx5_fw_fatal_reporter_err_work [mlx5_core] Call Trace: dump_stack+0x63/0x88 ___might_sleep+0x10a/0x130 __might_sleep+0x4a/0x80 exit_signals+0x33/0x230 ? blocking_notifier_call_chain+0x16/0x20 do_exit+0xb1/0xc30 ? kthread+0x10d/0x140 ? process_one_work+0x4e0/0x4e0 Fixes: 52c368dc3da7 ("net/mlx5: Move health and page alloc init to mdev_init") Signed-off-by: Shay Drory Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index df46b1fce3a7..18d6c3752abe 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -785,6 +785,11 @@ static int mlx5_pci_init(struct mlx5_core_dev *dev, struct pci_dev *pdev, static void mlx5_pci_close(struct mlx5_core_dev *dev) { + /* health work might still be active, and it needs pci bar in + * order to know the NIC state. 
Therefore, drain the health WQ + * before removing the pci bars + */ + mlx5_drain_health_wq(dev); iounmap(dev->iseg); pci_clear_master(dev->pdev); release_bar(dev->pdev); From b6e0b6bebe0732d5cac51f0791f269d2413b8980 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Thu, 7 May 2020 09:32:53 +0300 Subject: [PATCH 58/83] net/mlx5: Fix fatal error handling during device load Currently, in case of fatal error during mlx5_load_one(), we cannot enter error state until mlx5_load_one() is finished, what can take several minutes until commands will get timeouts, because these commands can't be processed due to the fatal error. Fix it by setting dev->state as MLX5_DEVICE_STATE_INTERNAL_ERROR before requesting the lock. Fixes: c1d4d2e92ad6 ("net/mlx5: Avoid calling sleeping function by the health poll thread") Signed-off-by: Shay Drory Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/health.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index c0cfbab15fe9..b31f769d2df9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -192,15 +192,23 @@ static bool reset_fw_if_needed(struct mlx5_core_dev *dev) void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force) { + bool err_detected = false; + + /* Mark the device as fatal in order to abort FW commands */ + if ((check_fatal_sensors(dev) || force) && + dev->state == MLX5_DEVICE_STATE_UP) { + dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR; + err_detected = true; + } mutex_lock(&dev->intf_state_mutex); - if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) - goto unlock; + if (!err_detected && dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) + goto unlock;/* a previous error is still being handled */ if (dev->state == MLX5_DEVICE_STATE_UNINITIALIZED) { dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR; goto unlock; } - if (check_fatal_sensors(dev) || force) { + if (check_fatal_sensors(dev) || force) { /* protected state setting */ dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR; mlx5_cmd_flush(dev); } From 47a357de2b6b706af3c9471d5042f9ba8907031e Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Mon, 1 Jun 2020 19:45:26 +0300 Subject: [PATCH 59/83] net/mlx5: DR, Fix freeing in dr_create_rc_qp() Variable "in" in dr_create_rc_qp() is allocated with kvzalloc() and should be freed with kvfree(). 
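The underlying rule: memory obtained with kvmalloc()/kvzalloc() may have been served by vmalloc() rather than the slab allocator, so it must be released with kvfree(); kfree() is only correct for kmalloc()-family allocations. A minimal illustration with a hypothetical buffer (not the driver code itself):

	void *buf = kvzalloc(size, GFP_KERNEL);	/* slab, or vmalloc fallback for large sizes */
	if (!buf)
		return -ENOMEM;
	/* ... use buf ... */
	kvfree(buf);				/* safe for either backing; kfree() would not be */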
Fixes: 297cccebdc5a ("net/mlx5: DR, Expose an internal API to issue RDMA operations") Cc: stable@vger.kernel.org Signed-off-by: Denis Efremov Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c index f421013b0b54..2ca79b9bde1f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c @@ -179,7 +179,7 @@ static struct mlx5dr_qp *dr_create_rc_qp(struct mlx5_core_dev *mdev, MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP); err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out)); dr_qp->qpn = MLX5_GET(create_qp_out, out, qpn); - kfree(in); + kvfree(in); if (err) goto err_in; dr_qp->uar = attr->uar; From 36d45fb9d2fdf348d778bfe73f0427db1c6f9bc7 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Mon, 1 Jun 2020 16:03:44 +0300 Subject: [PATCH 60/83] net/mlx5e: Fix repeated XSK usage on one channel After an XSK is closed, the relevant structures in the channel are not zeroed. If an XSK is opened the second time on the same channel without recreating channels, the stray values in the structures will lead to incorrect operation of queues, which causes CQE errors, and the new socket doesn't work at all. This patch fixes the issue by explicitly zeroing XSK-related structs in the channel on XSK close. Note that those structs are zeroed on channel creation, and usually a configuration change (XDP program is set) happens on XSK open, which leads to recreating channels, so typical XSK usecases don't suffer from this issue. However, if XSKs are opened and closed on the same channel without removing the XDP program, this bug reproduces. Fixes: db05815b36cb ("net/mlx5e: Add XSK zero-copy support") Signed-off-by: Maxim Mikityanskiy Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c index c28cbae42331..2c80205dc939 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c @@ -152,6 +152,10 @@ void mlx5e_close_xsk(struct mlx5e_channel *c) mlx5e_close_cq(&c->xskicosq.cq); mlx5e_close_xdpsq(&c->xsksq); mlx5e_close_cq(&c->xsksq.cq); + + memset(&c->xskrq, 0, sizeof(c->xskrq)); + memset(&c->xsksq, 0, sizeof(c->xsksq)); + memset(&c->xskicosq, 0, sizeof(c->xskicosq)); } void mlx5e_activate_xsk(struct mlx5e_channel *c) From 5f1572e6178e47c3ace55ced187d93240952c9cd Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Sun, 17 May 2020 12:45:52 +0300 Subject: [PATCH 61/83] net/mlx5e: Fix ethtool hfunc configuration change Changing RX hash function requires rearranging of RQT internal indexes, the user isn't exposed to such changes and these changes do not affect the user configured indirection table. Rebuild RQ table on hfunc change. 
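Put differently, the rework below tracks the two refresh actions independently (sketch of the resulting logic):

	hfunc changed                -> refresh_rqt = true, refresh_tirs = true
	indirection table supplied   -> refresh_rqt = true
	hash key supplied (Toeplitz) -> refresh_tirs = true

so mlx5e_redirect_rqt() now runs whenever the RQT contents must be rebuilt, not only when the user passes a new indirection table, and mlx5e_modify_tirs_hash() still runs only when the TIR hash configuration changes.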
Fixes: bdfc028de1b3 ("net/mlx5e: Fix ethtool RX hash func configuration change") Signed-off-by: Aya Levin Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- .../ethernet/mellanox/mlx5/core/en_ethtool.c | 41 ++++++++++--------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 3ef2525e8de9..ec5658bbe3c5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -1173,7 +1173,8 @@ int mlx5e_set_rxfh(struct net_device *dev, const u32 *indir, struct mlx5e_priv *priv = netdev_priv(dev); struct mlx5e_rss_params *rss = &priv->rss_params; int inlen = MLX5_ST_SZ_BYTES(modify_tir_in); - bool hash_changed = false; + bool refresh_tirs = false; + bool refresh_rqt = false; void *in; if ((hfunc != ETH_RSS_HASH_NO_CHANGE) && @@ -1189,36 +1190,38 @@ int mlx5e_set_rxfh(struct net_device *dev, const u32 *indir, if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != rss->hfunc) { rss->hfunc = hfunc; - hash_changed = true; + refresh_rqt = true; + refresh_tirs = true; } if (indir) { memcpy(rss->indirection_rqt, indir, sizeof(rss->indirection_rqt)); - - if (test_bit(MLX5E_STATE_OPENED, &priv->state)) { - u32 rqtn = priv->indir_rqt.rqtn; - struct mlx5e_redirect_rqt_param rrp = { - .is_rss = true, - { - .rss = { - .hfunc = rss->hfunc, - .channels = &priv->channels, - }, - }, - }; - - mlx5e_redirect_rqt(priv, rqtn, MLX5E_INDIR_RQT_SIZE, rrp); - } + refresh_rqt = true; } if (key) { memcpy(rss->toeplitz_hash_key, key, sizeof(rss->toeplitz_hash_key)); - hash_changed = hash_changed || rss->hfunc == ETH_RSS_HASH_TOP; + refresh_tirs = refresh_tirs || rss->hfunc == ETH_RSS_HASH_TOP; } - if (hash_changed) + if (refresh_rqt && test_bit(MLX5E_STATE_OPENED, &priv->state)) { + struct mlx5e_redirect_rqt_param rrp = { + .is_rss = true, + { + .rss = { + .hfunc = rss->hfunc, + .channels = &priv->channels, + }, + }, + }; + u32 rqtn = priv->indir_rqt.rqtn; + + mlx5e_redirect_rqt(priv, rqtn, MLX5E_INDIR_RQT_SIZE, rrp); + } + + if (refresh_tirs) mlx5e_modify_tirs_hash(priv, in); mutex_unlock(&priv->state_lock); From 60904cd349abc98cb888fc28d1ca55a8e2cf87b3 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Thu, 14 May 2020 05:12:56 -0500 Subject: [PATCH 62/83] net/mlx5: Disable reload while removing the device While unregistration is in progress, user might be reloading the interface. This can race with unregistration in below flow which uses the resources which are getting disabled by reload flow. Hence, disable the devlink reloading first when removing the device. 
CPU0 CPU1 ---- ---- local_pci_remove() devlink_mutex remove_one() devlink_nl_cmd_reload() mlx5_unregister_device() devlink_reload() ops->reload_down() mlx5_unload_one() Fixes: 4383cfcc65e7 ("net/mlx5: Add devlink reload") Signed-off-by: Parav Pandit Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/devlink.c | 2 -- drivers/net/ethernet/mellanox/mlx5/core/main.c | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c index e94f0c4d74a7..a99fe4b02b9b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c @@ -283,7 +283,6 @@ int mlx5_devlink_register(struct devlink *devlink, struct device *dev) goto params_reg_err; mlx5_devlink_set_params_init_values(devlink); devlink_params_publish(devlink); - devlink_reload_enable(devlink); return 0; params_reg_err: @@ -293,7 +292,6 @@ int mlx5_devlink_register(struct devlink *devlink, struct device *dev) void mlx5_devlink_unregister(struct devlink *devlink) { - devlink_reload_disable(devlink); devlink_params_unregister(devlink, mlx5_devlink_params, ARRAY_SIZE(mlx5_devlink_params)); devlink_unregister(devlink); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 18d6c3752abe..2729afc13ab4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1367,6 +1367,7 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *id) dev_err(&pdev->dev, "mlx5_crdump_enable failed with error code %d\n", err); pci_save_state(pdev); + devlink_reload_enable(devlink); return 0; err_load_one: @@ -1384,6 +1385,7 @@ static void remove_one(struct pci_dev *pdev) struct mlx5_core_dev *dev = pci_get_drvdata(pdev); struct devlink *devlink = priv_to_devlink(dev); + devlink_reload_disable(devlink); mlx5_crdump_disable(dev); mlx5_devlink_unregister(devlink); From 98f91c45769302b26e781f949b07a90df3c5cbda Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Fri, 15 May 2020 02:44:06 -0500 Subject: [PATCH 63/83] net/mlx5: Fix devlink objects and devlink device unregister sequence Current below problems exists. 1. devlink device is registered by mlx5_load_one(). But it is not unregistered by mlx5_unload_one(). This is incorrect. 2. Above issue leads to, When mlx5 PCI device is removed, currently devlink device is unregistered before devlink ports are unregistered in below ladder diagram. remove_one() mlx5_devlink_unregister() [..] devlink_unregister() <- ports are still registered! mlx5_unload_one() mlx5_unregister_device() mlx5_remove_device() mlx5e_remove() mlx5e_devlink_port_unregister() devlink_port_unregister() 3. Condition checking for registering and unregister device are not symmetric either in these routines. Hence, fix the sequence by having load and unload routines symmetric and in right order. i.e. (a) register devlink device followed by registering devlink ports (b) unregister devlink ports followed by devlink device Do this based on boot and cleanup flags instead of different conditions. 
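The resulting ordering is symmetric (sketch; see the main.c hunks below):

	mlx5_load_one(dev, boot=true)            mlx5_unload_one(dev, cleanup=true)
	  mlx5_devlink_register()                  mlx5_unregister_device()   /* devlink ports removed */
	  mlx5_register_device()   /* ports */     mlx5_devlink_unregister()  /* devlink device last */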
Fixes: c6acd629eec7 ("net/mlx5e: Add support for devlink-port in non-representors mode") Fixes: f60f315d339e ("net/mlx5e: Register devlink ports for physical link, PCI PF, VFs") Signed-off-by: Parav Pandit Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/main.c | 29 +++++++++---------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 2729afc13ab4..e786c5c75dba 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1199,23 +1199,22 @@ int mlx5_load_one(struct mlx5_core_dev *dev, bool boot) if (err) goto err_load; + set_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state); + if (boot) { err = mlx5_devlink_register(priv_to_devlink(dev), dev->device); if (err) goto err_devlink_reg; - } - - if (mlx5_device_registered(dev)) - mlx5_attach_device(dev); - else mlx5_register_device(dev); - - set_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state); + } else { + mlx5_attach_device(dev); + } mutex_unlock(&dev->intf_state_mutex); return 0; err_devlink_reg: + clear_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state); mlx5_unload(dev); err_load: if (boot) @@ -1231,10 +1230,15 @@ int mlx5_load_one(struct mlx5_core_dev *dev, bool boot) void mlx5_unload_one(struct mlx5_core_dev *dev, bool cleanup) { - if (cleanup) - mlx5_unregister_device(dev); - mutex_lock(&dev->intf_state_mutex); + + if (cleanup) { + mlx5_unregister_device(dev); + mlx5_devlink_unregister(priv_to_devlink(dev)); + } else { + mlx5_detach_device(dev); + } + if (!test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state)) { mlx5_core_warn(dev, "%s: interface is down, NOP\n", __func__); @@ -1245,9 +1249,6 @@ void mlx5_unload_one(struct mlx5_core_dev *dev, bool cleanup) clear_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state); - if (mlx5_device_registered(dev)) - mlx5_detach_device(dev); - mlx5_unload(dev); if (cleanup) @@ -1387,8 +1388,6 @@ static void remove_one(struct pci_dev *pdev) devlink_reload_disable(devlink); mlx5_crdump_disable(dev); - mlx5_devlink_unregister(devlink); - mlx5_drain_health_wq(dev); mlx5_unload_one(dev, true); mlx5_pci_close(dev); From 0d156f2deda8675c29fa2b8b5ed9b374370e47f2 Mon Sep 17 00:00:00 2001 From: Oz Shlomo Date: Sun, 7 Jun 2020 15:40:40 +0000 Subject: [PATCH 64/83] net/mlx5e: CT: Fix ipv6 nat header rewrite actions Set the ipv6 word fields according to the hardware definitions. 
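To spell the mapping out: the address is rewritten as four 32-bit words, and the hardware field names refer to bit positions within the 128-bit (big-endian) IPv6 address, so the last word in the header (offset + 12) carries bits 31..0 while the first word (offset + 0) carries bits 127..96. For the source address that gives:

	offsetof(struct ipv6hdr, saddr) + 12  ->  MLX5_ACTION_IN_FIELD_OUT_SIPV6_31_0
	offsetof(struct ipv6hdr, saddr) + 8   ->  MLX5_ACTION_IN_FIELD_OUT_SIPV6_63_32
	offsetof(struct ipv6hdr, saddr) + 4   ->  MLX5_ACTION_IN_FIELD_OUT_SIPV6_95_64
	offsetof(struct ipv6hdr, saddr)       ->  MLX5_ACTION_IN_FIELD_OUT_SIPV6_127_96

with the destination address following the same pattern; the previous code had this mapping inverted.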
Fixes: ac991b48d43c ("net/mlx5e: CT: Offload established flows") Signed-off-by: Oz Shlomo Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- .../ethernet/mellanox/mlx5/core/en/tc_ct.c | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c index afc19dca1f5f..430025550fad 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c @@ -328,21 +328,21 @@ mlx5_tc_ct_parse_mangle_to_mod_act(struct flow_action_entry *act, case FLOW_ACT_MANGLE_HDR_TYPE_IP6: MLX5_SET(set_action_in, modact, length, 0); - if (offset == offsetof(struct ipv6hdr, saddr)) + if (offset == offsetof(struct ipv6hdr, saddr) + 12) field = MLX5_ACTION_IN_FIELD_OUT_SIPV6_31_0; - else if (offset == offsetof(struct ipv6hdr, saddr) + 4) - field = MLX5_ACTION_IN_FIELD_OUT_SIPV6_63_32; else if (offset == offsetof(struct ipv6hdr, saddr) + 8) + field = MLX5_ACTION_IN_FIELD_OUT_SIPV6_63_32; + else if (offset == offsetof(struct ipv6hdr, saddr) + 4) field = MLX5_ACTION_IN_FIELD_OUT_SIPV6_95_64; - else if (offset == offsetof(struct ipv6hdr, saddr) + 12) + else if (offset == offsetof(struct ipv6hdr, saddr)) field = MLX5_ACTION_IN_FIELD_OUT_SIPV6_127_96; - else if (offset == offsetof(struct ipv6hdr, daddr)) - field = MLX5_ACTION_IN_FIELD_OUT_DIPV6_31_0; - else if (offset == offsetof(struct ipv6hdr, daddr) + 4) - field = MLX5_ACTION_IN_FIELD_OUT_DIPV6_63_32; - else if (offset == offsetof(struct ipv6hdr, daddr) + 8) - field = MLX5_ACTION_IN_FIELD_OUT_DIPV6_95_64; else if (offset == offsetof(struct ipv6hdr, daddr) + 12) + field = MLX5_ACTION_IN_FIELD_OUT_DIPV6_31_0; + else if (offset == offsetof(struct ipv6hdr, daddr) + 8) + field = MLX5_ACTION_IN_FIELD_OUT_DIPV6_63_32; + else if (offset == offsetof(struct ipv6hdr, daddr) + 4) + field = MLX5_ACTION_IN_FIELD_OUT_DIPV6_95_64; + else if (offset == offsetof(struct ipv6hdr, daddr)) field = MLX5_ACTION_IN_FIELD_OUT_DIPV6_127_96; else return -EOPNOTSUPP; From 17e73d47cd095154878dfedd4918d6a9482eba13 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 2 Jun 2020 15:28:37 +0300 Subject: [PATCH 65/83] net/mlx5: Don't fail driver on failure to create debugfs Clang warns: drivers/net/ethernet/mellanox/mlx5/core/main.c:1278:6: warning: variable 'err' is used uninitialized whenever 'if' condition is true [-Wsometimes-uninitialized] if (!priv->dbg_root) { ^~~~~~~~~~~~~~~ drivers/net/ethernet/mellanox/mlx5/core/main.c:1303:9: note: uninitialized use occurs here return err; ^~~ drivers/net/ethernet/mellanox/mlx5/core/main.c:1278:2: note: remove the 'if' if its condition is always false if (!priv->dbg_root) { ^~~~~~~~~~~~~~~~~~~~~~ drivers/net/ethernet/mellanox/mlx5/core/main.c:1259:9: note: initialize the variable 'err' to silence this warning int err; ^ = 0 1 warning generated. The check of returned value of debugfs_create_dir() is wrong because by the design debugfs failures should never fail the driver and the check itself was wrong too. The kernel compiled without CONFIG_DEBUG_FS will return ERR_PTR(-ENODEV) and not NULL as expected. 
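The idiomatic pattern, which the patch below adopts, is therefore to ignore the return value entirely; later debugfs_create_*() calls are designed to cope with an error-pointer parent, so nothing else needs to change:

	priv->dbg_root = debugfs_create_dir(dev_name(dev->device), mlx5_debugfs_root);
	/* intentionally unchecked: debugfs is best effort and must never fail the probe */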
Fixes: 11f3b84d7068 ("net/mlx5: Split mdev init and pci init") Link: https://github.com/ClangBuiltLinux/linux/issues/1042 Reported-by: Nathan Chancellor Signed-off-by: Leon Romanovsky Reviewed-by: Nathan Chancellor Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index e786c5c75dba..8b658908f044 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1281,11 +1281,6 @@ static int mlx5_mdev_init(struct mlx5_core_dev *dev, int profile_idx) priv->dbg_root = debugfs_create_dir(dev_name(dev->device), mlx5_debugfs_root); - if (!priv->dbg_root) { - dev_err(dev->device, "mlx5_core: error, Cannot create debugfs dir, aborting\n"); - goto err_dbg_root; - } - err = mlx5_health_init(dev); if (err) goto err_health_init; @@ -1300,7 +1295,6 @@ static int mlx5_mdev_init(struct mlx5_core_dev *dev, int profile_idx) mlx5_health_cleanup(dev); err_health_init: debugfs_remove(dev->priv.dbg_root); -err_dbg_root: mutex_destroy(&priv->pgdir_mutex); mutex_destroy(&priv->alloc_mutex); mutex_destroy(&priv->bfregs.wc_head.lock); From 09a9297574cb10b3d9fe722b2baa9a379b2d289c Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 3 Jun 2020 20:54:36 +0300 Subject: [PATCH 66/83] net/mlx5: E-Switch, Fix some error pointer dereferences We can't leave "counter" set to an error pointer. Otherwise either it will lead to an error pointer dereference later in the function or it leads to an error pointer dereference when we call mlx5_fc_destroy(). Fixes: 07bab9502641d ("net/mlx5: E-Switch, Refactor eswitch ingress acl codes") Signed-off-by: Dan Carpenter Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/esw/acl/ingress_lgcy.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_lgcy.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_lgcy.c index 9bda4fe2eafa..5dc335e621c5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_lgcy.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/acl/ingress_lgcy.c @@ -162,10 +162,12 @@ int esw_acl_ingress_lgcy_setup(struct mlx5_eswitch *esw, if (MLX5_CAP_ESW_INGRESS_ACL(esw->dev, flow_counter)) { counter = mlx5_fc_create(esw->dev, false); - if (IS_ERR(counter)) + if (IS_ERR(counter)) { esw_warn(esw->dev, "vport[%d] configure ingress drop rule counter failed\n", vport->vport); + counter = NULL; + } vport->ingress.legacy.drop_counter = counter; } @@ -272,7 +274,7 @@ void esw_acl_ingress_lgcy_cleanup(struct mlx5_eswitch *esw, esw_acl_ingress_table_destroy(vport); clean_drop_counter: - if (!IS_ERR_OR_NULL(vport->ingress.legacy.drop_counter)) { + if (vport->ingress.legacy.drop_counter) { mlx5_fc_destroy(esw->dev, vport->ingress.legacy.drop_counter); vport->ingress.legacy.drop_counter = NULL; } From 2ad6691d988c0c611362ddc2aad89e0fb50e3261 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 11 Jun 2020 21:57:00 +0100 Subject: [PATCH 67/83] rxrpc: Fix race between incoming ACK parser and retransmitter There's a race between the retransmission code and the received ACK parser. 
The problem is that the retransmission loop has to drop the lock under which it is iterating through the transmission buffer in order to transmit a packet, but whilst the lock is dropped, the ACK parser can crank the Tx window round and discard the packets from the buffer. The retransmission code then updated the annotations for the wrong packet and a later retransmission thought it had to retransmit a packet that wasn't there, leading to a NULL pointer dereference. Fix this by: (1) Moving the annotation change to before we drop the lock prior to transmission. This means we can't vary the annotation depending on the outcome of the transmission, but that's fine - we'll retransmit again later if it failed now. (2) Skipping the packet if the skb pointer is NULL. The following oops was seen: BUG: kernel NULL pointer dereference, address: 000000000000002d Workqueue: krxrpcd rxrpc_process_call RIP: 0010:rxrpc_get_skb+0x14/0x8a ... Call Trace: rxrpc_resend+0x331/0x41e ? get_vtime_delta+0x13/0x20 rxrpc_process_call+0x3c0/0x4ac process_one_work+0x18f/0x27f worker_thread+0x1a3/0x247 ? create_worker+0x17d/0x17d kthread+0xe6/0xeb ? kthread_delayed_work_timer_fn+0x83/0x83 ret_from_fork+0x1f/0x30 Fixes: 248f219cb8bc ("rxrpc: Rewrite the data and ack handling code") Signed-off-by: David Howells Signed-off-by: David S. Miller --- net/rxrpc/call_event.c | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 61a51c251e1b..aa1c8eee6557 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -248,7 +248,18 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) if (anno_type != RXRPC_TX_ANNO_RETRANS) continue; + /* We need to reset the retransmission state, but we need to do + * so before we drop the lock as a new ACK/NAK may come in and + * confuse things + */ + annotation &= ~RXRPC_TX_ANNO_MASK; + annotation |= RXRPC_TX_ANNO_RESENT; + call->rxtx_annotations[ix] = annotation; + skb = call->rxtx_buffer[ix]; + if (!skb) + continue; + rxrpc_get_skb(skb, rxrpc_skb_got); spin_unlock_bh(&call->lock); @@ -262,24 +273,6 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) rxrpc_free_skb(skb, rxrpc_skb_freed); spin_lock_bh(&call->lock); - - /* We need to clear the retransmit state, but there are two - * things we need to be aware of: A new ACK/NAK might have been - * received and the packet might have been hard-ACK'd (in which - * case it will no longer be in the buffer). - */ - if (after(seq, call->tx_hard_ack)) { - annotation = call->rxtx_annotations[ix]; - anno_type = annotation & RXRPC_TX_ANNO_MASK; - if (anno_type == RXRPC_TX_ANNO_RETRANS || - anno_type == RXRPC_TX_ANNO_NAK) { - annotation &= ~RXRPC_TX_ANNO_MASK; - annotation |= RXRPC_TX_ANNO_UNACK; - } - annotation |= RXRPC_TX_ANNO_RESENT; - call->rxtx_annotations[ix] = annotation; - } - if (after(call->tx_hard_ack, seq)) seq = call->tx_hard_ack; } From c25cba3689c7ab5ae6ee7228d1d49a5652429229 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Thu, 11 Jun 2020 17:18:15 -0700 Subject: [PATCH 68/83] ionic: add pcie_print_link_status Print the PCIe link information for our device. Fixes: 77f972a7077d ("ionic: remove support for mgmt device") Signed-off-by: Shannon Nelson Signed-off-by: David S. 
Miller --- drivers/net/ethernet/pensando/ionic/ionic_bus_pci.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_bus_pci.c b/drivers/net/ethernet/pensando/ionic/ionic_bus_pci.c index 0ac6acbc5f31..2924cde440aa 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_bus_pci.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_bus_pci.c @@ -248,6 +248,7 @@ static int ionic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) } pci_set_master(pdev); + pcie_print_link_status(pdev); err = ionic_map_bars(ionic); if (err) From 8730f45d1ca5ff60033f5ba022f32e5379d7bb89 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 11 Jun 2020 14:48:30 -0500 Subject: [PATCH 69/83] net: ipa: program metadata mask differently The way the mask value is programmed for QMAP RX endpoints was based on some wrong assumptions about the way metadata containing the QMAP mux_id value is formatted. The metadata value supplied by the modem is *not* in QMAP format, and in fact contains the mux_id we want in its (big endian) low-order byte. That byte must be written by the IPA into offset 1 of the QMAP header it inserts before the received packet. QMAP TX endpoints *do* use a QMAP header as the metadata sent with each packet. The modem assumes this, and based on that assumes the mux_id is in the second byte. To match those assumptions we must program the modem TX (QMAP) endpoint HDR register to indicate the metadata will be found at offset 0 in the message header. The previous configuration managed to work, but it was not working correctly. This patch fixes a bug whose symptom was receipt of messages containing the wrong QMAP mux_id. In fixing this, get rid of ipa_rmnet_mux_id_metadata_mask(), which was more or less defined so there was a separate place to explain what was happening as we generated the mask value. Instead, put a longer description of how this works above ipa_endpoint_init_hdr(), and define the metadata mask to use as a simple constant. Signed-off-by: Alex Elder Signed-off-by: David S. Miller --- drivers/net/ipa/ipa_endpoint.c | 74 ++++++++++++++++++---------------- 1 file changed, 40 insertions(+), 34 deletions(-) diff --git a/drivers/net/ipa/ipa_endpoint.c b/drivers/net/ipa/ipa_endpoint.c index 66649a806dd1..2825dca23ec4 100644 --- a/drivers/net/ipa/ipa_endpoint.c +++ b/drivers/net/ipa/ipa_endpoint.c @@ -32,6 +32,9 @@ /* The amount of RX buffer space consumed by standard skb overhead */ #define IPA_RX_BUFFER_OVERHEAD (PAGE_SIZE - SKB_MAX_ORDER(NET_SKB_PAD, 0)) +/* Where to find the QMAP mux_id for a packet within modem-supplied metadata */ +#define IPA_ENDPOINT_QMAP_METADATA_MASK 0x000000ff /* host byte order */ + #define IPA_ENDPOINT_RESET_AGGR_RETRY_MAX 3 #define IPA_AGGR_TIME_LIMIT_DEFAULT 1000 /* microseconds */ @@ -433,6 +436,24 @@ static void ipa_endpoint_init_cfg(struct ipa_endpoint *endpoint) iowrite32(val, endpoint->ipa->reg_virt + offset); } +/** + * We program QMAP endpoints so each packet received is preceded by a QMAP + * header structure. The QMAP header contains a 1-byte mux_id and 2-byte + * packet size field, and we have the IPA hardware populate both for each + * received packet. The header is configured (in the HDR_EXT register) + * to use big endian format. + * + * The packet size is written into the QMAP header's pkt_len field. That + * location is defined here using the HDR_OFST_PKT_SIZE field. + * + * The mux_id comes from a 4-byte metadata value supplied with each packet + * by the modem. 
It is *not* a QMAP header, but it does contain the mux_id + * value that we want, in its low-order byte. A bitmask defined in the + * endpoint's METADATA_MASK register defines which byte within the modem + * metadata contains the mux_id. And the OFST_METADATA field programmed + * here indicates where the extracted byte should be placed within the QMAP + * header. + */ static void ipa_endpoint_init_hdr(struct ipa_endpoint *endpoint) { u32 offset = IPA_REG_ENDP_INIT_HDR_N_OFFSET(endpoint->endpoint_id); @@ -441,25 +462,31 @@ static void ipa_endpoint_init_hdr(struct ipa_endpoint *endpoint) if (endpoint->data->qmap) { size_t header_size = sizeof(struct rmnet_map_header); + /* We might supply a checksum header after the QMAP header */ if (endpoint->toward_ipa && endpoint->data->checksum) header_size += sizeof(struct rmnet_map_ul_csum_header); - val |= u32_encode_bits(header_size, HDR_LEN_FMASK); - /* metadata is the 4 byte rmnet_map header itself */ - val |= HDR_OFST_METADATA_VALID_FMASK; - val |= u32_encode_bits(0, HDR_OFST_METADATA_FMASK); - /* HDR_ADDITIONAL_CONST_LEN is 0; (IPA->AP only) */ - if (!endpoint->toward_ipa) { - u32 size_offset = offsetof(struct rmnet_map_header, - pkt_len); + /* Define how to fill mux_id in a received QMAP header */ + if (!endpoint->toward_ipa) { + u32 off; /* Field offset within header */ + + /* Where IPA will write the metadata value */ + off = offsetof(struct rmnet_map_header, mux_id); + val |= u32_encode_bits(off, HDR_OFST_METADATA_FMASK); + + /* Where IPA will write the length */ + off = offsetof(struct rmnet_map_header, pkt_len); val |= HDR_OFST_PKT_SIZE_VALID_FMASK; - val |= u32_encode_bits(size_offset, - HDR_OFST_PKT_SIZE_FMASK); + val |= u32_encode_bits(off, HDR_OFST_PKT_SIZE_FMASK); } + /* For QMAP TX, metadata offset is 0 (modem assumes this) */ + val |= HDR_OFST_METADATA_VALID_FMASK; + + /* HDR_ADDITIONAL_CONST_LEN is 0; (RX only) */ /* HDR_A5_MUX is 0 */ /* HDR_LEN_INC_DEAGG_HDR is 0 */ - /* HDR_METADATA_REG_VALID is 0; (AP->IPA only) */ + /* HDR_METADATA_REG_VALID is 0 (TX only) */ } iowrite32(val, endpoint->ipa->reg_virt + offset); @@ -482,28 +509,6 @@ static void ipa_endpoint_init_hdr_ext(struct ipa_endpoint *endpoint) iowrite32(val, endpoint->ipa->reg_virt + offset); } -/** - * Generate a metadata mask value that will select only the mux_id - * field in an rmnet_map header structure. The mux_id is at offset - * 1 byte from the beginning of the structure, but the metadata - * value is treated as a 4-byte unit. So this mask must be computed - * with endianness in mind. Note that ipa_endpoint_init_hdr_metadata_mask() - * will convert this value to the proper byte order. - * - * Marked __always_inline because this is really computing a - * constant value. 
- */ -static __always_inline __be32 ipa_rmnet_mux_id_metadata_mask(void) -{ - size_t mux_id_offset = offsetof(struct rmnet_map_header, mux_id); - u32 mux_id_mask = 0; - u8 *bytes; - - bytes = (u8 *)&mux_id_mask; - bytes[mux_id_offset] = 0xff; /* mux_id is 1 byte */ - - return cpu_to_be32(mux_id_mask); -} static void ipa_endpoint_init_hdr_metadata_mask(struct ipa_endpoint *endpoint) { @@ -513,8 +518,9 @@ static void ipa_endpoint_init_hdr_metadata_mask(struct ipa_endpoint *endpoint) offset = IPA_REG_ENDP_INIT_HDR_METADATA_MASK_N_OFFSET(endpoint_id); + /* Note that HDR_ENDIANNESS indicates big endian header fields */ if (!endpoint->toward_ipa && endpoint->data->qmap) - val = ipa_rmnet_mux_id_metadata_mask(); + val = cpu_to_be32(IPA_ENDPOINT_QMAP_METADATA_MASK); iowrite32(val, endpoint->ipa->reg_virt + offset); } From 9b8ad8dab994f4cba682ca6110bda37f3dcd4b83 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 11 Jun 2020 14:48:31 -0500 Subject: [PATCH 70/83] net: ipa: fix modem LAN RX endpoint id The endpoint id assigned to the modem LAN RX endpoint for the SC7180 SoC is incorrect. The erroneous value might have been copied from SDM845 and never updated. The correct endpoint id to use for this SoC is 11. Signed-off-by: Alex Elder Signed-off-by: David S. Miller --- drivers/net/ipa/ipa_data-sc7180.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ipa/ipa_data-sc7180.c b/drivers/net/ipa/ipa_data-sc7180.c index 43faa35ae726..d4c2bc7ad24b 100644 --- a/drivers/net/ipa/ipa_data-sc7180.c +++ b/drivers/net/ipa/ipa_data-sc7180.c @@ -106,7 +106,7 @@ static const struct ipa_gsi_endpoint_data ipa_gsi_endpoint_data[] = { [IPA_ENDPOINT_MODEM_LAN_RX] = { .ee_id = GSI_EE_MODEM, .channel_id = 3, - .endpoint_id = 13, + .endpoint_id = 11, .toward_ipa = false, }, [IPA_ENDPOINT_MODEM_AP_TX] = { From 636edeaad5577b6023f0de2b98a010d1cea73607 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 11 Jun 2020 14:48:32 -0500 Subject: [PATCH 71/83] net: ipa: program upper nibbles of sequencer type The upper two nibbles of the sequencer type were not used for SDM845, and were assumed to be 0. But for SC7180 they are used, and so they must be programmed by ipa_endpoint_init_seq(). Fix this bug. IPA_SEQ_PKT_PROCESS_NO_DEC_NO_UCP_DMAP doesn't have a descriptive comment, so add one. Signed-off-by: Alex Elder Signed-off-by: David S. 
Miller --- drivers/net/ipa/ipa_endpoint.c | 6 ++++-- drivers/net/ipa/ipa_reg.h | 2 ++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/ipa/ipa_endpoint.c b/drivers/net/ipa/ipa_endpoint.c index 2825dca23ec4..bf3e8ced3ee0 100644 --- a/drivers/net/ipa/ipa_endpoint.c +++ b/drivers/net/ipa/ipa_endpoint.c @@ -699,10 +699,12 @@ static void ipa_endpoint_init_seq(struct ipa_endpoint *endpoint) u32 seq_type = endpoint->seq_type; u32 val = 0; + /* Sequencer type is made up of four nibbles */ val |= u32_encode_bits(seq_type & 0xf, HPS_SEQ_TYPE_FMASK); val |= u32_encode_bits((seq_type >> 4) & 0xf, DPS_SEQ_TYPE_FMASK); - /* HPS_REP_SEQ_TYPE is 0 */ - /* DPS_REP_SEQ_TYPE is 0 */ + /* The second two apply to replicated packets */ + val |= u32_encode_bits((seq_type >> 8) & 0xf, HPS_REP_SEQ_TYPE_FMASK); + val |= u32_encode_bits((seq_type >> 12) & 0xf, DPS_REP_SEQ_TYPE_FMASK); iowrite32(val, endpoint->ipa->reg_virt + offset); } diff --git a/drivers/net/ipa/ipa_reg.h b/drivers/net/ipa/ipa_reg.h index 3b8106aa277a..0a688d8c1d7c 100644 --- a/drivers/net/ipa/ipa_reg.h +++ b/drivers/net/ipa/ipa_reg.h @@ -455,6 +455,8 @@ enum ipa_mode { * second packet processing pass + no decipher + microcontroller * @IPA_SEQ_DMA_DEC: DMA + cipher/decipher * @IPA_SEQ_DMA_COMP_DECOMP: DMA + compression/decompression + * @IPA_SEQ_PKT_PROCESS_NO_DEC_NO_UCP_DMAP: + * packet processing + no decipher + no uCP + HPS REP DMA parser * @IPA_SEQ_INVALID: invalid sequencer type * * The values defined here are broken into 4-bit nibbles that are written From f330fda331d276464baec8ba938d031b4adcf5c7 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 11 Jun 2020 14:48:33 -0500 Subject: [PATCH 72/83] net: ipa: header pad field only valid for AP->modem endpoint Only QMAP endpoints should be configured to find a pad size field within packet headers. They are found in the first byte of the QMAP header (and the hardware fills only the 6 bits in that byte that constitute the pad_len field). The RMNet driver assumes the pad_len field is valid for received packets, so we want to ensure the pad_len field is filled in that case. That driver also assumes the length in the QMAP header includes the pad bytes. The RMNet driver does *not* pad the packets it sends, so the pad_len field can be ignored. Fix ipa_endpoint_init_hdr_ext() so it only marks the pad field offset valid for QMAP RX endpoints, and in that case indicates that the length field in the header includes the pad bytes. Signed-off-by: Alex Elder Signed-off-by: David S. Miller --- drivers/net/ipa/ipa_endpoint.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/net/ipa/ipa_endpoint.c b/drivers/net/ipa/ipa_endpoint.c index bf3e8ced3ee0..9f50d0d11704 100644 --- a/drivers/net/ipa/ipa_endpoint.c +++ b/drivers/net/ipa/ipa_endpoint.c @@ -467,7 +467,7 @@ static void ipa_endpoint_init_hdr(struct ipa_endpoint *endpoint) header_size += sizeof(struct rmnet_map_ul_csum_header); val |= u32_encode_bits(header_size, HDR_LEN_FMASK); - /* Define how to fill mux_id in a received QMAP header */ + /* Define how to fill fields in a received QMAP header */ if (!endpoint->toward_ipa) { u32 off; /* Field offset within header */ @@ -499,10 +499,21 @@ static void ipa_endpoint_init_hdr_ext(struct ipa_endpoint *endpoint) u32 val = 0; val |= HDR_ENDIANNESS_FMASK; /* big endian */ - val |= HDR_TOTAL_LEN_OR_PAD_VALID_FMASK; - /* HDR_TOTAL_LEN_OR_PAD is 0 (pad, not total_len) */ + + /* A QMAP header contains a 6 bit pad field at offset 0. 
The RMNet + * driver assumes this field is meaningful in packets it receives, + * and assumes the header's payload length includes that padding. + * The RMNet driver does *not* pad packets it sends, however, so + * the pad field (although 0) should be ignored. + */ + if (endpoint->data->qmap && !endpoint->toward_ipa) { + val |= HDR_TOTAL_LEN_OR_PAD_VALID_FMASK; + /* HDR_TOTAL_LEN_OR_PAD is 0 (pad, not total_len) */ + val |= HDR_PAYLOAD_LEN_INC_PADDING_FMASK; + /* HDR_TOTAL_LEN_OR_PAD_OFFSET is 0 */ + } + /* HDR_PAYLOAD_LEN_INC_PADDING is 0 */ - /* HDR_TOTAL_LEN_OR_PAD_OFFSET is 0 */ if (!endpoint->toward_ipa) val |= u32_encode_bits(pad_align, HDR_PAD_TO_ALIGNMENT_FMASK); From b65ce380b754e77fbfdcfc83fd6e29c8ceedf431 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 12 Jun 2020 00:16:55 -0700 Subject: [PATCH 73/83] genetlink: clean up family attributes allocations genl_family_rcv_msg_attrs_parse() and genl_family_rcv_msg_attrs_free() take a boolean parameter to determine whether allocate/free the family attrs. This is unnecessary as we can just check family->parallel_ops. More importantly, callers would not need to worry about pairing these parameters correctly after this patch. And this fixes a memory leak, as after commit c36f05559104 ("genetlink: fix memory leaks in genl_family_rcv_msg_dumpit()") we call genl_family_rcv_msg_attrs_parse() for both parallel and non-parallel cases. Fixes: c36f05559104 ("genetlink: fix memory leaks in genl_family_rcv_msg_dumpit()") Reported-by: Ido Schimmel Signed-off-by: Cong Wang Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel Signed-off-by: David S. Miller --- net/netlink/genetlink.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 6c19b91bbb86..55ee680e9db1 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -474,8 +474,7 @@ genl_family_rcv_msg_attrs_parse(const struct genl_family *family, struct netlink_ext_ack *extack, const struct genl_ops *ops, int hdrlen, - enum genl_validate_flags no_strict_flag, - bool parallel) + enum genl_validate_flags no_strict_flag) { enum netlink_validation validate = ops->validate & no_strict_flag ? 
NL_VALIDATE_LIBERAL : @@ -486,7 +485,7 @@ genl_family_rcv_msg_attrs_parse(const struct genl_family *family, if (!family->maxattr) return NULL; - if (parallel) { + if (family->parallel_ops) { attrbuf = kmalloc_array(family->maxattr + 1, sizeof(struct nlattr *), GFP_KERNEL); if (!attrbuf) @@ -498,7 +497,7 @@ genl_family_rcv_msg_attrs_parse(const struct genl_family *family, err = __nlmsg_parse(nlh, hdrlen, attrbuf, family->maxattr, family->policy, validate, extack); if (err) { - if (parallel) + if (family->parallel_ops) kfree(attrbuf); return ERR_PTR(err); } @@ -506,10 +505,9 @@ genl_family_rcv_msg_attrs_parse(const struct genl_family *family, } static void genl_family_rcv_msg_attrs_free(const struct genl_family *family, - struct nlattr **attrbuf, - bool parallel) + struct nlattr **attrbuf) { - if (parallel) + if (family->parallel_ops) kfree(attrbuf); } @@ -537,15 +535,14 @@ static int genl_start(struct netlink_callback *cb) attrs = genl_family_rcv_msg_attrs_parse(ctx->family, ctx->nlh, ctx->extack, ops, ctx->hdrlen, - GENL_DONT_VALIDATE_DUMP_STRICT, - true); + GENL_DONT_VALIDATE_DUMP_STRICT); if (IS_ERR(attrs)) return PTR_ERR(attrs); no_attrs: info = genl_dumpit_info_alloc(); if (!info) { - kfree(attrs); + genl_family_rcv_msg_attrs_free(ctx->family, attrs); return -ENOMEM; } info->family = ctx->family; @@ -562,7 +559,7 @@ static int genl_start(struct netlink_callback *cb) } if (rc) { - kfree(attrs); + genl_family_rcv_msg_attrs_free(info->family, info->attrs); genl_dumpit_info_free(info); cb->data = NULL; } @@ -591,7 +588,7 @@ static int genl_lock_done(struct netlink_callback *cb) rc = ops->done(cb); genl_unlock(); } - genl_family_rcv_msg_attrs_free(info->family, info->attrs, false); + genl_family_rcv_msg_attrs_free(info->family, info->attrs); genl_dumpit_info_free(info); return rc; } @@ -604,7 +601,7 @@ static int genl_parallel_done(struct netlink_callback *cb) if (ops->done) rc = ops->done(cb); - genl_family_rcv_msg_attrs_free(info->family, info->attrs, true); + genl_family_rcv_msg_attrs_free(info->family, info->attrs); genl_dumpit_info_free(info); return rc; } @@ -671,8 +668,7 @@ static int genl_family_rcv_msg_doit(const struct genl_family *family, attrbuf = genl_family_rcv_msg_attrs_parse(family, nlh, extack, ops, hdrlen, - GENL_DONT_VALIDATE_STRICT, - family->parallel_ops); + GENL_DONT_VALIDATE_STRICT); if (IS_ERR(attrbuf)) return PTR_ERR(attrbuf); @@ -698,7 +694,7 @@ static int genl_family_rcv_msg_doit(const struct genl_family *family, family->post_doit(ops, skb, &info); out: - genl_family_rcv_msg_attrs_free(family, attrbuf, family->parallel_ops); + genl_family_rcv_msg_attrs_free(family, attrbuf); return err; } From 6954a9e4192b86d778fb52b525fd7b62d51b1147 Mon Sep 17 00:00:00 2001 From: Thomas Falcon Date: Fri, 12 Jun 2020 13:34:41 -0500 Subject: [PATCH 74/83] ibmvnic: Flush existing work items before device removal Ensure that all scheduled work items have completed before continuing with device removal and after further event scheduling has been halted. This patch fixes a bug where a scheduled driver reset event is processed following device removal. Signed-off-by: Thomas Falcon Signed-off-by: David S. 
Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 197dc5b2c090..1b4d04e4474b 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -5184,6 +5184,9 @@ static int ibmvnic_remove(struct vio_dev *dev) adapter->state = VNIC_REMOVING; spin_unlock_irqrestore(&adapter->state_lock, flags); + flush_work(&adapter->ibmvnic_reset); + flush_delayed_work(&adapter->ibmvnic_delayed_reset); + rtnl_lock(); unregister_netdevice(netdev); From 2c7269b231194aae23fb90ab65842573a91acbc9 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Wed, 10 Jun 2020 12:19:43 +0200 Subject: [PATCH 75/83] bpf: tcp: Recv() should return 0 when the peer socket is closed If the peer is closed, we will never get more data, so tcp_bpf_wait_data will get stuck forever. In case we passed MSG_DONTWAIT to recv(), we get EAGAIN but we should actually get 0. >From man 2 recv: RETURN VALUE When a stream socket peer has performed an orderly shutdown, the return value will be 0 (the traditional "end-of-file" return). This patch makes tcp_bpf_wait_data always return 1 when the peer socket has been shutdown. Either we have data available, and it would have returned 1 anyway, or there isn't, in which case we'll call tcp_recvmsg which does the right thing in this situation. Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Sabrina Dubroca Signed-off-by: Alexei Starovoitov Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/26038a28c21fea5d04d4bd4744c5686d3f2e5504.1591784177.git.sd@queasysnail.net --- net/ipv4/tcp_bpf.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 2b915aafda42..7aa68f4aae6c 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -245,6 +245,9 @@ static int tcp_bpf_wait_data(struct sock *sk, struct sk_psock *psock, DEFINE_WAIT_FUNC(wait, woken_wake_function); int ret = 0; + if (sk->sk_shutdown & RCV_SHUTDOWN) + return 1; + if (!timeo) return ret; From f6fede8569689dd31e7b0ed15024b25e5ce2e2e5 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Thu, 11 Jun 2020 18:25:20 +0100 Subject: [PATCH 76/83] bpf: sockmap: Don't attach programs to UDP sockets The stream parser infrastructure isn't set up to deal with UDP sockets, so we mustn't try to attach programs to them. I remember making this change at some point, but I must have lost it while rebasing or something similar. 
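The key change, visible in the hunks below, is that sock_map_redirect_allowed() now insists on a TCP socket rather than merely a non-listening one:

	static bool sock_map_redirect_allowed(const struct sock *sk)
	{
		return sk_is_tcp(sk) && sk->sk_state != TCP_LISTEN;
	}

which keeps UDP sockets out of the paths that assume stream parser support.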
Fixes: 7b98cd42b049 ("bpf: sockmap: Add UDP support") Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20200611172520.327602-1-lmb@cloudflare.com --- net/core/sock_map.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 17a40a947546..a2dc64de5213 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -424,10 +424,7 @@ static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next) return 0; } -static bool sock_map_redirect_allowed(const struct sock *sk) -{ - return sk->sk_state != TCP_LISTEN; -} +static bool sock_map_redirect_allowed(const struct sock *sk); static int sock_map_update_common(struct bpf_map *map, u32 idx, struct sock *sk, u64 flags) @@ -508,6 +505,11 @@ static bool sk_is_udp(const struct sock *sk) sk->sk_protocol == IPPROTO_UDP; } +static bool sock_map_redirect_allowed(const struct sock *sk) +{ + return sk_is_tcp(sk) && sk->sk_state != TCP_LISTEN; +} + static bool sock_map_sk_is_suitable(const struct sock *sk) { return sk_is_tcp(sk) || sk_is_udp(sk); From 60e5ca8a64bad8f3e2e20a1e57846e497361c700 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Thu, 11 Jun 2020 17:08:57 -0700 Subject: [PATCH 77/83] bpf: Fix memlock accounting for sock_hash Add missed bpf_map_charge_init() in sock_hash_alloc() and correspondingly bpf_map_charge_finish() on ENOMEM. It was found accidentally while working on unrelated selftest that checks "map->memory.pages > 0" is true for all map types. Before: # bpftool m l ... 3692: sockhash name m_sockhash flags 0x0 key 4B value 4B max_entries 8 memlock 0B After: # bpftool m l ... 84: sockmap name m_sockmap flags 0x0 key 4B value 4B max_entries 8 memlock 4096B Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200612000857.2881453-1-rdna@fb.com --- net/core/sock_map.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index a2dc64de5213..4059f94e9bb5 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -991,11 +991,15 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) err = -EINVAL; goto free_htab; } + err = bpf_map_charge_init(&htab->map.memory, cost); + if (err) + goto free_htab; htab->buckets = bpf_map_area_alloc(htab->buckets_num * sizeof(struct bpf_htab_bucket), htab->map.numa_node); if (!htab->buckets) { + bpf_map_charge_finish(&htab->map.memory); err = -ENOMEM; goto free_htab; } From 22eb78792e07a4dfb63c85f34950d4e58eb90326 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 12 Jun 2020 13:16:03 -0700 Subject: [PATCH 78/83] tools/bpftool: Fix skeleton codegen Remove unnecessary check at the end of codegen() routine which makes codegen() to always fail and exit bpftool with error code. Positive value of variable n is not an indicator of a failure. Fixes: 2c4779eff837 ("tools, bpftool: Exit on error in function codegen") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Reviewed-by: Tobias Klauser Link: https://lore.kernel.org/bpf/20200612201603.680852-1-andriin@fb.com --- tools/bpf/bpftool/gen.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index 7443879e87af..10de76b296ba 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -257,8 +257,6 @@ static void codegen(const char *template, ...) 
va_end(args); free(s); - if (n) - exit(-1); } static int do_skeleton(int argc, char **argv) From caf62492f479585296e9d636c798d5ac256b7b04 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 12 Jun 2020 12:45:04 -0700 Subject: [PATCH 79/83] libbpf: Support pre-initializing .bss global variables Remove invalid assumption in libbpf that .bss map doesn't have to be updated in kernel. With addition of skeleton and memory-mapped initialization image, .bss doesn't have to be all zeroes when BPF map is created, because user-code might have initialized those variables from user-space. Fixes: eba9c5f498a1 ("libbpf: Refactor global data map initialization") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200612194504.557844-1-andriin@fb.com --- tools/lib/bpf/libbpf.c | 4 -- .../selftests/bpf/prog_tests/skeleton.c | 45 ++++++++++++++++--- .../selftests/bpf/progs/test_skeleton.c | 19 ++++++-- 3 files changed, 55 insertions(+), 13 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 7f01be2b88b8..477c679ed945 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -3564,10 +3564,6 @@ bpf_object__populate_internal_map(struct bpf_object *obj, struct bpf_map *map) char *cp, errmsg[STRERR_BUFSIZE]; int err, zero = 0; - /* kernel already zero-initializes .bss map. */ - if (map_type == LIBBPF_MAP_BSS) - return 0; - err = bpf_map_update_elem(map->fd, &zero, map->mmaped, 0); if (err) { err = -errno; diff --git a/tools/testing/selftests/bpf/prog_tests/skeleton.c b/tools/testing/selftests/bpf/prog_tests/skeleton.c index 9264a2736018..fa153cf67b1b 100644 --- a/tools/testing/selftests/bpf/prog_tests/skeleton.c +++ b/tools/testing/selftests/bpf/prog_tests/skeleton.c @@ -15,6 +15,8 @@ void test_skeleton(void) int duration = 0, err; struct test_skeleton* skel; struct test_skeleton__bss *bss; + struct test_skeleton__data *data; + struct test_skeleton__rodata *rodata; struct test_skeleton__kconfig *kcfg; skel = test_skeleton__open(); @@ -24,13 +26,45 @@ void test_skeleton(void) if (CHECK(skel->kconfig, "skel_kconfig", "kconfig is mmaped()!\n")) goto cleanup; + bss = skel->bss; + data = skel->data; + rodata = skel->rodata; + + /* validate values are pre-initialized correctly */ + CHECK(data->in1 != -1, "in1", "got %d != exp %d\n", data->in1, -1); + CHECK(data->out1 != -1, "out1", "got %d != exp %d\n", data->out1, -1); + CHECK(data->in2 != -1, "in2", "got %lld != exp %lld\n", data->in2, -1LL); + CHECK(data->out2 != -1, "out2", "got %lld != exp %lld\n", data->out2, -1LL); + + CHECK(bss->in3 != 0, "in3", "got %d != exp %d\n", bss->in3, 0); + CHECK(bss->out3 != 0, "out3", "got %d != exp %d\n", bss->out3, 0); + CHECK(bss->in4 != 0, "in4", "got %lld != exp %lld\n", bss->in4, 0LL); + CHECK(bss->out4 != 0, "out4", "got %lld != exp %lld\n", bss->out4, 0LL); + + CHECK(rodata->in6 != 0, "in6", "got %d != exp %d\n", rodata->in6, 0); + CHECK(bss->out6 != 0, "out6", "got %d != exp %d\n", bss->out6, 0); + + /* validate we can pre-setup global variables, even in .bss */ + data->in1 = 10; + data->in2 = 11; + bss->in3 = 12; + bss->in4 = 13; + rodata->in6 = 14; + err = test_skeleton__load(skel); if (CHECK(err, "skel_load", "failed to load skeleton: %d\n", err)) goto cleanup; - bss = skel->bss; - bss->in1 = 1; - bss->in2 = 2; + /* validate pre-setup values are still there */ + CHECK(data->in1 != 10, "in1", "got %d != exp %d\n", data->in1, 10); + CHECK(data->in2 != 11, "in2", "got %lld != exp %lld\n", data->in2, 11LL); + CHECK(bss->in3 != 12, 
"in3", "got %d != exp %d\n", bss->in3, 12); + CHECK(bss->in4 != 13, "in4", "got %lld != exp %lld\n", bss->in4, 13LL); + CHECK(rodata->in6 != 14, "in6", "got %d != exp %d\n", rodata->in6, 14); + + /* now set new values and attach to get them into outX variables */ + data->in1 = 1; + data->in2 = 2; bss->in3 = 3; bss->in4 = 4; bss->in5.a = 5; @@ -44,14 +78,15 @@ void test_skeleton(void) /* trigger tracepoint */ usleep(1); - CHECK(bss->out1 != 1, "res1", "got %d != exp %d\n", bss->out1, 1); - CHECK(bss->out2 != 2, "res2", "got %lld != exp %d\n", bss->out2, 2); + CHECK(data->out1 != 1, "res1", "got %d != exp %d\n", data->out1, 1); + CHECK(data->out2 != 2, "res2", "got %lld != exp %d\n", data->out2, 2); CHECK(bss->out3 != 3, "res3", "got %d != exp %d\n", (int)bss->out3, 3); CHECK(bss->out4 != 4, "res4", "got %lld != exp %d\n", bss->out4, 4); CHECK(bss->handler_out5.a != 5, "res5", "got %d != exp %d\n", bss->handler_out5.a, 5); CHECK(bss->handler_out5.b != 6, "res6", "got %lld != exp %d\n", bss->handler_out5.b, 6); + CHECK(bss->out6 != 14, "res7", "got %d != exp %d\n", bss->out6, 14); CHECK(bss->bpf_syscall != kcfg->CONFIG_BPF_SYSCALL, "ext1", "got %d != exp %d\n", bss->bpf_syscall, kcfg->CONFIG_BPF_SYSCALL); diff --git a/tools/testing/selftests/bpf/progs/test_skeleton.c b/tools/testing/selftests/bpf/progs/test_skeleton.c index de03a90f78ca..77ae86f44db5 100644 --- a/tools/testing/selftests/bpf/progs/test_skeleton.c +++ b/tools/testing/selftests/bpf/progs/test_skeleton.c @@ -10,16 +10,26 @@ struct s { long long b; } __attribute__((packed)); -int in1 = 0; -long long in2 = 0; +/* .data section */ +int in1 = -1; +long long in2 = -1; + +/* .bss section */ char in3 = '\0'; long long in4 __attribute__((aligned(64))) = 0; struct s in5 = {}; -long long out2 = 0; +/* .rodata section */ +const volatile int in6 = 0; + +/* .data section */ +int out1 = -1; +long long out2 = -1; + +/* .bss section */ char out3 = 0; long long out4 = 0; -int out1 = 0; +int out6 = 0; extern bool CONFIG_BPF_SYSCALL __kconfig; extern int LINUX_KERNEL_VERSION __kconfig; @@ -36,6 +46,7 @@ int handler(const void *ctx) out3 = in3; out4 = in4; out5 = in5; + out6 = in6; bpf_syscall = CONFIG_BPF_SYSCALL; kern_ver = LINUX_KERNEL_VERSION; From 29fcb05bbf1a7008900bb9bee347bdbfc7171036 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 12 Jun 2020 17:21:15 -0700 Subject: [PATCH 80/83] bpf: Undo internal BPF_PROBE_MEM in BPF insns dump BPF_PROBE_MEM is kernel-internal implmementation details. When dumping BPF instructions to user-space, it needs to be replaced back with BPF_MEM mode. 
Fixes: 2a02759ef5f8 ("bpf: Add support for BTF pointers to interpreter") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200613002115.1632142-1-andriin@fb.com --- kernel/bpf/syscall.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4d530b1d5683..e9a3ebc00e08 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3158,6 +3158,7 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog) struct bpf_insn *insns; u32 off, type; u64 imm; + u8 code; int i; insns = kmemdup(prog->insnsi, bpf_prog_insn_size(prog), @@ -3166,21 +3167,27 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog) return insns; for (i = 0; i < prog->len; i++) { - if (insns[i].code == (BPF_JMP | BPF_TAIL_CALL)) { + code = insns[i].code; + + if (code == (BPF_JMP | BPF_TAIL_CALL)) { insns[i].code = BPF_JMP | BPF_CALL; insns[i].imm = BPF_FUNC_tail_call; /* fall-through */ } - if (insns[i].code == (BPF_JMP | BPF_CALL) || - insns[i].code == (BPF_JMP | BPF_CALL_ARGS)) { - if (insns[i].code == (BPF_JMP | BPF_CALL_ARGS)) + if (code == (BPF_JMP | BPF_CALL) || + code == (BPF_JMP | BPF_CALL_ARGS)) { + if (code == (BPF_JMP | BPF_CALL_ARGS)) insns[i].code = BPF_JMP | BPF_CALL; if (!bpf_dump_raw_ok()) insns[i].imm = 0; continue; } + if (BPF_CLASS(code) == BPF_LDX && BPF_MODE(code) == BPF_PROBE_MEM) { + insns[i].code = BPF_LDX | BPF_SIZE(code) | BPF_MEM; + continue; + } - if (insns[i].code != (BPF_LD | BPF_IMM | BPF_DW)) + if (code != (BPF_LD | BPF_IMM | BPF_DW)) continue; imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm; From bf97bac9dc6481e9f68992e52bed5cc4b210e636 Mon Sep 17 00:00:00 2001 From: Liao Pingfang Date: Sat, 13 Jun 2020 14:03:26 +0800 Subject: [PATCH 81/83] net: atm: Remove the error message according to the atomic context Looking into the context (atomic!) and the error message should be dropped. Signed-off-by: Liao Pingfang Signed-off-by: David S. Miller --- net/atm/lec.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/atm/lec.c b/net/atm/lec.c index ca37f5a71f5e..875fc0bc1780 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -1536,10 +1536,8 @@ static struct lec_arp_table *make_entry(struct lec_priv *priv, struct lec_arp_table *to_return; to_return = kzalloc(sizeof(struct lec_arp_table), GFP_ATOMIC); - if (!to_return) { - pr_info("LEC: Arp entry kmalloc failed\n"); + if (!to_return) return NULL; - } ether_addr_copy(to_return->mac_addr, mac_addr); INIT_HLIST_NODE(&to_return->next); timer_setup(&to_return->timer, lec_arp_expire_arp, 0); From 2074f9eaa58795a99e9da61c10f93180f810cfd6 Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Sat, 13 Jun 2020 17:52:59 +0300 Subject: [PATCH 82/83] net: ethernet: ti: am65-cpsw-nuss: fix ale parameters init The ALE parameters structure is created on stack, so it has to be reset before passing to cpsw_ale_create() to avoid garbage values. Fixes: 93a76530316a ("net: ethernet: ti: introduce am65x/j721e gigabit eth subsystem driver") Signed-off-by: Grygorii Strashko Signed-off-by: David S. 
Miller --- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index 87a4775ed53a..1492648247d9 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -1981,7 +1981,7 @@ MODULE_DEVICE_TABLE(of, am65_cpsw_nuss_of_mtable); static int am65_cpsw_nuss_probe(struct platform_device *pdev) { - struct cpsw_ale_params ale_params; + struct cpsw_ale_params ale_params = { 0 }; const struct of_device_id *of_id; struct device *dev = &pdev->dev; struct am65_cpsw_common *common; From bc139119a1708ae3db1ebb379630f286e28d06e8 Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Sat, 13 Jun 2020 17:54:14 +0300 Subject: [PATCH 83/83] net: ethernet: ti: ale: fix allmulti for nu type ale On AM65xx MCU CPSW2G NUSS and 66AK2E/L NUSS allmulti setting does not allow unregistered mcast packets to pass. This happens, because ALE VLAN entries on these SoCs do not contain port masks for reg/unreg mcast packets, but instead store indexes of ALE_VLAN_MASK_MUXx_REG registers which intended for store port masks for reg/unreg mcast packets. This path was missed by commit 9d1f6447274f ("net: ethernet: ti: ale: fix seeing unreg mcast packets with promisc and allmulti disabled"). Hence, fix it by taking into account ALE type in cpsw_ale_set_allmulti(). Fixes: 9d1f6447274f ("net: ethernet: ti: ale: fix seeing unreg mcast packets with promisc and allmulti disabled") Signed-off-by: Grygorii Strashko Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/cpsw_ale.c | 49 ++++++++++++++++++++++++------ 1 file changed, 40 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/ti/cpsw_ale.c b/drivers/net/ethernet/ti/cpsw_ale.c index 8dc6be11b2ff..9ad872bfae3a 100644 --- a/drivers/net/ethernet/ti/cpsw_ale.c +++ b/drivers/net/ethernet/ti/cpsw_ale.c @@ -604,10 +604,44 @@ void cpsw_ale_set_unreg_mcast(struct cpsw_ale *ale, int unreg_mcast_mask, } } +static void cpsw_ale_vlan_set_unreg_mcast(struct cpsw_ale *ale, u32 *ale_entry, + int allmulti) +{ + int unreg_mcast; + + unreg_mcast = + cpsw_ale_get_vlan_unreg_mcast(ale_entry, + ale->vlan_field_bits); + if (allmulti) + unreg_mcast |= ALE_PORT_HOST; + else + unreg_mcast &= ~ALE_PORT_HOST; + cpsw_ale_set_vlan_unreg_mcast(ale_entry, unreg_mcast, + ale->vlan_field_bits); +} + +static void +cpsw_ale_vlan_set_unreg_mcast_idx(struct cpsw_ale *ale, u32 *ale_entry, + int allmulti) +{ + int unreg_mcast; + int idx; + + idx = cpsw_ale_get_vlan_unreg_mcast_idx(ale_entry); + + unreg_mcast = readl(ale->params.ale_regs + ALE_VLAN_MASK_MUX(idx)); + + if (allmulti) + unreg_mcast |= ALE_PORT_HOST; + else + unreg_mcast &= ~ALE_PORT_HOST; + + writel(unreg_mcast, ale->params.ale_regs + ALE_VLAN_MASK_MUX(idx)); +} + void cpsw_ale_set_allmulti(struct cpsw_ale *ale, int allmulti, int port) { u32 ale_entry[ALE_ENTRY_WORDS]; - int unreg_mcast = 0; int type, idx; for (idx = 0; idx < ale->params.ale_entries; idx++) { @@ -624,15 +658,12 @@ void cpsw_ale_set_allmulti(struct cpsw_ale *ale, int allmulti, int port) if (port != -1 && !(vlan_members & BIT(port))) continue; - unreg_mcast = - cpsw_ale_get_vlan_unreg_mcast(ale_entry, - ale->vlan_field_bits); - if (allmulti) - unreg_mcast |= ALE_PORT_HOST; + if (!ale->params.nu_switch_ale) + cpsw_ale_vlan_set_unreg_mcast(ale, ale_entry, allmulti); else - unreg_mcast &= ~ALE_PORT_HOST; - cpsw_ale_set_vlan_unreg_mcast(ale_entry, unreg_mcast, - ale->vlan_field_bits); + 
cpsw_ale_vlan_set_unreg_mcast_idx(ale, ale_entry, + allmulti); + cpsw_ale_write(ale, idx, ale_entry); } }
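To summarize the distinction the two helpers above encode (a standalone model for illustration only: the register offset, the fake register file and the function names here are assumptions, not driver code): classic ALE keeps the unregistered-multicast port mask inline in the VLAN table entry, whereas NU-type ALE stores only an index that selects one of the ALE_VLAN_MASK_MUXx registers holding the real mask, so toggling ALE_PORT_HOST there has to go through a register read-modify-write.

    #include <stdint.h>
    #include <stdbool.h>

    #define ALE_PORT_HOST          0x1
    #define ALE_VLAN_MASK_MUX(i)   (0x90 + 4 * (i))    /* offset is an assumption */

    static uint32_t regs[64];                           /* fake register file */
    static uint32_t reg_read(uint32_t off)  { return regs[off / 4]; }
    static void reg_write(uint32_t off, uint32_t v) { regs[off / 4] = v; }

    static void set_host_unreg_mcast(bool nu_ale, uint32_t *vlan_entry_mask,
                                     unsigned int mux_idx, bool allmulti)
    {
            if (!nu_ale) {
                    /* classic ALE: the mask lives inside the VLAN entry */
                    if (allmulti)
                            *vlan_entry_mask |= ALE_PORT_HOST;
                    else
                            *vlan_entry_mask &= ~ALE_PORT_HOST;
            } else {
                    /* NU ALE: the entry only stores an index; the mask
                     * lives in a separate ALE_VLAN_MASK_MUXx register
                     */
                    uint32_t m = reg_read(ALE_VLAN_MASK_MUX(mux_idx));

                    if (allmulti)
                            m |= ALE_PORT_HOST;
                    else
                            m &= ~ALE_PORT_HOST;
                    reg_write(ALE_VLAN_MASK_MUX(mux_idx), m);
            }
    }

    int main(void)
    {
            uint32_t entry_mask = 0;

            set_host_unreg_mcast(false, &entry_mask, 0, true);  /* classic ALE */
            set_host_unreg_mcast(true,  &entry_mask, 2, true);  /* NU-type ALE */
            return 0;
    }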