From 2871734e85e920503d49b3a8bc0afbe0773b6036 Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Sat, 12 Mar 2016 11:12:59 +0100 Subject: [PATCH 01/18] batman-adv: fix DAT candidate selection (must use vid) Now that DAT is VLAN aware, it must use the VID when computing the DHT address of the candidate nodes where an entry is going to be stored/retrieved. Fixes: be1db4f6615b ("batman-adv: make the Distributed ARP Table vlan aware") Signed-off-by: Antonio Quartulli [sven@narfation.org: fix conflicts with current version] Signed-off-by: Sven Eckelmann Signed-off-by: Marek Lindner --- net/batman-adv/distributed-arp-table.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index e96d7c745b4a..3e6b2624f980 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -568,6 +568,7 @@ static void batadv_choose_next_candidate(struct batadv_priv *bat_priv, * be sent to * @bat_priv: the bat priv with all the soft interface information * @ip_dst: ipv4 to look up in the DHT + * @vid: VLAN identifier * * An originator O is selected if and only if its DHT_ID value is one of three * closest values (from the LEFT, with wrap around if needed) then the hash @@ -576,7 +577,8 @@ static void batadv_choose_next_candidate(struct batadv_priv *bat_priv, * Return: the candidate array of size BATADV_DAT_CANDIDATE_NUM. */ static struct batadv_dat_candidate * -batadv_dat_select_candidates(struct batadv_priv *bat_priv, __be32 ip_dst) +batadv_dat_select_candidates(struct batadv_priv *bat_priv, __be32 ip_dst, + unsigned short vid) { int select; batadv_dat_addr_t last_max = BATADV_DAT_ADDR_MAX, ip_key; @@ -592,7 +594,7 @@ batadv_dat_select_candidates(struct batadv_priv *bat_priv, __be32 ip_dst) return NULL; dat.ip = ip_dst; - dat.vid = 0; + dat.vid = vid; ip_key = (batadv_dat_addr_t)batadv_hash_dat(&dat, BATADV_DAT_ADDR_MAX); @@ -612,6 +614,7 @@ batadv_dat_select_candidates(struct batadv_priv *bat_priv, __be32 ip_dst) * @bat_priv: the bat priv with all the soft interface information * @skb: payload to send * @ip: the DHT key + * @vid: VLAN identifier * @packet_subtype: unicast4addr packet subtype to use * * This function copies the skb with pskb_copy() and is sent as unicast packet @@ -622,7 +625,7 @@ batadv_dat_select_candidates(struct batadv_priv *bat_priv, __be32 ip_dst) */ static bool batadv_dat_send_data(struct batadv_priv *bat_priv, struct sk_buff *skb, __be32 ip, - int packet_subtype) + unsigned short vid, int packet_subtype) { int i; bool ret = false; @@ -631,7 +634,7 @@ static bool batadv_dat_send_data(struct batadv_priv *bat_priv, struct sk_buff *tmp_skb; struct batadv_dat_candidate *cand; - cand = batadv_dat_select_candidates(bat_priv, ip); + cand = batadv_dat_select_candidates(bat_priv, ip, vid); if (!cand) goto out; @@ -1022,7 +1025,7 @@ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv, ret = true; } else { /* Send the request to the DHT */ - ret = batadv_dat_send_data(bat_priv, skb, ip_dst, + ret = batadv_dat_send_data(bat_priv, skb, ip_dst, vid, BATADV_P_DAT_DHT_GET); } out: @@ -1150,8 +1153,8 @@ void batadv_dat_snoop_outgoing_arp_reply(struct batadv_priv *bat_priv, /* Send the ARP reply to the candidates for both the IP addresses that * the node obtained from the ARP reply */ - batadv_dat_send_data(bat_priv, skb, ip_src, BATADV_P_DAT_DHT_PUT); - batadv_dat_send_data(bat_priv, skb, ip_dst, BATADV_P_DAT_DHT_PUT); + batadv_dat_send_data(bat_priv, skb, ip_src, vid, BATADV_P_DAT_DHT_PUT); + batadv_dat_send_data(bat_priv, skb, ip_dst, vid, BATADV_P_DAT_DHT_PUT); } /** From b6cf5d499fddbfcffe751e81fb9f1a07d6348026 Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Thu, 14 Apr 2016 09:37:05 +0800 Subject: [PATCH 02/18] batman-adv: B.A.T.M.A.N V - make sure iface is reactivated upon NETDEV_UP event At the moment there is no explicit reactivation of an hard-interface upon NETDEV_UP event. In case of B.A.T.M.A.N. IV the interface is reactivated as soon as the next OGM is scheduled for sending, but this mechanism does not work with B.A.T.M.A.N. V. The latter does not rely on the same scheduling mechanism as its predecessor and for this reason the hard-interface remains deactivated forever after being brought down once. This patch fixes the reactivation mechanism by adding a new routing API which explicitly allows each algorithm to perform any needed operation upon interface re-activation. Such API is optional and is implemented by B.A.T.M.A.N. V only and it just takes care of setting the iface status to ACTIVE Signed-off-by: Antonio Quartulli Signed-off-by: Marek Lindner --- net/batman-adv/bat_v.c | 12 ++++++++++++ net/batman-adv/hard-interface.c | 3 +++ net/batman-adv/types.h | 3 +++ 3 files changed, 18 insertions(+) diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index 3315b9a598af..4026f198a734 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -32,10 +32,21 @@ #include "bat_v_elp.h" #include "bat_v_ogm.h" +#include "hard-interface.h" #include "hash.h" #include "originator.h" #include "packet.h" +static void batadv_v_iface_activate(struct batadv_hard_iface *hard_iface) +{ + /* B.A.T.M.A.N. V does not use any queuing mechanism, therefore it can + * set the interface as ACTIVE right away, without any risk of race + * condition + */ + if (hard_iface->if_status == BATADV_IF_TO_BE_ACTIVATED) + hard_iface->if_status = BATADV_IF_ACTIVE; +} + static int batadv_v_iface_enable(struct batadv_hard_iface *hard_iface) { int ret; @@ -274,6 +285,7 @@ static bool batadv_v_neigh_is_sob(struct batadv_neigh_node *neigh1, static struct batadv_algo_ops batadv_batman_v __read_mostly = { .name = "BATMAN_V", + .bat_iface_activate = batadv_v_iface_activate, .bat_iface_enable = batadv_v_iface_enable, .bat_iface_disable = batadv_v_iface_disable, .bat_iface_update_mac = batadv_v_iface_update_mac, diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index c61d5b0b24d2..0a7deaf2670a 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -407,6 +407,9 @@ batadv_hardif_activate_interface(struct batadv_hard_iface *hard_iface) batadv_update_min_mtu(hard_iface->soft_iface); + if (bat_priv->bat_algo_ops->bat_iface_activate) + bat_priv->bat_algo_ops->bat_iface_activate(hard_iface); + out: if (primary_if) batadv_hardif_put(primary_if); diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 9abfb3e73c34..443e9b84e07d 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -1250,6 +1250,8 @@ struct batadv_forw_packet { * struct batadv_algo_ops - mesh algorithm callbacks * @list: list node for the batadv_algo_list * @name: name of the algorithm + * @bat_iface_activate: start routing mechanisms when hard-interface is brought + * up * @bat_iface_enable: init routing info when hard-interface is enabled * @bat_iface_disable: de-init routing info when hard-interface is disabled * @bat_iface_update_mac: (re-)init mac addresses of the protocol information @@ -1277,6 +1279,7 @@ struct batadv_forw_packet { struct batadv_algo_ops { struct hlist_node list; char *name; + void (*bat_iface_activate)(struct batadv_hard_iface *hard_iface); int (*bat_iface_enable)(struct batadv_hard_iface *hard_iface); void (*bat_iface_disable)(struct batadv_hard_iface *hard_iface); void (*bat_iface_update_mac)(struct batadv_hard_iface *hard_iface); From a33d970d0b54b09746d5540af8271fad4eb10229 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Fri, 11 Mar 2016 16:44:05 +0100 Subject: [PATCH 03/18] batman-adv: Fix reference counting of vlan object for tt_local_entry The batadv_tt_local_entry was specific to a batadv_softif_vlan and held an implicit reference to it. But this reference was never stored in form of a pointer in the tt_local_entry itself. Instead batadv_tt_local_remove, batadv_tt_local_table_free and batadv_tt_local_purge_pending_clients depend on a consistent state of bat_priv->softif_vlan_list and that batadv_softif_vlan_get always returns the batadv_softif_vlan object which it has a reference for. But batadv_softif_vlan_get cannot guarantee that because it is working only with rcu_read_lock on this list. It can therefore happen that an vid is in this list twice or that batadv_softif_vlan_get cannot find the batadv_softif_vlan for an vid due to some other list operations taking place at the same time. Instead add a batadv_softif_vlan pointer directly in batadv_tt_local_entry which will be used for the reference counter decremented on release of batadv_tt_local_entry. Fixes: 35df3b298fc8 ("batman-adv: fix TT VLAN inconsistency on VLAN re-add") Signed-off-by: Sven Eckelmann Acked-by: Antonio Quartulli Signed-off-by: Marek Lindner Signed-off-by: Antonio Quartulli --- net/batman-adv/translation-table.c | 42 +++--------------------------- net/batman-adv/types.h | 2 ++ 2 files changed, 6 insertions(+), 38 deletions(-) diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 0b43e86328a5..9b4551a86535 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -215,6 +215,8 @@ static void batadv_tt_local_entry_release(struct kref *ref) tt_local_entry = container_of(ref, struct batadv_tt_local_entry, common.refcount); + batadv_softif_vlan_put(tt_local_entry->vlan); + kfree_rcu(tt_local_entry, common.rcu); } @@ -673,6 +675,7 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr, kref_get(&tt_local->common.refcount); tt_local->last_seen = jiffies; tt_local->common.added_at = tt_local->last_seen; + tt_local->vlan = vlan; /* the batman interface mac and multicast addresses should never be * purged @@ -991,7 +994,6 @@ int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset) struct batadv_tt_common_entry *tt_common_entry; struct batadv_tt_local_entry *tt_local; struct batadv_hard_iface *primary_if; - struct batadv_softif_vlan *vlan; struct hlist_head *head; unsigned short vid; u32 i; @@ -1027,14 +1029,6 @@ int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset) last_seen_msecs = last_seen_msecs % 1000; no_purge = tt_common_entry->flags & np_flag; - - vlan = batadv_softif_vlan_get(bat_priv, vid); - if (!vlan) { - seq_printf(seq, "Cannot retrieve VLAN %d\n", - BATADV_PRINT_VID(vid)); - continue; - } - seq_printf(seq, " * %pM %4i [%c%c%c%c%c%c] %3u.%03u (%#.8x)\n", tt_common_entry->addr, @@ -1052,9 +1046,7 @@ int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset) BATADV_TT_CLIENT_ISOLA) ? 'I' : '.'), no_purge ? 0 : last_seen_secs, no_purge ? 0 : last_seen_msecs, - vlan->tt.crc); - - batadv_softif_vlan_put(vlan); + tt_local->vlan->tt.crc); } rcu_read_unlock(); } @@ -1099,7 +1091,6 @@ u16 batadv_tt_local_remove(struct batadv_priv *bat_priv, const u8 *addr, { struct batadv_tt_local_entry *tt_local_entry; u16 flags, curr_flags = BATADV_NO_FLAGS; - struct batadv_softif_vlan *vlan; void *tt_entry_exists; tt_local_entry = batadv_tt_local_hash_find(bat_priv, addr, vid); @@ -1139,14 +1130,6 @@ u16 batadv_tt_local_remove(struct batadv_priv *bat_priv, const u8 *addr, /* extra call to free the local tt entry */ batadv_tt_local_entry_put(tt_local_entry); - /* decrease the reference held for this vlan */ - vlan = batadv_softif_vlan_get(bat_priv, vid); - if (!vlan) - goto out; - - batadv_softif_vlan_put(vlan); - batadv_softif_vlan_put(vlan); - out: if (tt_local_entry) batadv_tt_local_entry_put(tt_local_entry); @@ -1219,7 +1202,6 @@ static void batadv_tt_local_table_free(struct batadv_priv *bat_priv) spinlock_t *list_lock; /* protects write access to the hash lists */ struct batadv_tt_common_entry *tt_common_entry; struct batadv_tt_local_entry *tt_local; - struct batadv_softif_vlan *vlan; struct hlist_node *node_tmp; struct hlist_head *head; u32 i; @@ -1241,14 +1223,6 @@ static void batadv_tt_local_table_free(struct batadv_priv *bat_priv) struct batadv_tt_local_entry, common); - /* decrease the reference held for this vlan */ - vlan = batadv_softif_vlan_get(bat_priv, - tt_common_entry->vid); - if (vlan) { - batadv_softif_vlan_put(vlan); - batadv_softif_vlan_put(vlan); - } - batadv_tt_local_entry_put(tt_local); } spin_unlock_bh(list_lock); @@ -3309,7 +3283,6 @@ static void batadv_tt_local_purge_pending_clients(struct batadv_priv *bat_priv) struct batadv_hashtable *hash = bat_priv->tt.local_hash; struct batadv_tt_common_entry *tt_common; struct batadv_tt_local_entry *tt_local; - struct batadv_softif_vlan *vlan; struct hlist_node *node_tmp; struct hlist_head *head; spinlock_t *list_lock; /* protects write access to the hash lists */ @@ -3339,13 +3312,6 @@ static void batadv_tt_local_purge_pending_clients(struct batadv_priv *bat_priv) struct batadv_tt_local_entry, common); - /* decrease the reference held for this vlan */ - vlan = batadv_softif_vlan_get(bat_priv, tt_common->vid); - if (vlan) { - batadv_softif_vlan_put(vlan); - batadv_softif_vlan_put(vlan); - } - batadv_tt_local_entry_put(tt_local); } spin_unlock_bh(list_lock); diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 443e9b84e07d..65afd090ab3e 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -1073,10 +1073,12 @@ struct batadv_tt_common_entry { * struct batadv_tt_local_entry - translation table local entry data * @common: general translation table data * @last_seen: timestamp used for purging stale tt local entries + * @vlan: soft-interface vlan of the entry */ struct batadv_tt_local_entry { struct batadv_tt_common_entry common; unsigned long last_seen; + struct batadv_softif_vlan *vlan; }; /** From abe59c65225ccd63a5964e2f2a73dd2995b948e7 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Fri, 11 Mar 2016 16:44:06 +0100 Subject: [PATCH 04/18] batman-adv: Fix reference counting of hardif_neigh_node object for neigh_node The batadv_neigh_node was specific to a batadv_hardif_neigh_node and held an implicit reference to it. But this reference was never stored in form of a pointer in the batadv_neigh_node itself. Instead batadv_neigh_node_release depends on a consistent state of hard_iface->neigh_list and that batadv_hardif_neigh_get always returns the batadv_hardif_neigh_node object which it has a reference for. But batadv_hardif_neigh_get cannot guarantee that because it is working only with rcu_read_lock on this list. It can therefore happen that a neigh_addr is in this list twice or that batadv_hardif_neigh_get cannot find the batadv_hardif_neigh_node for an neigh_addr due to some other list operations taking place at the same time. Instead add a batadv_hardif_neigh_node pointer directly in batadv_neigh_node which will be used for the reference counter decremented on release of batadv_neigh_node. Fixes: cef63419f7db ("batman-adv: add list of unique single hop neighbors per hard-interface") Signed-off-by: Sven Eckelmann Signed-off-by: Marek Lindner Signed-off-by: Antonio Quartulli --- net/batman-adv/originator.c | 16 +++++----------- net/batman-adv/types.h | 2 ++ 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index d52f67a0c057..c355a824713c 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -250,7 +250,6 @@ static void batadv_neigh_node_release(struct kref *ref) { struct hlist_node *node_tmp; struct batadv_neigh_node *neigh_node; - struct batadv_hardif_neigh_node *hardif_neigh; struct batadv_neigh_ifinfo *neigh_ifinfo; struct batadv_algo_ops *bao; @@ -262,13 +261,7 @@ static void batadv_neigh_node_release(struct kref *ref) batadv_neigh_ifinfo_put(neigh_ifinfo); } - hardif_neigh = batadv_hardif_neigh_get(neigh_node->if_incoming, - neigh_node->addr); - if (hardif_neigh) { - /* batadv_hardif_neigh_get() increases refcount too */ - batadv_hardif_neigh_put(hardif_neigh); - batadv_hardif_neigh_put(hardif_neigh); - } + batadv_hardif_neigh_put(neigh_node->hardif_neigh); if (bao->bat_neigh_free) bao->bat_neigh_free(neigh_node); @@ -665,6 +658,10 @@ batadv_neigh_node_new(struct batadv_orig_node *orig_node, neigh_node->orig_node = orig_node; neigh_node->last_seen = jiffies; + /* increment unique neighbor refcount */ + kref_get(&hardif_neigh->refcount); + neigh_node->hardif_neigh = hardif_neigh; + /* extra reference for return */ kref_init(&neigh_node->refcount); kref_get(&neigh_node->refcount); @@ -673,9 +670,6 @@ batadv_neigh_node_new(struct batadv_orig_node *orig_node, hlist_add_head_rcu(&neigh_node->list, &orig_node->neigh_list); spin_unlock_bh(&orig_node->neigh_list_lock); - /* increment unique neighbor refcount */ - kref_get(&hardif_neigh->refcount); - batadv_dbg(BATADV_DBG_BATMAN, orig_node->bat_priv, "Creating new neighbor %pM for orig_node %pM on interface %s\n", neigh_addr, orig_node->orig, hard_iface->net_dev->name); diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 65afd090ab3e..1e47fbe8bb7b 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -433,6 +433,7 @@ struct batadv_hardif_neigh_node { * @ifinfo_lock: lock protecting private ifinfo members and list * @if_incoming: pointer to incoming hard-interface * @last_seen: when last packet via this neighbor was received + * @hardif_neigh: hardif_neigh of this neighbor * @refcount: number of contexts the object is used * @rcu: struct used for freeing in an RCU-safe manner */ @@ -444,6 +445,7 @@ struct batadv_neigh_node { spinlock_t ifinfo_lock; /* protects ifinfo_list and its members */ struct batadv_hard_iface *if_incoming; unsigned long last_seen; + struct batadv_hardif_neigh_node *hardif_neigh; struct kref refcount; struct rcu_head rcu; }; From 6071bd1aa13ed9e41824bafad845b7b7f4df5cfd Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Mon, 2 May 2016 12:20:15 -0400 Subject: [PATCH 05/18] netem: Segment GSO packets on enqueue This was recently reported to me, and reproduced on the latest net kernel, when attempting to run netperf from a host that had a netem qdisc attached to the egress interface: [ 788.073771] ---------------------[ cut here ]--------------------------- [ 788.096716] WARNING: at net/core/dev.c:2253 skb_warn_bad_offload+0xcd/0xda() [ 788.129521] bnx2: caps=(0x00000001801949b3, 0x0000000000000000) len=2962 data_len=0 gso_size=1448 gso_type=1 ip_summed=3 [ 788.182150] Modules linked in: sch_netem kvm_amd kvm crc32_pclmul ipmi_ssif ghash_clmulni_intel sp5100_tco amd64_edac_mod aesni_intel lrw gf128mul glue_helper ablk_helper edac_mce_amd cryptd pcspkr sg edac_core hpilo ipmi_si i2c_piix4 k10temp fam15h_power hpwdt ipmi_msghandler shpchp acpi_power_meter pcc_cpufreq nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs libcrc32c sd_mod crc_t10dif crct10dif_generic mgag200 syscopyarea sysfillrect sysimgblt i2c_algo_bit drm_kms_helper ahci ata_generic pata_acpi ttm libahci crct10dif_pclmul pata_atiixp tg3 libata crct10dif_common drm crc32c_intel ptp serio_raw bnx2 r8169 hpsa pps_core i2c_core mii dm_mirror dm_region_hash dm_log dm_mod [ 788.465294] CPU: 16 PID: 0 Comm: swapper/16 Tainted: G W ------------ 3.10.0-327.el7.x86_64 #1 [ 788.511521] Hardware name: HP ProLiant DL385p Gen8, BIOS A28 12/17/2012 [ 788.542260] ffff880437c036b8 f7afc56532a53db9 ffff880437c03670 ffffffff816351f1 [ 788.576332] ffff880437c036a8 ffffffff8107b200 ffff880633e74200 ffff880231674000 [ 788.611943] 0000000000000001 0000000000000003 0000000000000000 ffff880437c03710 [ 788.647241] Call Trace: [ 788.658817] [] dump_stack+0x19/0x1b [ 788.686193] [] warn_slowpath_common+0x70/0xb0 [ 788.713803] [] warn_slowpath_fmt+0x5c/0x80 [ 788.741314] [] ? ___ratelimit+0x93/0x100 [ 788.767018] [] skb_warn_bad_offload+0xcd/0xda [ 788.796117] [] skb_checksum_help+0x17c/0x190 [ 788.823392] [] netem_enqueue+0x741/0x7c0 [sch_netem] [ 788.854487] [] dev_queue_xmit+0x2a8/0x570 [ 788.880870] [] ip_finish_output+0x53d/0x7d0 ... The problem occurs because netem is not prepared to handle GSO packets (as it uses skb_checksum_help in its enqueue path, which cannot manipulate these frames). The solution I think is to simply segment the skb in a simmilar fashion to the way we do in __dev_queue_xmit (via validate_xmit_skb), with some minor changes. When we decide to corrupt an skb, if the frame is GSO, we segment it, corrupt the first segment, and enqueue the remaining ones. tested successfully by myself on the latest net kernel, to which this applies Signed-off-by: Neil Horman CC: Jamal Hadi Salim CC: "David S. Miller" CC: netem@lists.linux-foundation.org CC: eric.dumazet@gmail.com CC: stephen@networkplumber.org Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_netem.c | 61 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 59 insertions(+), 2 deletions(-) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 9640bb39a5d2..4befe97a9034 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -395,6 +395,25 @@ static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch) sch->q.qlen++; } +/* netem can't properly corrupt a megapacket (like we get from GSO), so instead + * when we statistically choose to corrupt one, we instead segment it, returning + * the first packet to be corrupted, and re-enqueue the remaining frames + */ +static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch) +{ + struct sk_buff *segs; + netdev_features_t features = netif_skb_features(skb); + + segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); + + if (IS_ERR_OR_NULL(segs)) { + qdisc_reshape_fail(skb, sch); + return NULL; + } + consume_skb(skb); + return segs; +} + /* * Insert one skb into qdisc. * Note: parent depends on return value to account for queue length. @@ -407,7 +426,11 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) /* We don't fill cb now as skb_unshare() may invalidate it */ struct netem_skb_cb *cb; struct sk_buff *skb2; + struct sk_buff *segs = NULL; + unsigned int len = 0, last_len, prev_len = qdisc_pkt_len(skb); + int nb = 0; int count = 1; + int rc = NET_XMIT_SUCCESS; /* Random duplication */ if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor)) @@ -453,10 +476,23 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) * do it now in software before we mangle it. */ if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) { + if (skb_is_gso(skb)) { + segs = netem_segment(skb, sch); + if (!segs) + return NET_XMIT_DROP; + } else { + segs = skb; + } + + skb = segs; + segs = segs->next; + if (!(skb = skb_unshare(skb, GFP_ATOMIC)) || (skb->ip_summed == CHECKSUM_PARTIAL && - skb_checksum_help(skb))) - return qdisc_drop(skb, sch); + skb_checksum_help(skb))) { + rc = qdisc_drop(skb, sch); + goto finish_segs; + } skb->data[prandom_u32() % skb_headlen(skb)] ^= 1<<(prandom_u32() % 8); @@ -516,6 +552,27 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) sch->qstats.requeues++; } +finish_segs: + if (segs) { + while (segs) { + skb2 = segs->next; + segs->next = NULL; + qdisc_skb_cb(segs)->pkt_len = segs->len; + last_len = segs->len; + rc = qdisc_enqueue(segs, sch); + if (rc != NET_XMIT_SUCCESS) { + if (net_xmit_drop_count(rc)) + qdisc_qstats_drop(sch); + } else { + nb++; + len += last_len; + } + segs = skb2; + } + sch->q.qlen += nb; + if (nb > 1) + qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len); + } return NET_XMIT_SUCCESS; } From 5f8a02a441b861fd3b1b5135a1a6e1c13ee4bb33 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Sun, 1 May 2016 22:59:54 +0300 Subject: [PATCH 06/18] net/mlx5: Unmap only the relevant IO memory mapping When freeing UAR the driver tries to unmap uar->map and uar->bf_map which are mutually exclusive thus always unmapping a NULL pointer. Make sure we only call iounmap() once, for the actual mapping. Fixes: 0ba422410bbf ('net/mlx5: Fix global UAR mapping') Signed-off-by: Gal Pressman Reported-by: Doron Tsur Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/uar.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/uar.c b/drivers/net/ethernet/mellanox/mlx5/core/uar.c index 8ba080e441a1..5ff8af472bf5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/uar.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/uar.c @@ -269,8 +269,10 @@ EXPORT_SYMBOL(mlx5_alloc_map_uar); void mlx5_unmap_free_uar(struct mlx5_core_dev *mdev, struct mlx5_uar *uar) { - iounmap(uar->map); - iounmap(uar->bf_map); + if (uar->map) + iounmap(uar->map); + else + iounmap(uar->bf_map); mlx5_cmd_free_uar(mdev, uar->index); } EXPORT_SYMBOL(mlx5_unmap_free_uar); From 69976fb1045850a742deb9790ea49cbc6f497531 Mon Sep 17 00:00:00 2001 From: Matthew Finlay Date: Sun, 1 May 2016 22:59:55 +0300 Subject: [PATCH 07/18] net/mlx5: Kconfig: Fix MLX5_EN/VXLAN build issue When MLX5_EN=y MLX5_CORE=y and VXLAN=m there is a linker error for vxlan_get_rx_port() due to the fact that VXLAN is a module. Change Kconfig to select VXLAN when MLX5_CORE=y. When MLX5_CORE=m there is no dependency on the value of VXLAN. Fixes: b3f63c3d5e2c ('net/mlx5e: Add netdev support for VXLAN tunneling') Signed-off-by: Matthew Finlay Reported-by: Arnd Bergmann Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig index 1cf722eba607..559d11a443bc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig +++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig @@ -14,6 +14,7 @@ config MLX5_CORE_EN bool "Mellanox Technologies ConnectX-4 Ethernet support" depends on NETDEVICES && ETHERNET && PCI && MLX5_CORE select PTP_1588_CLOCK + select VXLAN if MLX5_CORE=y default n ---help--- Ethernet support in Mellanox Technologies ConnectX-4 NIC. From 7bb2975599210097115021e542b6137781a09588 Mon Sep 17 00:00:00 2001 From: Matthew Finlay Date: Sun, 1 May 2016 22:59:56 +0300 Subject: [PATCH 08/18] net/mlx5e: Implement a mlx5e workqueue Implement a mlx5e workqueue to handle all mlx5e specific tasks. Move all tasks currently using the system workqueue to the new workqueue. This is in preparation for vxlan using the mlx5e workqueue in order to schedule port add/remove operations. Signed-off-by: Matthew Finlay Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 1 + .../net/ethernet/mellanox/mlx5/core/en_main.c | 30 ++++++++++++------- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index e80ce94b5dcf..3881dce0cc30 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -567,6 +567,7 @@ struct mlx5e_priv { struct mlx5e_vxlan_db vxlan; struct mlx5e_params params; + struct workqueue_struct *wq; struct work_struct update_carrier_work; struct work_struct set_rx_mode_work; struct delayed_work update_stats_work; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 67d548b70e14..9ab08419c557 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -262,9 +262,8 @@ static void mlx5e_update_stats_work(struct work_struct *work) mutex_lock(&priv->state_lock); if (test_bit(MLX5E_STATE_OPENED, &priv->state)) { mlx5e_update_stats(priv); - schedule_delayed_work(dwork, - msecs_to_jiffies( - MLX5E_UPDATE_STATS_INTERVAL)); + queue_delayed_work(priv->wq, dwork, + msecs_to_jiffies(MLX5E_UPDATE_STATS_INTERVAL)); } mutex_unlock(&priv->state_lock); } @@ -280,7 +279,7 @@ static void mlx5e_async_event(struct mlx5_core_dev *mdev, void *vpriv, switch (event) { case MLX5_DEV_EVENT_PORT_UP: case MLX5_DEV_EVENT_PORT_DOWN: - schedule_work(&priv->update_carrier_work); + queue_work(priv->wq, &priv->update_carrier_work); break; default: @@ -1505,7 +1504,7 @@ int mlx5e_open_locked(struct net_device *netdev) mlx5e_update_carrier(priv); mlx5e_timestamp_init(priv); - schedule_delayed_work(&priv->update_stats_work, 0); + queue_delayed_work(priv->wq, &priv->update_stats_work, 0); return 0; @@ -1961,7 +1960,7 @@ static void mlx5e_set_rx_mode(struct net_device *dev) { struct mlx5e_priv *priv = netdev_priv(dev); - schedule_work(&priv->set_rx_mode_work); + queue_work(priv->wq, &priv->set_rx_mode_work); } static int mlx5e_set_mac(struct net_device *netdev, void *addr) @@ -1976,7 +1975,7 @@ static int mlx5e_set_mac(struct net_device *netdev, void *addr) ether_addr_copy(netdev->dev_addr, saddr->sa_data); netif_addr_unlock_bh(netdev); - schedule_work(&priv->set_rx_mode_work); + queue_work(priv->wq, &priv->set_rx_mode_work); return 0; } @@ -2498,10 +2497,14 @@ static void *mlx5e_create_netdev(struct mlx5_core_dev *mdev) priv = netdev_priv(netdev); + priv->wq = create_singlethread_workqueue("mlx5e"); + if (!priv->wq) + goto err_free_netdev; + err = mlx5_alloc_map_uar(mdev, &priv->cq_uar, false); if (err) { mlx5_core_err(mdev, "alloc_map uar failed, %d\n", err); - goto err_free_netdev; + goto err_destroy_wq; } err = mlx5_core_alloc_pd(mdev, &priv->pdn); @@ -2580,7 +2583,7 @@ static void *mlx5e_create_netdev(struct mlx5_core_dev *mdev) vxlan_get_rx_port(netdev); mlx5e_enable_async_events(priv); - schedule_work(&priv->set_rx_mode_work); + queue_work(priv->wq, &priv->set_rx_mode_work); return priv; @@ -2617,6 +2620,9 @@ static void *mlx5e_create_netdev(struct mlx5_core_dev *mdev) err_unmap_free_uar: mlx5_unmap_free_uar(mdev, &priv->cq_uar); +err_destroy_wq: + destroy_workqueue(priv->wq); + err_free_netdev: free_netdev(netdev); @@ -2630,9 +2636,9 @@ static void mlx5e_destroy_netdev(struct mlx5_core_dev *mdev, void *vpriv) set_bit(MLX5E_STATE_DESTROYING, &priv->state); - schedule_work(&priv->set_rx_mode_work); + queue_work(priv->wq, &priv->set_rx_mode_work); mlx5e_disable_async_events(priv); - flush_scheduled_work(); + flush_workqueue(priv->wq); if (test_bit(MLX5_INTERFACE_STATE_SHUTDOWN, &mdev->intf_state)) { netif_device_detach(netdev); mutex_lock(&priv->state_lock); @@ -2655,6 +2661,8 @@ static void mlx5e_destroy_netdev(struct mlx5_core_dev *mdev, void *vpriv) mlx5_core_dealloc_transport_domain(priv->mdev, priv->tdn); mlx5_core_dealloc_pd(priv->mdev, priv->pdn); mlx5_unmap_free_uar(priv->mdev, &priv->cq_uar); + cancel_delayed_work_sync(&priv->update_stats_work); + destroy_workqueue(priv->wq); if (!test_bit(MLX5_INTERFACE_STATE_SHUTDOWN, &mdev->intf_state)) free_netdev(netdev); From d8cf2dda3de6e6293fb01539fb4e180a7ab42afd Mon Sep 17 00:00:00 2001 From: Matthew Finlay Date: Sun, 1 May 2016 22:59:57 +0300 Subject: [PATCH 09/18] net/mlx5e: Use workqueue for vxlan ops The vxlan add/delete port NDOs are called under rcu lock. The current mlx5e implementation can potentially block in these calls, which is not allowed. Move to using the mlx5e workqueue to handle these NDOs. Fixes: b3f63c3d5e2c ('net/mlx5e: Add netdev support for VXLAN tunneling') Signed-off-by: Matthew Finlay Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlx5/core/en_main.c | 4 +- .../net/ethernet/mellanox/mlx5/core/vxlan.c | 50 ++++++++++++++----- .../net/ethernet/mellanox/mlx5/core/vxlan.h | 11 +++- 3 files changed, 49 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 9ab08419c557..d4dfc5ce516a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -2157,7 +2157,7 @@ static void mlx5e_add_vxlan_port(struct net_device *netdev, if (!mlx5e_vxlan_allowed(priv->mdev)) return; - mlx5e_vxlan_add_port(priv, be16_to_cpu(port)); + mlx5e_vxlan_queue_work(priv, sa_family, be16_to_cpu(port), 1); } static void mlx5e_del_vxlan_port(struct net_device *netdev, @@ -2168,7 +2168,7 @@ static void mlx5e_del_vxlan_port(struct net_device *netdev, if (!mlx5e_vxlan_allowed(priv->mdev)) return; - mlx5e_vxlan_del_port(priv, be16_to_cpu(port)); + mlx5e_vxlan_queue_work(priv, sa_family, be16_to_cpu(port), 0); } static netdev_features_t mlx5e_vxlan_features_check(struct mlx5e_priv *priv, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c index 9f10df25f3cd..f2fd1ef16da7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c @@ -95,21 +95,22 @@ struct mlx5e_vxlan *mlx5e_vxlan_lookup_port(struct mlx5e_priv *priv, u16 port) return vxlan; } -int mlx5e_vxlan_add_port(struct mlx5e_priv *priv, u16 port) +static void mlx5e_vxlan_add_port(struct work_struct *work) { + struct mlx5e_vxlan_work *vxlan_work = + container_of(work, struct mlx5e_vxlan_work, work); + struct mlx5e_priv *priv = vxlan_work->priv; struct mlx5e_vxlan_db *vxlan_db = &priv->vxlan; + u16 port = vxlan_work->port; struct mlx5e_vxlan *vxlan; int err; - err = mlx5e_vxlan_core_add_port_cmd(priv->mdev, port); - if (err) - return err; + if (mlx5e_vxlan_core_add_port_cmd(priv->mdev, port)) + goto free_work; vxlan = kzalloc(sizeof(*vxlan), GFP_KERNEL); - if (!vxlan) { - err = -ENOMEM; + if (!vxlan) goto err_delete_port; - } vxlan->udp_port = port; @@ -119,13 +120,14 @@ int mlx5e_vxlan_add_port(struct mlx5e_priv *priv, u16 port) if (err) goto err_free; - return 0; + goto free_work; err_free: kfree(vxlan); err_delete_port: mlx5e_vxlan_core_del_port_cmd(priv->mdev, port); - return err; +free_work: + kfree(vxlan_work); } static void __mlx5e_vxlan_core_del_port(struct mlx5e_priv *priv, u16 port) @@ -145,12 +147,36 @@ static void __mlx5e_vxlan_core_del_port(struct mlx5e_priv *priv, u16 port) kfree(vxlan); } -void mlx5e_vxlan_del_port(struct mlx5e_priv *priv, u16 port) +static void mlx5e_vxlan_del_port(struct work_struct *work) { - if (!mlx5e_vxlan_lookup_port(priv, port)) - return; + struct mlx5e_vxlan_work *vxlan_work = + container_of(work, struct mlx5e_vxlan_work, work); + struct mlx5e_priv *priv = vxlan_work->priv; + u16 port = vxlan_work->port; __mlx5e_vxlan_core_del_port(priv, port); + + kfree(vxlan_work); +} + +void mlx5e_vxlan_queue_work(struct mlx5e_priv *priv, sa_family_t sa_family, + u16 port, int add) +{ + struct mlx5e_vxlan_work *vxlan_work; + + vxlan_work = kmalloc(sizeof(*vxlan_work), GFP_ATOMIC); + if (!vxlan_work) + return; + + if (add) + INIT_WORK(&vxlan_work->work, mlx5e_vxlan_add_port); + else + INIT_WORK(&vxlan_work->work, mlx5e_vxlan_del_port); + + vxlan_work->priv = priv; + vxlan_work->port = port; + vxlan_work->sa_family = sa_family; + queue_work(priv->wq, &vxlan_work->work); } void mlx5e_vxlan_cleanup(struct mlx5e_priv *priv) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.h b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.h index a01685056ab1..129f3527aa14 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.h @@ -39,6 +39,13 @@ struct mlx5e_vxlan { u16 udp_port; }; +struct mlx5e_vxlan_work { + struct work_struct work; + struct mlx5e_priv *priv; + sa_family_t sa_family; + u16 port; +}; + static inline bool mlx5e_vxlan_allowed(struct mlx5_core_dev *mdev) { return (MLX5_CAP_ETH(mdev, tunnel_stateless_vxlan) && @@ -46,8 +53,8 @@ static inline bool mlx5e_vxlan_allowed(struct mlx5_core_dev *mdev) } void mlx5e_vxlan_init(struct mlx5e_priv *priv); -int mlx5e_vxlan_add_port(struct mlx5e_priv *priv, u16 port); -void mlx5e_vxlan_del_port(struct mlx5e_priv *priv, u16 port); +void mlx5e_vxlan_queue_work(struct mlx5e_priv *priv, sa_family_t sa_family, + u16 port, int add); struct mlx5e_vxlan *mlx5e_vxlan_lookup_port(struct mlx5e_priv *priv, u16 port); void mlx5e_vxlan_cleanup(struct mlx5e_priv *priv); From 6dd745425807dc977bbea810ef703b935002fcc3 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Sat, 30 Apr 2016 23:35:11 +0300 Subject: [PATCH 10/18] pxa168_eth: fix mdiobus_scan() error check Since mdiobus_scan() returns either an error code or NULL on error, the driver should check for both, not only for NULL, otherwise a crash is imminent... Reported-by: Arnd Bergmann Signed-off-by: Sergei Shtylyov Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/pxa168_eth.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/marvell/pxa168_eth.c b/drivers/net/ethernet/marvell/pxa168_eth.c index 7ace07dad6a3..c442f6ad15ff 100644 --- a/drivers/net/ethernet/marvell/pxa168_eth.c +++ b/drivers/net/ethernet/marvell/pxa168_eth.c @@ -979,6 +979,8 @@ static int pxa168_init_phy(struct net_device *dev) return 0; pep->phy = mdiobus_scan(pep->smi_bus, pep->phy_addr); + if (IS_ERR(pep->phy)) + return PTR_ERR(pep->phy); if (!pep->phy) return -ENODEV; From ce24c2b8a904753701fe4df313b4cbc2b0649e3e Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Sun, 1 May 2016 01:47:36 +0300 Subject: [PATCH 11/18] macb: fix mdiobus_scan() error check Now mdiobus_scan() returns ERR_PTR(-ENODEV) instead of NULL if the PHY device ID was read as all ones. As this was not an error before, this value should be filtered out now in this driver. Fixes: b74766a0a0fe ("phylib: don't return NULL from get_phy_device()") Signed-off-by: Sergei Shtylyov Reviewed-by: Florian Fainelli Acked-by: Nicolas Ferre Signed-off-by: David S. Miller --- drivers/net/ethernet/cadence/macb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/cadence/macb.c b/drivers/net/ethernet/cadence/macb.c index 48a7d7dee846..898f06f9e286 100644 --- a/drivers/net/ethernet/cadence/macb.c +++ b/drivers/net/ethernet/cadence/macb.c @@ -458,7 +458,8 @@ static int macb_mii_init(struct macb *bp) struct phy_device *phydev; phydev = mdiobus_scan(bp->mii_bus, i); - if (IS_ERR(phydev)) { + if (IS_ERR(phydev) && + PTR_ERR(phydev) != -ENODEV) { err = PTR_ERR(phydev); break; } From 0e28bf93a273cce0db67a17428697abb722c36b8 Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Mon, 2 May 2016 11:02:51 +0200 Subject: [PATCH 12/18] net: mvneta: Remove superfluous SMP function call Since commit 3b9d6da67e11 ("cpu/hotplug: Fix rollback during error-out in __cpu_disable()") it is ensured that callbacks of CPU_ONLINE and CPU_DOWN_PREPARE are processed on the hotplugged CPU. Due to this SMP function calls are no longer required. Replace smp_call_function_single() with a direct call to mvneta_percpu_enable() or mvneta_percpu_disable(). The functions do not require to be called with interrupts disabled, therefore the smp_call_function_single() calling convention is not preserved. Cc: Thomas Petazzoni Cc: netdev@vger.kernel.org Signed-off-by: Anna-Maria Gleixner Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mvneta.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index 7fc490225da5..a6d26d351dfc 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -3354,8 +3354,7 @@ static int mvneta_percpu_notifier(struct notifier_block *nfb, /* Enable per-CPU interrupts on the CPU that is * brought up. */ - smp_call_function_single(cpu, mvneta_percpu_enable, - pp, true); + mvneta_percpu_enable(pp); /* Enable per-CPU interrupt on the one CPU we care * about. @@ -3387,8 +3386,7 @@ static int mvneta_percpu_notifier(struct notifier_block *nfb, /* Disable per-CPU interrupts on the CPU that is * brought down. */ - smp_call_function_single(cpu, mvneta_percpu_disable, - pp, true); + mvneta_percpu_disable(pp); break; case CPU_DEAD: From 996e802187889f1cd412e6929c9344b92ccb78c4 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 2 May 2016 09:25:10 -0700 Subject: [PATCH 13/18] net: Disable segmentation if checksumming is not supported In the case of the mlx4 and mlx5 driver they do not support IPv6 checksum offload for tunnels. With this being the case we should disable GSO in addition to the checksum offload features when we find that a device cannot perform a checksum on a given packet type. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index 77a71cd68535..5c925ac50b95 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2802,7 +2802,7 @@ static netdev_features_t harmonize_features(struct sk_buff *skb, if (skb->ip_summed != CHECKSUM_NONE && !can_checksum_protocol(features, type)) { - features &= ~NETIF_F_CSUM_MASK; + features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); } else if (illegal_highdma(skb->dev, skb)) { features &= ~NETIF_F_SG; } From af67eb9e7e1ab37880459f83153d34b3c42b0075 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 2 May 2016 09:25:16 -0700 Subject: [PATCH 14/18] vxlan: Add checksum check to the features check function We need to perform an additional check on the inner headers to determine if we can offload the checksum for them. Previously this check didn't occur so we would generate an invalid frame in the case of an IPv6 header encapsulated inside of an IPv4 tunnel. To fix this I added a secondary check to vxlan_features_check so that we can verify that we can offload the inner checksum. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- include/linux/if_ether.h | 5 +++++ include/net/vxlan.h | 4 +++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/include/linux/if_ether.h b/include/linux/if_ether.h index d5569734f672..548fd535fd02 100644 --- a/include/linux/if_ether.h +++ b/include/linux/if_ether.h @@ -28,6 +28,11 @@ static inline struct ethhdr *eth_hdr(const struct sk_buff *skb) return (struct ethhdr *)skb_mac_header(skb); } +static inline struct ethhdr *inner_eth_hdr(const struct sk_buff *skb) +{ + return (struct ethhdr *)skb_inner_mac_header(skb); +} + int eth_header_parse(const struct sk_buff *skb, unsigned char *haddr); extern ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len); diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 73ed2e951c02..35437c779da8 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -252,7 +252,9 @@ static inline netdev_features_t vxlan_features_check(struct sk_buff *skb, (skb->inner_protocol_type != ENCAP_TYPE_ETHER || skb->inner_protocol != htons(ETH_P_TEB) || (skb_inner_mac_header(skb) - skb_transport_header(skb) != - sizeof(struct udphdr) + sizeof(struct vxlanhdr)))) + sizeof(struct udphdr) + sizeof(struct vxlanhdr)) || + (skb->ip_summed != CHECKSUM_NONE && + !can_checksum_protocol(features, inner_eth_hdr(skb)->h_proto)))) return features & ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); return features; From eb192840266fab3e3da644018121eed30153355d Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Mon, 2 May 2016 11:24:51 -0700 Subject: [PATCH 15/18] RDS:TCP: Synchronize rds_tcp_accept_one with rds_send_xmit when resetting t_sock There is a race condition between rds_send_xmit -> rds_tcp_xmit and the code that deals with resolution of duelling syns added by commit 241b271952eb ("RDS-TCP: Reset tcp callbacks if re-using an outgoing socket in rds_tcp_accept_one()"). Specifically, we may end up derefencing a null pointer in rds_send_xmit if we have the interleaving sequence: rds_tcp_accept_one rds_send_xmit conn is RDS_CONN_UP, so invoke rds_tcp_xmit tc = conn->c_transport_data rds_tcp_restore_callbacks /* reset t_sock */ null ptr deref from tc->t_sock The race condition can be avoided without adding the overhead of additional locking in the xmit path: have rds_tcp_accept_one wait for rds_tcp_xmit threads to complete before resetting callbacks. The synchronization can be done in the same manner as rds_conn_shutdown(). First set the rds_conn_state to something other than RDS_CONN_UP (so that new threads cannot get into rds_tcp_xmit()), then wait for RDS_IN_XMIT to be cleared in the conn->c_flags indicating that any threads in rds_tcp_xmit are done. Fixes: 241b271952eb ("RDS-TCP: Reset tcp callbacks if re-using an outgoing socket in rds_tcp_accept_one()") Signed-off-by: Sowmini Varadhan Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/tcp.c | 2 +- net/rds/tcp_listen.c | 40 ++++++++++++++++++++++++---------------- 2 files changed, 25 insertions(+), 17 deletions(-) diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 61ed2a8764ba..9134544941c2 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -127,7 +127,7 @@ void rds_tcp_restore_callbacks(struct socket *sock, /* * This is the only path that sets tc->t_sock. Send and receive trust that - * it is set. The RDS_CONN_CONNECTED bit protects those paths from being + * it is set. The RDS_CONN_UP bit protects those paths from being * called while it isn't set. */ void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn) diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 0936a4a32b47..0896187243d6 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -115,24 +115,32 @@ int rds_tcp_accept_one(struct socket *sock) * rds_tcp_state_change() will do that cleanup */ rs_tcp = (struct rds_tcp_connection *)conn->c_transport_data; - if (rs_tcp->t_sock && - ntohl(inet->inet_saddr) < ntohl(inet->inet_daddr)) { - struct sock *nsk = new_sock->sk; - - nsk->sk_user_data = NULL; - nsk->sk_prot->disconnect(nsk, 0); - tcp_done(nsk); - new_sock = NULL; - ret = 0; - goto out; - } else if (rs_tcp->t_sock) { - rds_tcp_restore_callbacks(rs_tcp->t_sock, rs_tcp); - conn->c_outgoing = 0; - } - rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING); + if (rs_tcp->t_sock) { + /* Need to resolve a duelling SYN between peers. + * We have an outstanding SYN to this peer, which may + * potentially have transitioned to the RDS_CONN_UP state, + * so we must quiesce any send threads before resetting + * c_transport_data. + */ + wait_event(conn->c_waitq, + !test_bit(RDS_IN_XMIT, &conn->c_flags)); + if (ntohl(inet->inet_saddr) < ntohl(inet->inet_daddr)) { + struct sock *nsk = new_sock->sk; + + nsk->sk_user_data = NULL; + nsk->sk_prot->disconnect(nsk, 0); + tcp_done(nsk); + new_sock = NULL; + ret = 0; + goto out; + } else if (rs_tcp->t_sock) { + rds_tcp_restore_callbacks(rs_tcp->t_sock, rs_tcp); + conn->c_outgoing = 0; + } + } rds_tcp_set_callbacks(new_sock, conn); - rds_connect_complete(conn); + rds_connect_complete(conn); /* marks RDS_CONN_UP */ new_sock = NULL; ret = 0; From bd7c5f983f3185b75cc23bdd5dbc3a676aef3d1e Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Mon, 2 May 2016 11:24:52 -0700 Subject: [PATCH 16/18] RDS: TCP: Synchronize accept() and connect() paths on t_conn_lock. An arbitration scheme for duelling SYNs is implemented as part of commit 241b271952eb ("RDS-TCP: Reset tcp callbacks if re-using an outgoing socket in rds_tcp_accept_one()") which ensures that both nodes involved will arrive at the same arbitration decision. However, this needs to be synchronized with an outgoing SYN to be generated by rds_tcp_conn_connect(). This commit achieves the synchronization through the t_conn_lock mutex in struct rds_tcp_connection. The rds_conn_state is checked in rds_tcp_conn_connect() after acquiring the t_conn_lock mutex. A SYN is sent out only if the RDS connection is not already UP (an UP would indicate that rds_tcp_accept_one() has completed 3WH, so no SYN needs to be generated). Similarly, the rds_conn_state is checked in rds_tcp_accept_one() after acquiring the t_conn_lock mutex. The only acceptable states (to allow continuation of the arbitration logic) are UP (i.e., outgoing SYN was SYN-ACKed by peer after it sent us the SYN) or CONNECTING (we sent outgoing SYN before we saw incoming SYN). Signed-off-by: Sowmini Varadhan Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/tcp.c | 1 + net/rds/tcp.h | 4 ++++ net/rds/tcp_connect.c | 8 ++++++++ net/rds/tcp_listen.c | 30 ++++++++++++++++++++---------- 4 files changed, 33 insertions(+), 10 deletions(-) diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 9134544941c2..86187dad1440 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -216,6 +216,7 @@ static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp) if (!tc) return -ENOMEM; + mutex_init(&tc->t_conn_lock); tc->t_sock = NULL; tc->t_tinc = NULL; tc->t_tinc_hdr_rem = sizeof(struct rds_header); diff --git a/net/rds/tcp.h b/net/rds/tcp.h index 64f873c0c6b6..41c228300525 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -12,6 +12,10 @@ struct rds_tcp_connection { struct list_head t_tcp_node; struct rds_connection *conn; + /* t_conn_lock synchronizes the connection establishment between + * rds_tcp_accept_one and rds_tcp_conn_connect + */ + struct mutex t_conn_lock; struct socket *t_sock; void *t_orig_write_space; void *t_orig_data_ready; diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index 5cb16875c460..49a3fcfed360 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -78,7 +78,14 @@ int rds_tcp_conn_connect(struct rds_connection *conn) struct socket *sock = NULL; struct sockaddr_in src, dest; int ret; + struct rds_tcp_connection *tc = conn->c_transport_data; + mutex_lock(&tc->t_conn_lock); + + if (rds_conn_up(conn)) { + mutex_unlock(&tc->t_conn_lock); + return 0; + } ret = sock_create_kern(rds_conn_net(conn), PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); if (ret < 0) @@ -120,6 +127,7 @@ int rds_tcp_conn_connect(struct rds_connection *conn) } out: + mutex_unlock(&tc->t_conn_lock); if (sock) sock_release(sock); return ret; diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 0896187243d6..be263cdf268b 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -76,7 +76,9 @@ int rds_tcp_accept_one(struct socket *sock) struct rds_connection *conn; int ret; struct inet_sock *inet; - struct rds_tcp_connection *rs_tcp; + struct rds_tcp_connection *rs_tcp = NULL; + int conn_state; + struct sock *nsk; ret = sock_create_kern(sock_net(sock->sk), sock->sk->sk_family, sock->sk->sk_type, sock->sk->sk_protocol, @@ -116,6 +118,10 @@ int rds_tcp_accept_one(struct socket *sock) */ rs_tcp = (struct rds_tcp_connection *)conn->c_transport_data; rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING); + mutex_lock(&rs_tcp->t_conn_lock); + conn_state = rds_conn_state(conn); + if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_UP) + goto rst_nsk; if (rs_tcp->t_sock) { /* Need to resolve a duelling SYN between peers. * We have an outstanding SYN to this peer, which may @@ -126,14 +132,7 @@ int rds_tcp_accept_one(struct socket *sock) wait_event(conn->c_waitq, !test_bit(RDS_IN_XMIT, &conn->c_flags)); if (ntohl(inet->inet_saddr) < ntohl(inet->inet_daddr)) { - struct sock *nsk = new_sock->sk; - - nsk->sk_user_data = NULL; - nsk->sk_prot->disconnect(nsk, 0); - tcp_done(nsk); - new_sock = NULL; - ret = 0; - goto out; + goto rst_nsk; } else if (rs_tcp->t_sock) { rds_tcp_restore_callbacks(rs_tcp->t_sock, rs_tcp); conn->c_outgoing = 0; @@ -143,8 +142,19 @@ int rds_tcp_accept_one(struct socket *sock) rds_connect_complete(conn); /* marks RDS_CONN_UP */ new_sock = NULL; ret = 0; - + goto out; +rst_nsk: + /* reset the newly returned accept sock and bail */ + nsk = new_sock->sk; + rds_tcp_stats_inc(s_tcp_listen_closed_stale); + nsk->sk_user_data = NULL; + nsk->sk_prot->disconnect(nsk, 0); + tcp_done(nsk); + new_sock = NULL; + ret = 0; out: + if (rs_tcp) + mutex_unlock(&rs_tcp->t_conn_lock); if (new_sock) sock_release(new_sock); return ret; From cf6696608a069aaac10a253207deeb63dcd6a653 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 2 May 2016 18:38:45 -0700 Subject: [PATCH 17/18] net: macb: Probe MDIO bus before registering netdev The current sequence makes us register for a network device prior to registering and probing the MDIO bus which could lead to some unwanted consequences, like a thread of execution calling into ndo_open before register_netdev() returns, while the MDIO bus is not ready yet. Rework the sequence to register for the MDIO bus, and therefore attach to a PHY prior to calling register_netdev(), which implies reworking the error path a bit. Signed-off-by: Florian Fainelli Acked-by: Nicolas Ferre Signed-off-by: David S. Miller --- drivers/net/ethernet/cadence/macb.c | 31 ++++++++++++++++++----------- 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/cadence/macb.c b/drivers/net/ethernet/cadence/macb.c index 898f06f9e286..a63551d0a18a 100644 --- a/drivers/net/ethernet/cadence/macb.c +++ b/drivers/net/ethernet/cadence/macb.c @@ -441,7 +441,7 @@ static int macb_mii_init(struct macb *bp) snprintf(bp->mii_bus->id, MII_BUS_ID_SIZE, "%s-%x", bp->pdev->name, bp->pdev->id); bp->mii_bus->priv = bp; - bp->mii_bus->parent = &bp->dev->dev; + bp->mii_bus->parent = &bp->pdev->dev; pdata = dev_get_platdata(&bp->pdev->dev); dev_set_drvdata(&bp->dev->dev, bp->mii_bus); @@ -3020,29 +3020,36 @@ static int macb_probe(struct platform_device *pdev) if (err) goto err_out_free_netdev; + err = macb_mii_init(bp); + if (err) + goto err_out_free_netdev; + + phydev = bp->phy_dev; + + netif_carrier_off(dev); + err = register_netdev(dev); if (err) { dev_err(&pdev->dev, "Cannot register net device, aborting.\n"); - goto err_out_unregister_netdev; + goto err_out_unregister_mdio; } - err = macb_mii_init(bp); - if (err) - goto err_out_unregister_netdev; - - netif_carrier_off(dev); + phy_attached_info(phydev); netdev_info(dev, "Cadence %s rev 0x%08x at 0x%08lx irq %d (%pM)\n", macb_is_gem(bp) ? "GEM" : "MACB", macb_readl(bp, MID), dev->base_addr, dev->irq, dev->dev_addr); - phydev = bp->phy_dev; - phy_attached_info(phydev); - return 0; -err_out_unregister_netdev: - unregister_netdev(dev); +err_out_unregister_mdio: + phy_disconnect(bp->phy_dev); + mdiobus_unregister(bp->mii_bus); + mdiobus_free(bp->mii_bus); + + /* Shutdown the PHY if there is a GPIO reset */ + if (bp->reset_gpio) + gpiod_set_value(bp->reset_gpio, 0); err_out_free_netdev: free_netdev(dev); From 79e8dc8b80bff0bc5bbb90ca5e73044bf207c8ac Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Tue, 3 May 2016 09:58:27 +0200 Subject: [PATCH 18/18] ipv6/ila: fix nlsize calculation for lwtunnel The handler 'ila_fill_encap_info' adds one attribute: ILA_ATTR_LOCATOR. Fixes: 65d7ab8de582 ("net: Identifier Locator Addressing module") CC: Tom Herbert Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/ipv6/ila/ila_lwt.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c index 2ae3c4fd8aab..41f18de5dcc2 100644 --- a/net/ipv6/ila/ila_lwt.c +++ b/net/ipv6/ila/ila_lwt.c @@ -120,8 +120,7 @@ static int ila_fill_encap_info(struct sk_buff *skb, static int ila_encap_nlsize(struct lwtunnel_state *lwtstate) { - /* No encapsulation overhead */ - return 0; + return nla_total_size(sizeof(u64)); /* ILA_ATTR_LOCATOR */ } static int ila_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)