From f0b07bb151b098d291fd1fd71ef7a2df56fb124a Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Thu, 29 Mar 2018 19:20:32 +0300 Subject: [PATCH 1/5] net: Introduce net_rwsem to protect net_namespace_list rtnl_lock() is used everywhere, and contention is very high. When someone wants to iterate over alive net namespaces, he/she has no a possibility to do that without exclusive lock. But the exclusive rtnl_lock() in such places is overkill, and it just increases the contention. Yes, there is already for_each_net_rcu() in kernel, but it requires rcu_read_lock(), and this can't be sleepable. Also, sometimes it may be need really prevent net_namespace_list growth, so for_each_net_rcu() is not fit there. This patch introduces new rw_semaphore, which will be used instead of rtnl_mutex to protect net_namespace_list. It is sleepable and allows not-exclusive iterations over net namespaces list. It allows to stop using rtnl_lock() in several places (what is made in next patches) and makes less the time, we keep rtnl_mutex. Here we just add new lock, while the explanation of we can remove rtnl_lock() there are in next patches. Fine grained locks generally are better, then one big lock, so let's do that with net_namespace_list, while the situation allows that. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- drivers/infiniband/core/roce_gid_mgmt.c | 2 ++ include/linux/rtnetlink.h | 1 + include/net/net_namespace.h | 1 + net/core/dev.c | 5 +++++ net/core/fib_notifier.c | 2 ++ net/core/net_namespace.c | 18 +++++++++++++----- net/core/rtnetlink.c | 5 +++++ net/netfilter/nf_conntrack_core.c | 2 ++ net/openvswitch/datapath.c | 2 ++ net/wireless/wext-core.c | 2 ++ security/selinux/include/xfrm.h | 2 ++ 11 files changed, 37 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c index 5a52ec77940a..cc2966380c0c 100644 --- a/drivers/infiniband/core/roce_gid_mgmt.c +++ b/drivers/infiniband/core/roce_gid_mgmt.c @@ -403,10 +403,12 @@ static void enum_all_gids_of_dev_cb(struct ib_device *ib_dev, * our feet */ rtnl_lock(); + down_read(&net_rwsem); for_each_net(net) for_each_netdev(net, ndev) if (is_eth_port_of_netdev(ib_dev, port, rdma_ndev, ndev)) add_netdev_ips(ib_dev, port, rdma_ndev, ndev); + up_read(&net_rwsem); rtnl_unlock(); } diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index c7d1e4689325..5225832bd6ff 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -37,6 +37,7 @@ extern int rtnl_lock_killable(void); extern wait_queue_head_t netdev_unregistering_wq; extern struct rw_semaphore pernet_ops_rwsem; +extern struct rw_semaphore net_rwsem; #ifdef CONFIG_PROVE_LOCKING extern bool lockdep_rtnl_is_held(void); diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 1ab4f920f109..47e35cce3b64 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -291,6 +291,7 @@ static inline struct net *read_pnet(const possible_net_t *pnet) #endif } +/* Protected by net_rwsem */ #define for_each_net(VAR) \ list_for_each_entry(VAR, &net_namespace_list, list) diff --git a/net/core/dev.c b/net/core/dev.c index e13807b5c84d..eca5458b2753 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1629,6 +1629,7 @@ int register_netdevice_notifier(struct notifier_block *nb) goto unlock; if (dev_boot_phase) goto unlock; + down_read(&net_rwsem); for_each_net(net) { for_each_netdev(net, dev) { err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev); @@ -1642,6 +1643,7 @@ int register_netdevice_notifier(struct notifier_block *nb) call_netdevice_notifier(nb, NETDEV_UP, dev); } } + up_read(&net_rwsem); unlock: rtnl_unlock(); @@ -1664,6 +1666,7 @@ int register_netdevice_notifier(struct notifier_block *nb) } outroll: + up_read(&net_rwsem); raw_notifier_chain_unregister(&netdev_chain, nb); goto unlock; } @@ -1694,6 +1697,7 @@ int unregister_netdevice_notifier(struct notifier_block *nb) if (err) goto unlock; + down_read(&net_rwsem); for_each_net(net) { for_each_netdev(net, dev) { if (dev->flags & IFF_UP) { @@ -1704,6 +1708,7 @@ int unregister_netdevice_notifier(struct notifier_block *nb) call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); } } + up_read(&net_rwsem); unlock: rtnl_unlock(); return err; diff --git a/net/core/fib_notifier.c b/net/core/fib_notifier.c index 0c048bdeb016..614b985c92a4 100644 --- a/net/core/fib_notifier.c +++ b/net/core/fib_notifier.c @@ -33,6 +33,7 @@ static unsigned int fib_seq_sum(void) struct net *net; rtnl_lock(); + down_read(&net_rwsem); for_each_net(net) { rcu_read_lock(); list_for_each_entry_rcu(ops, &net->fib_notifier_ops, list) { @@ -43,6 +44,7 @@ static unsigned int fib_seq_sum(void) } rcu_read_unlock(); } + up_read(&net_rwsem); rtnl_unlock(); return fib_seq; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index b5796d17a302..7fdf321d4997 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -33,6 +33,10 @@ static struct list_head *first_device = &pernet_list; LIST_HEAD(net_namespace_list); EXPORT_SYMBOL_GPL(net_namespace_list); +/* Protects net_namespace_list. Nests iside rtnl_lock() */ +DECLARE_RWSEM(net_rwsem); +EXPORT_SYMBOL_GPL(net_rwsem); + struct net init_net = { .count = REFCOUNT_INIT(1), .dev_base_head = LIST_HEAD_INIT(init_net.dev_base_head), @@ -309,9 +313,9 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) if (error < 0) goto out_undo; } - rtnl_lock(); + down_write(&net_rwsem); list_add_tail_rcu(&net->list, &net_namespace_list); - rtnl_unlock(); + up_write(&net_rwsem); out: return error; @@ -450,7 +454,7 @@ static void unhash_nsid(struct net *net, struct net *last) * and this work is the only process, that may delete * a net from net_namespace_list. So, when the below * is executing, the list may only grow. Thus, we do not - * use for_each_net_rcu() or rtnl_lock(). + * use for_each_net_rcu() or net_rwsem. */ for_each_net(tmp) { int id; @@ -485,7 +489,7 @@ static void cleanup_net(struct work_struct *work) down_read(&pernet_ops_rwsem); /* Don't let anyone else find us. */ - rtnl_lock(); + down_write(&net_rwsem); llist_for_each_entry(net, net_kill_list, cleanup_list) list_del_rcu(&net->list); /* Cache last net. After we unlock rtnl, no one new net @@ -499,7 +503,7 @@ static void cleanup_net(struct work_struct *work) * useless anyway, as netns_ids are destroyed there. */ last = list_last_entry(&net_namespace_list, struct net, list); - rtnl_unlock(); + up_write(&net_rwsem); llist_for_each_entry(net, net_kill_list, cleanup_list) { unhash_nsid(net, last); @@ -900,6 +904,9 @@ static int __register_pernet_operations(struct list_head *list, list_add_tail(&ops->list, list); if (ops->init || (ops->id && ops->size)) { + /* We held write locked pernet_ops_rwsem, and parallel + * setup_net() and cleanup_net() are not possible. + */ for_each_net(net) { error = ops_init(ops, net); if (error) @@ -923,6 +930,7 @@ static void __unregister_pernet_operations(struct pernet_operations *ops) LIST_HEAD(net_exit_list); list_del(&ops->list); + /* See comment in __register_pernet_operations() */ for_each_net(net) list_add_tail(&net->exit_list, &net_exit_list); ops_exit_list(ops, &net_exit_list); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 2d3949789cef..e86b28482ca7 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -418,9 +418,11 @@ void __rtnl_link_unregister(struct rtnl_link_ops *ops) { struct net *net; + down_read(&net_rwsem); for_each_net(net) { __rtnl_kill_links(net, ops); } + up_read(&net_rwsem); list_del(&ops->list); } EXPORT_SYMBOL_GPL(__rtnl_link_unregister); @@ -438,6 +440,9 @@ static void rtnl_lock_unregistering_all(void) for (;;) { unregistering = false; rtnl_lock(); + /* We held write locked pernet_ops_rwsem, and parallel + * setup_net() and cleanup_net() are not possible. + */ for_each_net(net) { if (net->dev_unreg_count > 0) { unregistering = true; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 705198de671d..370f9b7f051b 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1764,12 +1764,14 @@ nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data) struct net *net; rtnl_lock(); + down_read(&net_rwsem); for_each_net(net) { if (atomic_read(&net->ct.count) == 0) continue; __nf_ct_unconfirmed_destroy(net); nf_queue_nf_hook_drop(net); } + up_read(&net_rwsem); rtnl_unlock(); /* Need to wait for netns cleanup worker to finish, if its diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index ef38e5aecd28..9746ee30a99b 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -2364,8 +2364,10 @@ static void __net_exit ovs_exit_net(struct net *dnet) __dp_destroy(dp); rtnl_lock(); + down_read(&net_rwsem); for_each_net(net) list_vports_from_net(net, dnet, &head); + up_read(&net_rwsem); rtnl_unlock(); /* Detach all vports from given namespace. */ diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c index 9efbfc753347..544d7b62d7ca 100644 --- a/net/wireless/wext-core.c +++ b/net/wireless/wext-core.c @@ -349,11 +349,13 @@ void wireless_nlevent_flush(void) ASSERT_RTNL(); + down_read(&net_rwsem); for_each_net(net) { while ((skb = skb_dequeue(&net->wext_nlevents))) rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_KERNEL); } + up_read(&net_rwsem); } EXPORT_SYMBOL_GPL(wireless_nlevent_flush); diff --git a/security/selinux/include/xfrm.h b/security/selinux/include/xfrm.h index 1f173a7a4daa..31d66431be1e 100644 --- a/security/selinux/include/xfrm.h +++ b/security/selinux/include/xfrm.h @@ -48,8 +48,10 @@ static inline void selinux_xfrm_notify_policyload(void) struct net *net; rtnl_lock(); + down_read(&net_rwsem); for_each_net(net) rt_genid_bump_all(net); + up_read(&net_rwsem); rtnl_unlock(); } #else From 10256debb918aea083d0ddada64d29014c642a7b Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Thu, 29 Mar 2018 19:20:44 +0300 Subject: [PATCH 2/5] net: Don't take rtnl_lock() in wireless_nlevent_flush() This function iterates over net_namespace_list and flushes the queue for every of them. What does this rtnl_lock() protects?! Since we may add skbs to net::wext_nlevents without rtnl_lock(), it does not protects us about queuers. It guarantees, two threads can't flush the queue in parallel, that can change the order, but since skb can be queued in any order, it doesn't matter, how many threads do this in parallel. In case of several threads, this will be even faster. So, we can remove rtnl_lock() here, as it was used for iteration over net_namespace_list only. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/wireless/wext-core.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c index 544d7b62d7ca..5e677dac2a0c 100644 --- a/net/wireless/wext-core.c +++ b/net/wireless/wext-core.c @@ -347,8 +347,6 @@ void wireless_nlevent_flush(void) struct sk_buff *skb; struct net *net; - ASSERT_RTNL(); - down_read(&net_rwsem); for_each_net(net) { while ((skb = skb_dequeue(&net->wext_nlevents))) @@ -412,9 +410,7 @@ subsys_initcall(wireless_nlevent_init); /* Process events generated by the wireless layer or the driver. */ static void wireless_nlevent_process(struct work_struct *work) { - rtnl_lock(); wireless_nlevent_flush(); - rtnl_unlock(); } static DECLARE_WORK(wireless_nlevent_work, wireless_nlevent_process); From 350311aab4c0b2477f9cf3fb03cef2e4cd6c3b18 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Thu, 29 Mar 2018 19:20:56 +0300 Subject: [PATCH 3/5] security: Remove rtnl_lock() in selinux_xfrm_notify_policyload() rt_genid_bump_all() consists of ipv4 and ipv6 part. ipv4 part is incrementing of net::ipv4::rt_genid, and I see many places, where it's read without rtnl_lock(). ipv6 part calls __fib6_clean_all(), and it's also called without rtnl_lock() in other places. So, rtnl_lock() here was used to iterate net_namespace_list only, and we can remove it. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- security/selinux/include/xfrm.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/security/selinux/include/xfrm.h b/security/selinux/include/xfrm.h index 31d66431be1e..a0b465316292 100644 --- a/security/selinux/include/xfrm.h +++ b/security/selinux/include/xfrm.h @@ -47,12 +47,10 @@ static inline void selinux_xfrm_notify_policyload(void) { struct net *net; - rtnl_lock(); down_read(&net_rwsem); for_each_net(net) rt_genid_bump_all(net); up_read(&net_rwsem); - rtnl_unlock(); } #else static inline int selinux_xfrm_enabled(void) From ec9c780925c57588637e1dbd8650d294107311c0 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Thu, 29 Mar 2018 19:21:09 +0300 Subject: [PATCH 4/5] ovs: Remove rtnl_lock() from ovs_exit_net() Here we iterate for_each_net() and removes vport from alive net to the exiting net. ovs_net::dps are protected by ovs_mutex(), and the others, who change it (ovs_dp_cmd_new(), __dp_destroy()) also take it. The same with datapath::ports list. So, we remove rtnl_lock() here. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/openvswitch/datapath.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 9746ee30a99b..015e24e08909 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -2363,12 +2363,10 @@ static void __net_exit ovs_exit_net(struct net *dnet) list_for_each_entry_safe(dp, dp_next, &ovs_net->dps, list_node) __dp_destroy(dp); - rtnl_lock(); down_read(&net_rwsem); for_each_net(net) list_vports_from_net(net, dnet, &head); up_read(&net_rwsem); - rtnl_unlock(); /* Detach all vports from given namespace. */ list_for_each_entry_safe(vport, vport_next, &head, detach_list) { From 152f253152cc08f5d26ba76659723d2d83b363f8 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Thu, 29 Mar 2018 19:21:20 +0300 Subject: [PATCH 5/5] net: Remove rtnl_lock() in nf_ct_iterate_destroy() rtnl_lock() doesn't protect net::ct::count, and it's not needed for__nf_ct_unconfirmed_destroy() and for nf_queue_nf_hook_drop(). Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/netfilter/nf_conntrack_core.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 370f9b7f051b..41ff04ee2554 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1763,7 +1763,6 @@ nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data) { struct net *net; - rtnl_lock(); down_read(&net_rwsem); for_each_net(net) { if (atomic_read(&net->ct.count) == 0) @@ -1772,7 +1771,6 @@ nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data) nf_queue_nf_hook_drop(net); } up_read(&net_rwsem); - rtnl_unlock(); /* Need to wait for netns cleanup worker to finish, if its * running -- it might have deleted a net namespace from