From 6493517eaea9b052e081e557f7c8bb06cc6b1852 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 13 Mar 2015 00:04:51 -0500 Subject: [PATCH 1/6] tcp_metrics: panic when tcp_metrics_init fails. There is not a practical way to cleanup during boot so just panic if there is a problem initializing tcp_metrics. That will at least give us a clear place to start debugging if something does go wrong. Signed-off-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- net/ipv4/tcp_metrics.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index e5f41bd5ec1b..4206b14d956d 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -1175,16 +1175,10 @@ void __init tcp_metrics_init(void) ret = register_pernet_subsys(&tcp_net_metrics_ops); if (ret < 0) - goto cleanup; + panic("Could not allocate the tcp_metrics hash table\n"); + ret = genl_register_family_with_ops(&tcp_metrics_nl_family, tcp_metrics_nl_ops); if (ret < 0) - goto cleanup_subsys; - return; - -cleanup_subsys: - unregister_pernet_subsys(&tcp_net_metrics_ops); - -cleanup: - return; + panic("Could not register tcp_metrics generic netlink\n"); } From 3e5da62d0bcbfa86332f66cca0e3983e70557fac Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 13 Mar 2015 00:05:24 -0500 Subject: [PATCH 2/6] tcp_metrics: Mix the network namespace into the hash function. In preparation for using one hash table for all network namespaces mix the network namespace into the hash value. Signed-off-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- net/ipv4/tcp_metrics.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 4206b14d956d..fbb42f44501e 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -252,6 +252,7 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, } net = dev_net(dst->dev); + hash ^= net_hash_mix(net); hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; @@ -299,6 +300,7 @@ static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock return NULL; net = twsk_net(tw); + hash ^= net_hash_mix(net); hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; @@ -347,6 +349,7 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk, return NULL; net = dev_net(dst->dev); + hash ^= net_hash_mix(net); hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); tm = __tcp_get_metrics(&saddr, &daddr, net, hash); @@ -994,6 +997,7 @@ static int tcp_metrics_nl_cmd_get(struct sk_buff *skb, struct genl_info *info) if (!reply) goto nla_put_failure; + hash ^= net_hash_mix(net); hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); ret = -ESRCH; rcu_read_lock(); @@ -1070,6 +1074,7 @@ static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info) if (ret < 0) src = false; + hash ^= net_hash_mix(net); hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); hb = net->ipv4.tcp_metrics_hash + hash; pp = &hb->chain; From 849e8a0ca8d5d286510ab30b5f67b91aa6965ef6 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 13 Mar 2015 00:05:52 -0500 Subject: [PATCH 3/6] tcp_metrics: Add a field tcpm_net and verify it matches on lookup In preparation for using one tcp metrics hash table for all network namespaces add a field tcpm_net to struct tcp_metrics_block, and verify that field on all hash table lookups. Make the field tcpm_net of type possible_net_t so it takes no space when network namespaces are disabled. Further add a function tm_net to read that field so we can be efficient when network namespaces are disabled and concise the rest of the time. Signed-off-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- net/ipv4/tcp_metrics.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index fbb42f44501e..461c3d2e1ca4 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -40,6 +40,7 @@ struct tcp_fastopen_metrics { struct tcp_metrics_block { struct tcp_metrics_block __rcu *tcpm_next; + possible_net_t tcpm_net; struct inetpeer_addr tcpm_saddr; struct inetpeer_addr tcpm_daddr; unsigned long tcpm_stamp; @@ -52,6 +53,11 @@ struct tcp_metrics_block { struct rcu_head rcu_head; }; +static inline struct net *tm_net(struct tcp_metrics_block *tm) +{ + return read_pnet(&tm->tcpm_net); +} + static bool tcp_metric_locked(struct tcp_metrics_block *tm, enum tcp_metric_index idx) { @@ -183,6 +189,7 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, if (!tm) goto out_unlock; } + write_pnet(&tm->tcpm_net, net); tm->tcpm_saddr = *saddr; tm->tcpm_daddr = *daddr; @@ -217,7 +224,8 @@ static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *s for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; tm = rcu_dereference(tm->tcpm_next)) { if (addr_same(&tm->tcpm_saddr, saddr) && - addr_same(&tm->tcpm_daddr, daddr)) + addr_same(&tm->tcpm_daddr, daddr) && + net_eq(tm_net(tm), net)) break; depth++; } @@ -258,7 +266,8 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; tm = rcu_dereference(tm->tcpm_next)) { if (addr_same(&tm->tcpm_saddr, &saddr) && - addr_same(&tm->tcpm_daddr, &daddr)) + addr_same(&tm->tcpm_daddr, &daddr) && + net_eq(tm_net(tm), net)) break; } tcpm_check_stamp(tm, dst); @@ -306,7 +315,8 @@ static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; tm = rcu_dereference(tm->tcpm_next)) { if (addr_same(&tm->tcpm_saddr, &saddr) && - addr_same(&tm->tcpm_daddr, &daddr)) + addr_same(&tm->tcpm_daddr, &daddr) && + net_eq(tm_net(tm), net)) break; } return tm; @@ -912,6 +922,8 @@ static int tcp_metrics_nl_dump(struct sk_buff *skb, rcu_read_lock(); for (col = 0, tm = rcu_dereference(hb->chain); tm; tm = rcu_dereference(tm->tcpm_next), col++) { + if (!net_eq(tm_net(tm), net)) + continue; if (col < s_col) continue; if (tcp_metrics_dump_info(skb, cb, tm) < 0) { @@ -1004,7 +1016,8 @@ static int tcp_metrics_nl_cmd_get(struct sk_buff *skb, struct genl_info *info) for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; tm = rcu_dereference(tm->tcpm_next)) { if (addr_same(&tm->tcpm_daddr, &daddr) && - (!src || addr_same(&tm->tcpm_saddr, &saddr))) { + (!src || addr_same(&tm->tcpm_saddr, &saddr)) && + net_eq(tm_net(tm), net)) { ret = tcp_metrics_fill_info(msg, tm); break; } @@ -1081,7 +1094,8 @@ static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info) spin_lock_bh(&tcp_metrics_lock); for (tm = deref_locked_genl(*pp); tm; tm = deref_locked_genl(*pp)) { if (addr_same(&tm->tcpm_daddr, &daddr) && - (!src || addr_same(&tm->tcpm_saddr, &saddr))) { + (!src || addr_same(&tm->tcpm_saddr, &saddr)) && + net_eq(tm_net(tm), net)) { *pp = tm->tcpm_next; kfree_rcu(tm, rcu_head); found = true; From 8a4bff714fc088729abdd479acbfe934ddf16f7e Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 13 Mar 2015 00:06:43 -0500 Subject: [PATCH 4/6] tcp_metrics: Remove the unused return code from tcp_metrics_flush_all tcp_metrics_flush_all always returns 0. Remove the unnecessary return code. Signed-off-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- net/ipv4/tcp_metrics.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 461c3d2e1ca4..0d07e14f2ca5 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -1043,7 +1043,7 @@ static int tcp_metrics_nl_cmd_get(struct sk_buff *skb, struct genl_info *info) #define deref_genl(p) rcu_dereference_protected(p, lockdep_genl_is_held()) -static int tcp_metrics_flush_all(struct net *net) +static void tcp_metrics_flush_all(struct net *net) { unsigned int max_rows = 1U << net->ipv4.tcp_metrics_hash_log; struct tcpm_hash_bucket *hb = net->ipv4.tcp_metrics_hash; @@ -1064,7 +1064,6 @@ static int tcp_metrics_flush_all(struct net *net) tm = next; } } - return 0; } static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info) @@ -1081,8 +1080,10 @@ static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info) ret = parse_nl_addr(info, &daddr, &hash, 1); if (ret < 0) return ret; - if (ret > 0) - return tcp_metrics_flush_all(net); + if (ret > 0) { + tcp_metrics_flush_all(net); + return 0; + } ret = parse_nl_saddr(info, &saddr); if (ret < 0) src = false; From 04f721c671656f93de888b1d176ba30b7336cca3 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 13 Mar 2015 00:07:10 -0500 Subject: [PATCH 5/6] tcp_metrics: Rewrite tcp_metrics_flush_all Rewrite tcp_metrics_flush_all so that it can cope with entries from different network namespaces on it's hash chain. This is based on the logic in tcp_metrics_nl_cmd_del for deleting a selection of entries from a tcp metrics hash chain. Signed-off-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- net/ipv4/tcp_metrics.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 0d07e14f2ca5..baccb070427d 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -1051,18 +1051,19 @@ static void tcp_metrics_flush_all(struct net *net) unsigned int row; for (row = 0; row < max_rows; row++, hb++) { + struct tcp_metrics_block __rcu **pp; spin_lock_bh(&tcp_metrics_lock); - tm = deref_locked_genl(hb->chain); - if (tm) - hb->chain = NULL; - spin_unlock_bh(&tcp_metrics_lock); - while (tm) { - struct tcp_metrics_block *next; - - next = deref_genl(tm->tcpm_next); - kfree_rcu(tm, rcu_head); - tm = next; + pp = &hb->chain; + for (tm = deref_locked_genl(*pp); tm; + tm = deref_locked_genl(*pp)) { + if (net_eq(tm_net(tm), net)) { + *pp = tm->tcpm_next; + kfree_rcu(tm, rcu_head); + } else { + pp = &tm->tcpm_next; + } } + spin_unlock_bh(&tcp_metrics_lock); } } From 098a697b497e3154a1a583c1d34c67568acaadcc Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 13 Mar 2015 00:07:44 -0500 Subject: [PATCH 6/6] tcp_metrics: Use a single hash table for all network namespaces. Now that all of the operations are safe on a single hash table accross network namespaces, allocate a single global hash table and update the code to use it. Signed-off-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 2 -- net/ipv4/tcp_metrics.c | 66 ++++++++++++++++++---------------------- 2 files changed, 30 insertions(+), 38 deletions(-) diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 8f3a1a1a5a94..614a49be68a9 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -54,8 +54,6 @@ struct netns_ipv4 { struct sock *mc_autojoin_sk; struct inet_peer_base *peers; - struct tcpm_hash_bucket *tcp_metrics_hash; - unsigned int tcp_metrics_hash_log; struct sock * __percpu *tcp_sk; struct netns_frags frags; #ifdef CONFIG_NETFILTER diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index baccb070427d..366728cbee4a 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -97,6 +97,9 @@ struct tcpm_hash_bucket { struct tcp_metrics_block __rcu *chain; }; +static struct tcpm_hash_bucket *tcp_metrics_hash __read_mostly; +static unsigned int tcp_metrics_hash_log __read_mostly; + static DEFINE_SPINLOCK(tcp_metrics_lock); static void tcpm_suck_dst(struct tcp_metrics_block *tm, @@ -177,7 +180,7 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, if (unlikely(reclaim)) { struct tcp_metrics_block *oldest; - oldest = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); + oldest = rcu_dereference(tcp_metrics_hash[hash].chain); for (tm = rcu_dereference(oldest->tcpm_next); tm; tm = rcu_dereference(tm->tcpm_next)) { if (time_before(tm->tcpm_stamp, oldest->tcpm_stamp)) @@ -196,8 +199,8 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, tcpm_suck_dst(tm, dst, true); if (likely(!reclaim)) { - tm->tcpm_next = net->ipv4.tcp_metrics_hash[hash].chain; - rcu_assign_pointer(net->ipv4.tcp_metrics_hash[hash].chain, tm); + tm->tcpm_next = tcp_metrics_hash[hash].chain; + rcu_assign_pointer(tcp_metrics_hash[hash].chain, tm); } out_unlock: @@ -221,7 +224,7 @@ static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *s struct tcp_metrics_block *tm; int depth = 0; - for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; + for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm; tm = rcu_dereference(tm->tcpm_next)) { if (addr_same(&tm->tcpm_saddr, saddr) && addr_same(&tm->tcpm_daddr, daddr) && @@ -261,9 +264,9 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, net = dev_net(dst->dev); hash ^= net_hash_mix(net); - hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); + hash = hash_32(hash, tcp_metrics_hash_log); - for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; + for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm; tm = rcu_dereference(tm->tcpm_next)) { if (addr_same(&tm->tcpm_saddr, &saddr) && addr_same(&tm->tcpm_daddr, &daddr) && @@ -310,9 +313,9 @@ static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock net = twsk_net(tw); hash ^= net_hash_mix(net); - hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); + hash = hash_32(hash, tcp_metrics_hash_log); - for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; + for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm; tm = rcu_dereference(tm->tcpm_next)) { if (addr_same(&tm->tcpm_saddr, &saddr) && addr_same(&tm->tcpm_daddr, &daddr) && @@ -360,7 +363,7 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk, net = dev_net(dst->dev); hash ^= net_hash_mix(net); - hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); + hash = hash_32(hash, tcp_metrics_hash_log); tm = __tcp_get_metrics(&saddr, &daddr, net, hash); if (tm == TCP_METRICS_RECLAIM_PTR) @@ -911,13 +914,13 @@ static int tcp_metrics_nl_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); - unsigned int max_rows = 1U << net->ipv4.tcp_metrics_hash_log; + unsigned int max_rows = 1U << tcp_metrics_hash_log; unsigned int row, s_row = cb->args[0]; int s_col = cb->args[1], col = s_col; for (row = s_row; row < max_rows; row++, s_col = 0) { struct tcp_metrics_block *tm; - struct tcpm_hash_bucket *hb = net->ipv4.tcp_metrics_hash + row; + struct tcpm_hash_bucket *hb = tcp_metrics_hash + row; rcu_read_lock(); for (col = 0, tm = rcu_dereference(hb->chain); tm; @@ -1010,10 +1013,10 @@ static int tcp_metrics_nl_cmd_get(struct sk_buff *skb, struct genl_info *info) goto nla_put_failure; hash ^= net_hash_mix(net); - hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); + hash = hash_32(hash, tcp_metrics_hash_log); ret = -ESRCH; rcu_read_lock(); - for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; + for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm; tm = rcu_dereference(tm->tcpm_next)) { if (addr_same(&tm->tcpm_daddr, &daddr) && (!src || addr_same(&tm->tcpm_saddr, &saddr)) && @@ -1045,8 +1048,8 @@ static int tcp_metrics_nl_cmd_get(struct sk_buff *skb, struct genl_info *info) static void tcp_metrics_flush_all(struct net *net) { - unsigned int max_rows = 1U << net->ipv4.tcp_metrics_hash_log; - struct tcpm_hash_bucket *hb = net->ipv4.tcp_metrics_hash; + unsigned int max_rows = 1U << tcp_metrics_hash_log; + struct tcpm_hash_bucket *hb = tcp_metrics_hash; struct tcp_metrics_block *tm; unsigned int row; @@ -1090,8 +1093,8 @@ static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info) src = false; hash ^= net_hash_mix(net); - hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); - hb = net->ipv4.tcp_metrics_hash + hash; + hash = hash_32(hash, tcp_metrics_hash_log); + hb = tcp_metrics_hash + hash; pp = &hb->chain; spin_lock_bh(&tcp_metrics_lock); for (tm = deref_locked_genl(*pp); tm; tm = deref_locked_genl(*pp)) { @@ -1147,6 +1150,9 @@ static int __net_init tcp_net_metrics_init(struct net *net) size_t size; unsigned int slots; + if (!net_eq(net, &init_net)) + return 0; + slots = tcpmhash_entries; if (!slots) { if (totalram_pages >= 128 * 1024) @@ -1155,14 +1161,14 @@ static int __net_init tcp_net_metrics_init(struct net *net) slots = 8 * 1024; } - net->ipv4.tcp_metrics_hash_log = order_base_2(slots); - size = sizeof(struct tcpm_hash_bucket) << net->ipv4.tcp_metrics_hash_log; + tcp_metrics_hash_log = order_base_2(slots); + size = sizeof(struct tcpm_hash_bucket) << tcp_metrics_hash_log; - net->ipv4.tcp_metrics_hash = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); - if (!net->ipv4.tcp_metrics_hash) - net->ipv4.tcp_metrics_hash = vzalloc(size); + tcp_metrics_hash = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); + if (!tcp_metrics_hash) + tcp_metrics_hash = vzalloc(size); - if (!net->ipv4.tcp_metrics_hash) + if (!tcp_metrics_hash) return -ENOMEM; return 0; @@ -1170,19 +1176,7 @@ static int __net_init tcp_net_metrics_init(struct net *net) static void __net_exit tcp_net_metrics_exit(struct net *net) { - unsigned int i; - - for (i = 0; i < (1U << net->ipv4.tcp_metrics_hash_log) ; i++) { - struct tcp_metrics_block *tm, *next; - - tm = rcu_dereference_protected(net->ipv4.tcp_metrics_hash[i].chain, 1); - while (tm) { - next = rcu_dereference_protected(tm->tcpm_next, 1); - kfree(tm); - tm = next; - } - } - kvfree(net->ipv4.tcp_metrics_hash); + tcp_metrics_flush_all(net); } static __net_initdata struct pernet_operations tcp_net_metrics_ops = {