Merge branch 'tcp_metrics_netns_debloat'

Eric W. Biederman says:

====================
tcp_metrics: Network namespace bloat reduction v3

This is a small pile of patches that convert tcp_metrics from using a
hash table per network namespace to using a single hash table for all
network namespaces.

This is broken up into several patches so that each small step along
the way could be carefully scrutinized as I wrote it, and equally so
that each small step can be reviewed.

There are several cleanups included in this series: the addition of
panic calls during boot where we cannot handle failure (not trying to
recover simplifies the code), and the removal of the return code from
tcp_metrics_flush_all.

The motivation for this change is that the tcp_metrics hash table at
128KiB is one of the largest components of a freshly allocated network
namespace.

I am resending this because the previous version I sent has suffered
bitrot; I have respun the patches so that they apply.  I believe I have
addressed all of the review concerns except optimal behavior on little
machines with 32-byte cache lines, which is beyond me as even the
current code has bad behavior in that case.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
David S. Miller 2015-03-13 01:57:16 -04:00
commit 5e1459ca13
2 changed files with 73 additions and 66 deletions

View File

@ -54,8 +54,6 @@ struct netns_ipv4 {
struct sock *mc_autojoin_sk; struct sock *mc_autojoin_sk;
struct inet_peer_base *peers; struct inet_peer_base *peers;
struct tcpm_hash_bucket *tcp_metrics_hash;
unsigned int tcp_metrics_hash_log;
struct sock * __percpu *tcp_sk; struct sock * __percpu *tcp_sk;
struct netns_frags frags; struct netns_frags frags;
#ifdef CONFIG_NETFILTER #ifdef CONFIG_NETFILTER

View File

@ -40,6 +40,7 @@ struct tcp_fastopen_metrics {
struct tcp_metrics_block { struct tcp_metrics_block {
struct tcp_metrics_block __rcu *tcpm_next; struct tcp_metrics_block __rcu *tcpm_next;
possible_net_t tcpm_net;
struct inetpeer_addr tcpm_saddr; struct inetpeer_addr tcpm_saddr;
struct inetpeer_addr tcpm_daddr; struct inetpeer_addr tcpm_daddr;
unsigned long tcpm_stamp; unsigned long tcpm_stamp;
@ -52,6 +53,11 @@ struct tcp_metrics_block {
struct rcu_head rcu_head; struct rcu_head rcu_head;
}; };
/* Return the struct net recorded in this metrics entry's tcpm_net field. */
static inline struct net *tm_net(struct tcp_metrics_block *tm)
{
return read_pnet(&tm->tcpm_net);
}
static bool tcp_metric_locked(struct tcp_metrics_block *tm, static bool tcp_metric_locked(struct tcp_metrics_block *tm,
enum tcp_metric_index idx) enum tcp_metric_index idx)
{ {
@ -91,6 +97,9 @@ struct tcpm_hash_bucket {
struct tcp_metrics_block __rcu *chain; struct tcp_metrics_block __rcu *chain;
}; };
static struct tcpm_hash_bucket *tcp_metrics_hash __read_mostly;
static unsigned int tcp_metrics_hash_log __read_mostly;
static DEFINE_SPINLOCK(tcp_metrics_lock); static DEFINE_SPINLOCK(tcp_metrics_lock);
static void tcpm_suck_dst(struct tcp_metrics_block *tm, static void tcpm_suck_dst(struct tcp_metrics_block *tm,
@ -171,7 +180,7 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
if (unlikely(reclaim)) { if (unlikely(reclaim)) {
struct tcp_metrics_block *oldest; struct tcp_metrics_block *oldest;
oldest = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); oldest = rcu_dereference(tcp_metrics_hash[hash].chain);
for (tm = rcu_dereference(oldest->tcpm_next); tm; for (tm = rcu_dereference(oldest->tcpm_next); tm;
tm = rcu_dereference(tm->tcpm_next)) { tm = rcu_dereference(tm->tcpm_next)) {
if (time_before(tm->tcpm_stamp, oldest->tcpm_stamp)) if (time_before(tm->tcpm_stamp, oldest->tcpm_stamp))
@ -183,14 +192,15 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
if (!tm) if (!tm)
goto out_unlock; goto out_unlock;
} }
write_pnet(&tm->tcpm_net, net);
tm->tcpm_saddr = *saddr; tm->tcpm_saddr = *saddr;
tm->tcpm_daddr = *daddr; tm->tcpm_daddr = *daddr;
tcpm_suck_dst(tm, dst, true); tcpm_suck_dst(tm, dst, true);
if (likely(!reclaim)) { if (likely(!reclaim)) {
tm->tcpm_next = net->ipv4.tcp_metrics_hash[hash].chain; tm->tcpm_next = tcp_metrics_hash[hash].chain;
rcu_assign_pointer(net->ipv4.tcp_metrics_hash[hash].chain, tm); rcu_assign_pointer(tcp_metrics_hash[hash].chain, tm);
} }
out_unlock: out_unlock:
@ -214,10 +224,11 @@ static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *s
struct tcp_metrics_block *tm; struct tcp_metrics_block *tm;
int depth = 0; int depth = 0;
for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm;
tm = rcu_dereference(tm->tcpm_next)) { tm = rcu_dereference(tm->tcpm_next)) {
if (addr_same(&tm->tcpm_saddr, saddr) && if (addr_same(&tm->tcpm_saddr, saddr) &&
addr_same(&tm->tcpm_daddr, daddr)) addr_same(&tm->tcpm_daddr, daddr) &&
net_eq(tm_net(tm), net))
break; break;
depth++; depth++;
} }
@ -252,12 +263,14 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
} }
net = dev_net(dst->dev); net = dev_net(dst->dev);
hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); hash ^= net_hash_mix(net);
hash = hash_32(hash, tcp_metrics_hash_log);
for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm;
tm = rcu_dereference(tm->tcpm_next)) { tm = rcu_dereference(tm->tcpm_next)) {
if (addr_same(&tm->tcpm_saddr, &saddr) && if (addr_same(&tm->tcpm_saddr, &saddr) &&
addr_same(&tm->tcpm_daddr, &daddr)) addr_same(&tm->tcpm_daddr, &daddr) &&
net_eq(tm_net(tm), net))
break; break;
} }
tcpm_check_stamp(tm, dst); tcpm_check_stamp(tm, dst);
@ -299,12 +312,14 @@ static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock
return NULL; return NULL;
net = twsk_net(tw); net = twsk_net(tw);
hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); hash ^= net_hash_mix(net);
hash = hash_32(hash, tcp_metrics_hash_log);
for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm;
tm = rcu_dereference(tm->tcpm_next)) { tm = rcu_dereference(tm->tcpm_next)) {
if (addr_same(&tm->tcpm_saddr, &saddr) && if (addr_same(&tm->tcpm_saddr, &saddr) &&
addr_same(&tm->tcpm_daddr, &daddr)) addr_same(&tm->tcpm_daddr, &daddr) &&
net_eq(tm_net(tm), net))
break; break;
} }
return tm; return tm;
@ -347,7 +362,8 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
return NULL; return NULL;
net = dev_net(dst->dev); net = dev_net(dst->dev);
hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); hash ^= net_hash_mix(net);
hash = hash_32(hash, tcp_metrics_hash_log);
tm = __tcp_get_metrics(&saddr, &daddr, net, hash); tm = __tcp_get_metrics(&saddr, &daddr, net, hash);
if (tm == TCP_METRICS_RECLAIM_PTR) if (tm == TCP_METRICS_RECLAIM_PTR)
@ -898,17 +914,19 @@ static int tcp_metrics_nl_dump(struct sk_buff *skb,
struct netlink_callback *cb) struct netlink_callback *cb)
{ {
struct net *net = sock_net(skb->sk); struct net *net = sock_net(skb->sk);
unsigned int max_rows = 1U << net->ipv4.tcp_metrics_hash_log; unsigned int max_rows = 1U << tcp_metrics_hash_log;
unsigned int row, s_row = cb->args[0]; unsigned int row, s_row = cb->args[0];
int s_col = cb->args[1], col = s_col; int s_col = cb->args[1], col = s_col;
for (row = s_row; row < max_rows; row++, s_col = 0) { for (row = s_row; row < max_rows; row++, s_col = 0) {
struct tcp_metrics_block *tm; struct tcp_metrics_block *tm;
struct tcpm_hash_bucket *hb = net->ipv4.tcp_metrics_hash + row; struct tcpm_hash_bucket *hb = tcp_metrics_hash + row;
rcu_read_lock(); rcu_read_lock();
for (col = 0, tm = rcu_dereference(hb->chain); tm; for (col = 0, tm = rcu_dereference(hb->chain); tm;
tm = rcu_dereference(tm->tcpm_next), col++) { tm = rcu_dereference(tm->tcpm_next), col++) {
if (!net_eq(tm_net(tm), net))
continue;
if (col < s_col) if (col < s_col)
continue; continue;
if (tcp_metrics_dump_info(skb, cb, tm) < 0) { if (tcp_metrics_dump_info(skb, cb, tm) < 0) {
@ -994,13 +1012,15 @@ static int tcp_metrics_nl_cmd_get(struct sk_buff *skb, struct genl_info *info)
if (!reply) if (!reply)
goto nla_put_failure; goto nla_put_failure;
hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); hash ^= net_hash_mix(net);
hash = hash_32(hash, tcp_metrics_hash_log);
ret = -ESRCH; ret = -ESRCH;
rcu_read_lock(); rcu_read_lock();
for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm;
tm = rcu_dereference(tm->tcpm_next)) { tm = rcu_dereference(tm->tcpm_next)) {
if (addr_same(&tm->tcpm_daddr, &daddr) && if (addr_same(&tm->tcpm_daddr, &daddr) &&
(!src || addr_same(&tm->tcpm_saddr, &saddr))) { (!src || addr_same(&tm->tcpm_saddr, &saddr)) &&
net_eq(tm_net(tm), net)) {
ret = tcp_metrics_fill_info(msg, tm); ret = tcp_metrics_fill_info(msg, tm);
break; break;
} }
@ -1026,28 +1046,28 @@ static int tcp_metrics_nl_cmd_get(struct sk_buff *skb, struct genl_info *info)
#define deref_genl(p) rcu_dereference_protected(p, lockdep_genl_is_held()) #define deref_genl(p) rcu_dereference_protected(p, lockdep_genl_is_held())
static int tcp_metrics_flush_all(struct net *net) static void tcp_metrics_flush_all(struct net *net)
{ {
unsigned int max_rows = 1U << net->ipv4.tcp_metrics_hash_log; unsigned int max_rows = 1U << tcp_metrics_hash_log;
struct tcpm_hash_bucket *hb = net->ipv4.tcp_metrics_hash; struct tcpm_hash_bucket *hb = tcp_metrics_hash;
struct tcp_metrics_block *tm; struct tcp_metrics_block *tm;
unsigned int row; unsigned int row;
for (row = 0; row < max_rows; row++, hb++) { for (row = 0; row < max_rows; row++, hb++) {
struct tcp_metrics_block __rcu **pp;
spin_lock_bh(&tcp_metrics_lock); spin_lock_bh(&tcp_metrics_lock);
tm = deref_locked_genl(hb->chain); pp = &hb->chain;
if (tm) for (tm = deref_locked_genl(*pp); tm;
hb->chain = NULL; tm = deref_locked_genl(*pp)) {
spin_unlock_bh(&tcp_metrics_lock); if (net_eq(tm_net(tm), net)) {
while (tm) { *pp = tm->tcpm_next;
struct tcp_metrics_block *next; kfree_rcu(tm, rcu_head);
} else {
next = deref_genl(tm->tcpm_next); pp = &tm->tcpm_next;
kfree_rcu(tm, rcu_head); }
tm = next;
} }
spin_unlock_bh(&tcp_metrics_lock);
} }
return 0;
} }
static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info) static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info)
@ -1064,19 +1084,23 @@ static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info)
ret = parse_nl_addr(info, &daddr, &hash, 1); ret = parse_nl_addr(info, &daddr, &hash, 1);
if (ret < 0) if (ret < 0)
return ret; return ret;
if (ret > 0) if (ret > 0) {
return tcp_metrics_flush_all(net); tcp_metrics_flush_all(net);
return 0;
}
ret = parse_nl_saddr(info, &saddr); ret = parse_nl_saddr(info, &saddr);
if (ret < 0) if (ret < 0)
src = false; src = false;
hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); hash ^= net_hash_mix(net);
hb = net->ipv4.tcp_metrics_hash + hash; hash = hash_32(hash, tcp_metrics_hash_log);
hb = tcp_metrics_hash + hash;
pp = &hb->chain; pp = &hb->chain;
spin_lock_bh(&tcp_metrics_lock); spin_lock_bh(&tcp_metrics_lock);
for (tm = deref_locked_genl(*pp); tm; tm = deref_locked_genl(*pp)) { for (tm = deref_locked_genl(*pp); tm; tm = deref_locked_genl(*pp)) {
if (addr_same(&tm->tcpm_daddr, &daddr) && if (addr_same(&tm->tcpm_daddr, &daddr) &&
(!src || addr_same(&tm->tcpm_saddr, &saddr))) { (!src || addr_same(&tm->tcpm_saddr, &saddr)) &&
net_eq(tm_net(tm), net)) {
*pp = tm->tcpm_next; *pp = tm->tcpm_next;
kfree_rcu(tm, rcu_head); kfree_rcu(tm, rcu_head);
found = true; found = true;
@ -1126,6 +1150,9 @@ static int __net_init tcp_net_metrics_init(struct net *net)
size_t size; size_t size;
unsigned int slots; unsigned int slots;
if (!net_eq(net, &init_net))
return 0;
slots = tcpmhash_entries; slots = tcpmhash_entries;
if (!slots) { if (!slots) {
if (totalram_pages >= 128 * 1024) if (totalram_pages >= 128 * 1024)
@ -1134,14 +1161,14 @@ static int __net_init tcp_net_metrics_init(struct net *net)
slots = 8 * 1024; slots = 8 * 1024;
} }
net->ipv4.tcp_metrics_hash_log = order_base_2(slots); tcp_metrics_hash_log = order_base_2(slots);
size = sizeof(struct tcpm_hash_bucket) << net->ipv4.tcp_metrics_hash_log; size = sizeof(struct tcpm_hash_bucket) << tcp_metrics_hash_log;
net->ipv4.tcp_metrics_hash = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); tcp_metrics_hash = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
if (!net->ipv4.tcp_metrics_hash) if (!tcp_metrics_hash)
net->ipv4.tcp_metrics_hash = vzalloc(size); tcp_metrics_hash = vzalloc(size);
if (!net->ipv4.tcp_metrics_hash) if (!tcp_metrics_hash)
return -ENOMEM; return -ENOMEM;
return 0; return 0;
@ -1149,19 +1176,7 @@ static int __net_init tcp_net_metrics_init(struct net *net)
static void __net_exit tcp_net_metrics_exit(struct net *net) static void __net_exit tcp_net_metrics_exit(struct net *net)
{ {
unsigned int i; tcp_metrics_flush_all(net);
for (i = 0; i < (1U << net->ipv4.tcp_metrics_hash_log) ; i++) {
struct tcp_metrics_block *tm, *next;
tm = rcu_dereference_protected(net->ipv4.tcp_metrics_hash[i].chain, 1);
while (tm) {
next = rcu_dereference_protected(tm->tcpm_next, 1);
kfree(tm);
tm = next;
}
}
kvfree(net->ipv4.tcp_metrics_hash);
} }
static __net_initdata struct pernet_operations tcp_net_metrics_ops = { static __net_initdata struct pernet_operations tcp_net_metrics_ops = {
@ -1175,16 +1190,10 @@ void __init tcp_metrics_init(void)
ret = register_pernet_subsys(&tcp_net_metrics_ops); ret = register_pernet_subsys(&tcp_net_metrics_ops);
if (ret < 0) if (ret < 0)
goto cleanup; panic("Could not allocate the tcp_metrics hash table\n");
ret = genl_register_family_with_ops(&tcp_metrics_nl_family, ret = genl_register_family_with_ops(&tcp_metrics_nl_family,
tcp_metrics_nl_ops); tcp_metrics_nl_ops);
if (ret < 0) if (ret < 0)
goto cleanup_subsys; panic("Could not register tcp_metrics generic netlink\n");
return;
cleanup_subsys:
unregister_pernet_subsys(&tcp_net_metrics_ops);
cleanup:
return;
} }