2019-05-27 14:55:01 +08:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-or-later
|
2011-11-22 13:10:51 +08:00
|
|
|
/*
 * net/core/netprio_cgroup.c	Priority Control Group
 *
 * Authors:	Neil Horman <nhorman@tuxdriver.com>
 */
|
|
|
|
|
2012-05-17 03:58:40 +08:00
|
|
|
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
|
|
|
|
2017-03-29 05:45:06 +08:00
|
|
|
#include <linux/module.h>
|
2011-11-22 13:10:51 +08:00
|
|
|
#include <linux/slab.h>
|
|
|
|
#include <linux/types.h>
|
|
|
|
#include <linux/string.h>
|
|
|
|
#include <linux/errno.h>
|
|
|
|
#include <linux/skbuff.h>
|
|
|
|
#include <linux/cgroup.h>
|
|
|
|
#include <linux/rcupdate.h>
|
|
|
|
#include <linux/atomic.h>
|
2017-02-06 17:57:33 +08:00
|
|
|
#include <linux/sched/task.h>
|
|
|
|
|
2011-11-22 13:10:51 +08:00
|
|
|
#include <net/rtnetlink.h>
|
|
|
|
#include <net/pkt_cls.h>
|
|
|
|
#include <net/sock.h>
|
|
|
|
#include <net/netprio_cgroup.h>
|
|
|
|
|
2012-07-20 18:39:25 +08:00
|
|
|
#include <linux/fdtable.h>
|
|
|
|
|
2015-12-08 06:38:51 +08:00
|
|
|
/*
 * Every net_device carries its own priomap array and netprio looks
 * entries up in it by ID.  Capping the css ID at 16 bits loses nothing
 * in practice; cgrp_css_online() rejects larger IDs with -ENOSPC.
 */
#define NETPRIO_ID_MAX		USHRT_MAX

/*
 * Size in bytes of the smallest priomap allocation.
 * extend_netdev_table() starts here and doubles until the map is big enough.
 */
#define PRIOMAP_MIN_SZ		128
|
2011-11-22 13:10:51 +08:00
|
|
|
|
2012-11-22 23:32:46 +08:00
|
|
|
/*
|
2013-12-09 04:15:44 +08:00
|
|
|
* Extend @dev->priomap so that it's large enough to accommodate
|
2012-11-22 23:32:46 +08:00
|
|
|
* @target_idx. @dev->priomap.priomap_len > @target_idx after successful
|
|
|
|
* return. Must be called under rtnl lock.
|
|
|
|
*/
|
|
|
|
static int extend_netdev_table(struct net_device *dev, u32 target_idx)
|
2011-11-22 13:10:51 +08:00
|
|
|
{
|
2012-11-22 23:32:46 +08:00
|
|
|
struct netprio_map *old, *new;
|
|
|
|
size_t new_sz, new_len;
|
2011-11-22 13:10:51 +08:00
|
|
|
|
2012-11-22 23:32:46 +08:00
|
|
|
/* is the existing priomap large enough? */
|
2012-11-22 23:32:46 +08:00
|
|
|
old = rtnl_dereference(dev->priomap);
|
2012-11-22 23:32:46 +08:00
|
|
|
if (old && old->priomap_len > target_idx)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Determine the new size. Let's keep it power-of-two. We start
|
|
|
|
* from PRIOMAP_MIN_SZ and double it until it's large enough to
|
|
|
|
* accommodate @target_idx.
|
|
|
|
*/
|
|
|
|
new_sz = PRIOMAP_MIN_SZ;
|
|
|
|
while (true) {
|
|
|
|
new_len = (new_sz - offsetof(struct netprio_map, priomap)) /
|
|
|
|
sizeof(new->priomap[0]);
|
|
|
|
if (new_len > target_idx)
|
|
|
|
break;
|
|
|
|
new_sz *= 2;
|
|
|
|
/* overflowed? */
|
|
|
|
if (WARN_ON(new_sz < PRIOMAP_MIN_SZ))
|
|
|
|
return -ENOSPC;
|
|
|
|
}
|
2011-11-22 13:10:51 +08:00
|
|
|
|
2012-11-22 23:32:46 +08:00
|
|
|
/* allocate & copy */
|
|
|
|
new = kzalloc(new_sz, GFP_KERNEL);
|
2013-02-05 00:48:16 +08:00
|
|
|
if (!new)
|
net: cgroup: fix access the unallocated memory in netprio cgroup
there are some out of bound accesses in netprio cgroup.
now before accessing the dev->priomap.priomap array,we only check
if the dev->priomap exist.and because we don't want to see
additional bound checkings in fast path, so we should make sure
that dev->priomap is null or array size of dev->priomap.priomap
is equal to max_prioidx + 1;
so in write_priomap logic,we should call extend_netdev_table when
dev->priomap is null and dev->priomap.priomap_len < max_len.
and in cgrp_create->update_netdev_tables logic,we should call
extend_netdev_table only when dev->priomap exist and
dev->priomap.priomap_len < max_len.
and it's not needed to call update_netdev_tables in write_priomap,
we can only allocate the net device's priomap which we change through
net_prio.ifpriomap.
this patch also add a return value for update_netdev_tables &
extend_netdev_table, so when new_priomap is allocated failed,
write_priomap will stop to access the priomap,and return -ENOMEM
back to the userspace to tell the user what happend.
Change From v3:
1. add rtnl protect when reading max_prioidx in write_priomap.
2. only call extend_netdev_table when map->priomap_len < max_len,
this will make sure array size of dev->map->priomap always
bigger than any prioidx.
3. add a function write_update_netdev_table to make codes clear.
Change From v2:
1. protect extend_netdev_table by RTNL.
2. when extend_netdev_table failed,call dev_put to reduce device's refcount.
Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
Cc: Neil Horman <nhorman@tuxdriver.com>
Cc: Eric Dumazet <edumazet@google.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-07-12 05:50:15 +08:00
|
|
|
return -ENOMEM;
|
2011-11-22 13:10:51 +08:00
|
|
|
|
2012-11-22 23:32:46 +08:00
|
|
|
if (old)
|
|
|
|
memcpy(new->priomap, old->priomap,
|
|
|
|
old->priomap_len * sizeof(old->priomap[0]));
|
2011-11-22 13:10:51 +08:00
|
|
|
|
2012-11-22 23:32:46 +08:00
|
|
|
new->priomap_len = new_len;
|
2011-11-22 13:10:51 +08:00
|
|
|
|
2012-11-22 23:32:46 +08:00
|
|
|
/* install the new priomap */
|
2012-11-22 23:32:46 +08:00
|
|
|
rcu_assign_pointer(dev->priomap, new);
|
|
|
|
if (old)
|
|
|
|
kfree_rcu(old, rcu);
|
net: cgroup: fix access the unallocated memory in netprio cgroup
there are some out of bound accesses in netprio cgroup.
now before accessing the dev->priomap.priomap array,we only check
if the dev->priomap exist.and because we don't want to see
additional bound checkings in fast path, so we should make sure
that dev->priomap is null or array size of dev->priomap.priomap
is equal to max_prioidx + 1;
so in write_priomap logic,we should call extend_netdev_table when
dev->priomap is null and dev->priomap.priomap_len < max_len.
and in cgrp_create->update_netdev_tables logic,we should call
extend_netdev_table only when dev->priomap exist and
dev->priomap.priomap_len < max_len.
and it's not needed to call update_netdev_tables in write_priomap,
we can only allocate the net device's priomap which we change through
net_prio.ifpriomap.
this patch also add a return value for update_netdev_tables &
extend_netdev_table, so when new_priomap is allocated failed,
write_priomap will stop to access the priomap,and return -ENOMEM
back to the userspace to tell the user what happend.
Change From v3:
1. add rtnl protect when reading max_prioidx in write_priomap.
2. only call extend_netdev_table when map->priomap_len < max_len,
this will make sure array size of dev->map->priomap always
bigger than any prioidx.
3. add a function write_update_netdev_table to make codes clear.
Change From v2:
1. protect extend_netdev_table by RTNL.
2. when extend_netdev_table failed,call dev_put to reduce device's refcount.
Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
Cc: Neil Horman <nhorman@tuxdriver.com>
Cc: Eric Dumazet <edumazet@google.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-07-12 05:50:15 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2012-11-22 23:32:47 +08:00
|
|
|
/**
|
|
|
|
* netprio_prio - return the effective netprio of a cgroup-net_device pair
|
2013-08-09 08:11:22 +08:00
|
|
|
* @css: css part of the target pair
|
2012-11-22 23:32:47 +08:00
|
|
|
* @dev: net_device part of the target pair
|
|
|
|
*
|
|
|
|
* Should be called under RCU read or rtnl lock.
|
|
|
|
*/
|
2013-08-09 08:11:22 +08:00
|
|
|
static u32 netprio_prio(struct cgroup_subsys_state *css, struct net_device *dev)
|
2012-11-22 23:32:47 +08:00
|
|
|
{
|
|
|
|
struct netprio_map *map = rcu_dereference_rtnl(dev->priomap);
|
2013-08-09 08:11:22 +08:00
|
|
|
int id = css->cgroup->id;
|
2012-11-22 23:32:47 +08:00
|
|
|
|
2013-08-09 08:11:22 +08:00
|
|
|
if (map && id < map->priomap_len)
|
|
|
|
return map->priomap[id];
|
2012-11-22 23:32:47 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* netprio_set_prio - set netprio on a cgroup-net_device pair
|
2013-08-09 08:11:22 +08:00
|
|
|
* @css: css part of the target pair
|
2012-11-22 23:32:47 +08:00
|
|
|
* @dev: net_device part of the target pair
|
|
|
|
* @prio: prio to set
|
|
|
|
*
|
2013-08-09 08:11:22 +08:00
|
|
|
* Set netprio to @prio on @css-@dev pair. Should be called under rtnl
|
2012-11-22 23:32:47 +08:00
|
|
|
* lock and may fail under memory pressure for non-zero @prio.
|
|
|
|
*/
|
2013-08-09 08:11:22 +08:00
|
|
|
static int netprio_set_prio(struct cgroup_subsys_state *css,
|
|
|
|
struct net_device *dev, u32 prio)
|
2012-11-22 23:32:47 +08:00
|
|
|
{
|
|
|
|
struct netprio_map *map;
|
2013-08-09 08:11:22 +08:00
|
|
|
int id = css->cgroup->id;
|
2012-11-22 23:32:47 +08:00
|
|
|
int ret;
|
|
|
|
|
|
|
|
/* avoid extending priomap for zero writes */
|
|
|
|
map = rtnl_dereference(dev->priomap);
|
2013-08-09 08:11:22 +08:00
|
|
|
if (!prio && (!map || map->priomap_len <= id))
|
2012-11-22 23:32:47 +08:00
|
|
|
return 0;
|
|
|
|
|
2013-08-09 08:11:22 +08:00
|
|
|
ret = extend_netdev_table(dev, id);
|
2012-11-22 23:32:47 +08:00
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
map = rtnl_dereference(dev->priomap);
|
2013-08-09 08:11:22 +08:00
|
|
|
map->priomap[id] = prio;
|
2012-11-22 23:32:47 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2013-08-09 08:11:23 +08:00
|
|
|
static struct cgroup_subsys_state *
|
|
|
|
cgrp_css_alloc(struct cgroup_subsys_state *parent_css)
|
2011-11-22 13:10:51 +08:00
|
|
|
{
|
2013-08-09 08:11:22 +08:00
|
|
|
struct cgroup_subsys_state *css;
|
2012-11-22 23:32:47 +08:00
|
|
|
|
2013-08-09 08:11:22 +08:00
|
|
|
css = kzalloc(sizeof(*css), GFP_KERNEL);
|
|
|
|
if (!css)
|
2011-11-22 13:10:51 +08:00
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
|
2013-08-09 08:11:22 +08:00
|
|
|
return css;
|
2011-11-22 13:10:51 +08:00
|
|
|
}
|
|
|
|
|
2013-08-09 08:11:23 +08:00
|
|
|
static int cgrp_css_online(struct cgroup_subsys_state *css)
|
2011-11-22 13:10:51 +08:00
|
|
|
{
|
2014-05-17 01:22:48 +08:00
|
|
|
struct cgroup_subsys_state *parent_css = css->parent;
|
2011-11-22 13:10:51 +08:00
|
|
|
struct net_device *dev;
|
2012-11-22 23:32:47 +08:00
|
|
|
int ret = 0;
|
|
|
|
|
2015-12-08 06:38:51 +08:00
|
|
|
if (css->id > NETPRIO_ID_MAX)
|
|
|
|
return -ENOSPC;
|
|
|
|
|
2013-08-09 08:11:23 +08:00
|
|
|
if (!parent_css)
|
2012-11-22 23:32:47 +08:00
|
|
|
return 0;
|
2011-11-22 13:10:51 +08:00
|
|
|
|
|
|
|
rtnl_lock();
|
2012-11-22 23:32:47 +08:00
|
|
|
/*
|
|
|
|
* Inherit prios from the parent. As all prios are set during
|
|
|
|
* onlining, there is no need to clear them on offline.
|
|
|
|
*/
|
|
|
|
for_each_netdev(&init_net, dev) {
|
2013-08-09 08:11:22 +08:00
|
|
|
u32 prio = netprio_prio(parent_css, dev);
|
2012-11-22 23:32:47 +08:00
|
|
|
|
2013-08-09 08:11:22 +08:00
|
|
|
ret = netprio_set_prio(css, dev, prio);
|
2012-11-22 23:32:47 +08:00
|
|
|
if (ret)
|
|
|
|
break;
|
|
|
|
}
|
2011-11-22 13:10:51 +08:00
|
|
|
rtnl_unlock();
|
2012-11-22 23:32:47 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2013-08-09 08:11:23 +08:00
|
|
|
/* Release the css allocated by cgrp_css_alloc(). */
static void cgrp_css_free(struct cgroup_subsys_state *css)
{
	kfree(css);
}
|
|
|
|
|
2013-08-09 08:11:24 +08:00
|
|
|
static u64 read_prioidx(struct cgroup_subsys_state *css, struct cftype *cft)
|
2011-11-22 13:10:51 +08:00
|
|
|
{
|
2013-08-09 08:11:24 +08:00
|
|
|
return css->cgroup->id;
|
2011-11-22 13:10:51 +08:00
|
|
|
}
|
|
|
|
|
2013-12-06 01:28:04 +08:00
|
|
|
static int read_priomap(struct seq_file *sf, void *v)
|
2011-11-22 13:10:51 +08:00
|
|
|
{
|
|
|
|
struct net_device *dev;
|
|
|
|
|
|
|
|
rcu_read_lock();
|
2012-11-22 23:32:47 +08:00
|
|
|
for_each_netdev_rcu(&init_net, dev)
|
2013-12-06 01:28:04 +08:00
|
|
|
seq_printf(sf, "%s %u\n", dev->name,
|
|
|
|
netprio_prio(seq_css(sf), dev));
|
2011-11-22 13:10:51 +08:00
|
|
|
rcu_read_unlock();
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2014-05-14 00:16:21 +08:00
|
|
|
/*
 * "ifpriomap" write handler.
 *
 * Parses "<ifname> <prio>" from @buf, looks the device up in init_net and
 * stores the priority for the cgroup the file belongs to.  @off is unused.
 *
 * Returns @nbytes on success, -EINVAL if the input does not parse as two
 * fields, -ENODEV if no such device exists, or the error reported by
 * netprio_set_prio().
 */
static ssize_t write_priomap(struct kernfs_open_file *of,
			     char *buf, size_t nbytes, loff_t off)
{
	char ifname[IFNAMSIZ + 1];
	struct net_device *netdev;
	u32 new_prio;
	int parsed;
	int err;

	parsed = sscanf(buf, "%"__stringify(IFNAMSIZ)"s %u", ifname, &new_prio);
	if (parsed != 2)
		return -EINVAL;

	/* Holds a device reference; dropped by dev_put() below. */
	netdev = dev_get_by_name(&init_net, ifname);
	if (!netdev)
		return -ENODEV;

	/*
	 * NOTE(review): presumably flips sock cgroup data into
	 * prioidx/classid mode now that net_prio is in use - confirm
	 * against cgroup_sk_alloc_disable().
	 */
	cgroup_sk_alloc_disable();

	rtnl_lock();
	err = netprio_set_prio(of_css(of), netdev, new_prio);
	rtnl_unlock();

	dev_put(netdev);

	if (err)
		return err;
	return nbytes;
}
|
|
|
|
|
2012-08-22 10:32:06 +08:00
|
|
|
static int update_netprio(const void *v, struct file *file, unsigned n)
|
|
|
|
{
|
|
|
|
int err;
|
|
|
|
struct socket *sock = sock_from_file(file, &err);
|
sock, cgroup: add sock->sk_cgroup
In cgroup v1, dealing with cgroup membership was difficult because the
number of membership associations was unbound. As a result, cgroup v1
grew several controllers whose primary purpose is either tagging
membership or pull in configuration knobs from other subsystems so
that cgroup membership test can be avoided.
net_cls and net_prio controllers are examples of the latter. They
allow configuring network-specific attributes from cgroup side so that
network subsystem can avoid testing cgroup membership; unfortunately,
these are not only cumbersome but also problematic.
Both net_cls and net_prio aren't properly hierarchical. Both inherit
configuration from the parent on creation but there's no interaction
afterwards. An ancestor doesn't restrict the behavior in its subtree
in anyway and configuration changes aren't propagated downwards.
Especially when combined with cgroup delegation, this is problematic
because delegatees can mess up whatever network configuration
implemented at the system level. net_prio would allow the delegatees
to set whatever priority value regardless of CAP_NET_ADMIN and net_cls
the same for classid.
While it is possible to solve these issues from controller side by
implementing hierarchical allowable ranges in both controllers, it
would involve quite a bit of complexity in the controllers and further
obfuscate network configuration as it becomes even more difficult to
tell what's actually being configured looking from the network side.
While not much can be done for v1 at this point, as membership
handling is sane on cgroup v2, it'd be better to make cgroup matching
behave like other network matches and classifiers than introducing
further complications.
In preparation, this patch updates sock->sk_cgrp_data handling so that
it points to the v2 cgroup that sock was created in until either
net_prio or net_cls is used. Once either of the two is used,
sock->sk_cgrp_data reverts to its previous role of carrying prioidx
and classid. This is to avoid adding yet another cgroup related field
to struct sock.
As the mode switching can happen at most once per boot, the switching
mechanism is aimed at lowering hot path overhead. It may leak a
finite, likely small, number of cgroup refs and report spurious
prioidx or classid on switching; however, dynamic updates of prioidx
and classid have always been racy and lossy - socks between creation
and fd installation are never updated, config changes don't update
existing sockets at all, and prioidx may index with dead and recycled
cgroup IDs. Non-critical inaccuracies from small race windows won't
make any noticeable difference.
This patch doesn't make use of the pointer yet. The following patch
will implement netfilter match for cgroup2 membership.
v2: Use sock_cgroup_data to avoid inflating struct sock w/ another
cgroup specific field.
v3: Add comments explaining why sock_data_prioidx() and
sock_data_classid() use different fallback values.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Daniel Wagner <daniel.wagner@bmw-carit.de>
CC: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-12-08 06:38:53 +08:00
|
|
|
if (sock) {
|
|
|
|
spin_lock(&cgroup_sk_update_lock);
|
2015-12-08 06:38:52 +08:00
|
|
|
sock_cgroup_set_prioidx(&sock->sk->sk_cgrp_data,
|
|
|
|
(unsigned long)v);
|
sock, cgroup: add sock->sk_cgroup
In cgroup v1, dealing with cgroup membership was difficult because the
number of membership associations was unbound. As a result, cgroup v1
grew several controllers whose primary purpose is either tagging
membership or pull in configuration knobs from other subsystems so
that cgroup membership test can be avoided.
net_cls and net_prio controllers are examples of the latter. They
allow configuring network-specific attributes from cgroup side so that
network subsystem can avoid testing cgroup membership; unfortunately,
these are not only cumbersome but also problematic.
Both net_cls and net_prio aren't properly hierarchical. Both inherit
configuration from the parent on creation but there's no interaction
afterwards. An ancestor doesn't restrict the behavior in its subtree
in anyway and configuration changes aren't propagated downwards.
Especially when combined with cgroup delegation, this is problematic
because delegatees can mess up whatever network configuration
implemented at the system level. net_prio would allow the delegatees
to set whatever priority value regardless of CAP_NET_ADMIN and net_cls
the same for classid.
While it is possible to solve these issues from controller side by
implementing hierarchical allowable ranges in both controllers, it
would involve quite a bit of complexity in the controllers and further
obfuscate network configuration as it becomes even more difficult to
tell what's actually being configured looking from the network side.
While not much can be done for v1 at this point, as membership
handling is sane on cgroup v2, it'd be better to make cgroup matching
behave like other network matches and classifiers than introducing
further complications.
In preparation, this patch updates sock->sk_cgrp_data handling so that
it points to the v2 cgroup that sock was created in until either
net_prio or net_cls is used. Once either of the two is used,
sock->sk_cgrp_data reverts to its previous role of carrying prioidx
and classid. This is to avoid adding yet another cgroup related field
to struct sock.
As the mode switching can happen at most once per boot, the switching
mechanism is aimed at lowering hot path overhead. It may leak a
finite, likely small, number of cgroup refs and report spurious
prioidx or classid on switching; however, dynamic updates of prioidx
and classid have always been racy and lossy - socks between creation
and fd installation are never updated, config changes don't update
existing sockets at all, and prioidx may index with dead and recycled
cgroup IDs. Non-critical inaccuracies from small race windows won't
make any noticeable difference.
This patch doesn't make use of the pointer yet. The following patch
will implement netfilter match for cgroup2 membership.
v2: Use sock_cgroup_data to avoid inflating struct sock w/ another
cgroup specific field.
v3: Add comments explaining why sock_data_prioidx() and
sock_data_classid() use different fallback values.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Daniel Wagner <daniel.wagner@bmw-carit.de>
CC: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-12-08 06:38:53 +08:00
|
|
|
spin_unlock(&cgroup_sk_update_lock);
|
|
|
|
}
|
2012-08-22 10:32:06 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
cgroup: fix handling of multi-destination migration from subtree_control enabling
Consider the following v2 hierarchy.
P0 (+memory) --- P1 (-memory) --- A
\- B
P0 has memory enabled in its subtree_control while P1 doesn't. If
both A and B contain processes, they would belong to the memory css of
P1. Now if memory is enabled on P1's subtree_control, memory csses
should be created on both A and B and A's processes should be moved to
the former and B's processes the latter. IOW, enabling controllers
can cause atomic migrations into different csses.
The core cgroup migration logic has been updated accordingly but the
controller migration methods haven't and still assume that all tasks
migrate to a single target css; furthermore, the methods were fed the
css in which subtree_control was updated which is the parent of the
target csses. pids controller depends on the migration methods to
move charges and this made the controller attribute charges to the
wrong csses often triggering the following warning by driving a
counter negative.
WARNING: CPU: 1 PID: 1 at kernel/cgroup_pids.c:97 pids_cancel.constprop.6+0x31/0x40()
Modules linked in:
CPU: 1 PID: 1 Comm: systemd Not tainted 4.4.0-rc1+ #29
...
ffffffff81f65382 ffff88007c043b90 ffffffff81551ffc 0000000000000000
ffff88007c043bc8 ffffffff810de202 ffff88007a752000 ffff88007a29ab00
ffff88007c043c80 ffff88007a1d8400 0000000000000001 ffff88007c043bd8
Call Trace:
[<ffffffff81551ffc>] dump_stack+0x4e/0x82
[<ffffffff810de202>] warn_slowpath_common+0x82/0xc0
[<ffffffff810de2fa>] warn_slowpath_null+0x1a/0x20
[<ffffffff8118e031>] pids_cancel.constprop.6+0x31/0x40
[<ffffffff8118e0fd>] pids_can_attach+0x6d/0xf0
[<ffffffff81188a4c>] cgroup_taskset_migrate+0x6c/0x330
[<ffffffff81188e05>] cgroup_migrate+0xf5/0x190
[<ffffffff81189016>] cgroup_attach_task+0x176/0x200
[<ffffffff8118949d>] __cgroup_procs_write+0x2ad/0x460
[<ffffffff81189684>] cgroup_procs_write+0x14/0x20
[<ffffffff811854e5>] cgroup_file_write+0x35/0x1c0
[<ffffffff812e26f1>] kernfs_fop_write+0x141/0x190
[<ffffffff81265f88>] __vfs_write+0x28/0xe0
[<ffffffff812666fc>] vfs_write+0xac/0x1a0
[<ffffffff81267019>] SyS_write+0x49/0xb0
[<ffffffff81bcef32>] entry_SYSCALL_64_fastpath+0x12/0x76
This patch fixes the bug by removing @css parameter from the three
migration methods, ->can_attach, ->cancel_attach() and ->attach() and
updating cgroup_taskset iteration helpers also return the destination
css in addition to the task being migrated. All controllers are
updated accordingly.
* Controllers which don't care whether there are one or multiple
target csses can be converted trivially. cpu, io, freezer, perf,
netclassid and netprio fall in this category.
* cpuset's current implementation assumes that there's single source
and destination and thus doesn't support v2 hierarchy already. The
only change made by this patchset is how that single destination css
is obtained.
* memory migration path already doesn't do anything on v2. How the
single destination css is obtained is updated and the prep stage of
mem_cgroup_can_attach() is reordered to accomodate the change.
* pids is the only controller which was affected by this bug. It now
correctly handles multi-destination migrations and no longer causes
counter underflow from incorrect accounting.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-tested-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
Cc: Aleksa Sarai <cyphar@cyphar.com>
2015-12-03 23:18:21 +08:00
|
|
|
static void net_prio_attach(struct cgroup_taskset *tset)
|
2012-07-20 18:39:25 +08:00
|
|
|
{
|
|
|
|
struct task_struct *p;
|
cgroup: fix handling of multi-destination migration from subtree_control enabling
Consider the following v2 hierarchy.
P0 (+memory) --- P1 (-memory) --- A
\- B
P0 has memory enabled in its subtree_control while P1 doesn't. If
both A and B contain processes, they would belong to the memory css of
P1. Now if memory is enabled on P1's subtree_control, memory csses
should be created on both A and B and A's processes should be moved to
the former and B's processes the latter. IOW, enabling controllers
can cause atomic migrations into different csses.
The core cgroup migration logic has been updated accordingly but the
controller migration methods haven't and still assume that all tasks
migrate to a single target css; furthermore, the methods were fed the
css in which subtree_control was updated which is the parent of the
target csses. pids controller depends on the migration methods to
move charges and this made the controller attribute charges to the
wrong csses often triggering the following warning by driving a
counter negative.
WARNING: CPU: 1 PID: 1 at kernel/cgroup_pids.c:97 pids_cancel.constprop.6+0x31/0x40()
Modules linked in:
CPU: 1 PID: 1 Comm: systemd Not tainted 4.4.0-rc1+ #29
...
ffffffff81f65382 ffff88007c043b90 ffffffff81551ffc 0000000000000000
ffff88007c043bc8 ffffffff810de202 ffff88007a752000 ffff88007a29ab00
ffff88007c043c80 ffff88007a1d8400 0000000000000001 ffff88007c043bd8
Call Trace:
[<ffffffff81551ffc>] dump_stack+0x4e/0x82
[<ffffffff810de202>] warn_slowpath_common+0x82/0xc0
[<ffffffff810de2fa>] warn_slowpath_null+0x1a/0x20
[<ffffffff8118e031>] pids_cancel.constprop.6+0x31/0x40
[<ffffffff8118e0fd>] pids_can_attach+0x6d/0xf0
[<ffffffff81188a4c>] cgroup_taskset_migrate+0x6c/0x330
[<ffffffff81188e05>] cgroup_migrate+0xf5/0x190
[<ffffffff81189016>] cgroup_attach_task+0x176/0x200
[<ffffffff8118949d>] __cgroup_procs_write+0x2ad/0x460
[<ffffffff81189684>] cgroup_procs_write+0x14/0x20
[<ffffffff811854e5>] cgroup_file_write+0x35/0x1c0
[<ffffffff812e26f1>] kernfs_fop_write+0x141/0x190
[<ffffffff81265f88>] __vfs_write+0x28/0xe0
[<ffffffff812666fc>] vfs_write+0xac/0x1a0
[<ffffffff81267019>] SyS_write+0x49/0xb0
[<ffffffff81bcef32>] entry_SYSCALL_64_fastpath+0x12/0x76
This patch fixes the bug by removing @css parameter from the three
migration methods, ->can_attach, ->cancel_attach() and ->attach() and
updating cgroup_taskset iteration helpers also return the destination
css in addition to the task being migrated. All controllers are
updated accordingly.
* Controllers which don't care whether there are one or multiple
target csses can be converted trivially. cpu, io, freezer, perf,
netclassid and netprio fall in this category.
* cpuset's current implementation assumes that there's single source
and destination and thus doesn't support v2 hierarchy already. The
only change made by this patchset is how that single destination css
is obtained.
* memory migration path already doesn't do anything on v2. How the
single destination css is obtained is updated and the prep stage of
mem_cgroup_can_attach() is reordered to accomodate the change.
* pids is the only controller which was affected by this bug. It now
correctly handles multi-destination migrations and no longer causes
counter underflow from incorrect accounting.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-tested-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
Cc: Aleksa Sarai <cyphar@cyphar.com>
2015-12-03 23:18:21 +08:00
|
|
|
struct cgroup_subsys_state *css;
|
|
|
|
|
|
|
|
cgroup_taskset_for_each(p, css, tset) {
|
|
|
|
void *v = (void *)(unsigned long)css->cgroup->id;
|
2012-07-20 18:39:25 +08:00
|
|
|
|
|
|
|
task_lock(p);
|
2012-08-22 10:32:06 +08:00
|
|
|
iterate_fd(p->files, 0, update_netprio, v);
|
2012-07-20 18:39:25 +08:00
|
|
|
task_unlock(p);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-11-22 13:10:51 +08:00
|
|
|
static struct cftype ss_files[] = {
|
|
|
|
{
|
|
|
|
.name = "prioidx",
|
|
|
|
.read_u64 = read_prioidx,
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.name = "ifpriomap",
|
2013-12-06 01:28:04 +08:00
|
|
|
.seq_show = read_priomap,
|
2014-05-14 00:16:21 +08:00
|
|
|
.write = write_priomap,
|
2011-11-22 13:10:51 +08:00
|
|
|
},
|
2012-04-02 03:09:55 +08:00
|
|
|
{ } /* terminate */
|
2011-11-22 13:10:51 +08:00
|
|
|
};
|
|
|
|
|
2014-02-08 23:36:58 +08:00
|
|
|
struct cgroup_subsys net_prio_cgrp_subsys = {
|
2012-11-20 00:13:38 +08:00
|
|
|
.css_alloc = cgrp_css_alloc,
|
2012-11-22 23:32:47 +08:00
|
|
|
.css_online = cgrp_css_online,
|
2012-11-20 00:13:38 +08:00
|
|
|
.css_free = cgrp_css_free,
|
2012-07-20 18:39:25 +08:00
|
|
|
.attach = net_prio_attach,
|
2014-07-15 23:05:09 +08:00
|
|
|
.legacy_cftypes = ss_files,
|
2012-04-02 03:09:55 +08:00
|
|
|
};
|
2011-11-22 13:10:51 +08:00
|
|
|
|
|
|
|
static int netprio_device_event(struct notifier_block *unused,
|
|
|
|
unsigned long event, void *ptr)
|
|
|
|
{
|
2013-05-28 09:30:21 +08:00
|
|
|
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
|
2011-11-22 13:10:51 +08:00
|
|
|
struct netprio_map *old;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Note this is called with rtnl_lock held so we have update side
|
|
|
|
* protection on our rcu assignments
|
|
|
|
*/
|
|
|
|
|
|
|
|
switch (event) {
|
|
|
|
case NETDEV_UNREGISTER:
|
|
|
|
old = rtnl_dereference(dev->priomap);
|
2011-11-23 15:09:32 +08:00
|
|
|
RCU_INIT_POINTER(dev->priomap, NULL);
|
2011-11-22 13:10:51 +08:00
|
|
|
if (old)
|
|
|
|
kfree_rcu(old, rcu);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
return NOTIFY_DONE;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct notifier_block netprio_device_notifier = {
|
|
|
|
.notifier_call = netprio_device_event
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * Register the netdevice notifier so that a device's priomap is released
 * when the device is unregistered.
 *
 * The result of register_netdevice_notifier() is returned rather than
 * discarded, so a failed registration is reported to the initcall
 * machinery instead of being silently ignored.
 */
static int __init init_cgroup_netprio(void)
{
	return register_netdevice_notifier(&netprio_device_notifier);
}
subsys_initcall(init_cgroup_netprio);
|