Merge branch 'for-5.14' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cgroup updates from Tejun Heo:

 - cgroup.kill is added, which implements atomic killing of the whole
   subtree.

   Down the line, this should be able to replace the multiple userland
   implementations of "keep killing till empty".

 - PSI can now be turned off at boot time to avoid overhead for
   configurations which don't care about PSI.

* 'for-5.14' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  cgroup: make per-cgroup pressure stall tracking configurable
  cgroup: Fix kernel-doc
  cgroup: inline cgroup_task_freeze()
  tests/cgroup: test cgroup.kill
  tests/cgroup: move cg_wait_for(), cg_prepare_for_wait()
  tests/cgroup: use cgroup.kill in cg_killall()
  docs/cgroup: add entry for cgroup.kill
  cgroup: introduce cgroup.kill
commit 3dbdb38e28
Author: Linus Torvalds
Date:   2021-07-01 17:22:14 -07:00

13 changed files with 569 additions and 108 deletions

Documentation/admin-guide/cgroup-v2.rst

@@ -953,6 +953,21 @@ All cgroup core files are prefixed with "cgroup."
 	it's possible to delete a frozen (and empty) cgroup, as well as
 	create new sub-cgroups.
 
+  cgroup.kill
+	A write-only single value file which exists in non-root cgroups.
+	The only allowed value is "1".
+
+	Writing "1" to the file causes the cgroup and all descendant cgroups to
+	be killed. This means that all processes located in the affected cgroup
+	tree will be killed via SIGKILL.
+
+	Killing a cgroup tree will deal with concurrent forks appropriately and
+	is protected against migrations.
+
+	In a threaded cgroup, writing this file fails with EOPNOTSUPP as
+	killing cgroups is a process directed operation, i.e. it affects
+	the whole thread-group.
+
 Controllers
 ===========
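
For illustration, a minimal userspace sketch of the interface documented above (hypothetical, not part of this series; the cgroup path and error handling are assumptions — the only facts taken from the patch are that the file accepts exactly "1" and fails with EOPNOTSUPP in threaded cgroups):

/* Hypothetical example: kill a whole cgroup subtree with a single
 * write instead of a "keep killing till empty" SIGKILL loop.
 * Assumes cgroup2 is mounted at /sys/fs/cgroup.
 */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int cgroup_kill_tree(const char *cgrp_path)
{
	char path[4096];
	int fd;

	snprintf(path, sizeof(path), "%s/cgroup.kill", cgrp_path);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;	/* file absent: kernel older than 5.14 */

	/* "1" is the only allowed value; EOPNOTSUPP on threaded cgroups. */
	if (write(fd, "1", 1) != 1) {
		int err = errno;

		close(fd);
		errno = err;
		return -1;
	}
	close(fd);
	return 0;
}

int main(void)
{
	if (cgroup_kill_tree("/sys/fs/cgroup/mytree"))
		perror("cgroup.kill");
	return 0;
}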

Documentation/admin-guide/kernel-parameters.txt

@@ -497,16 +497,21 @@
 	ccw_timeout_log	[S390]
 			See Documentation/s390/common_io.rst for details.
 
-	cgroup_disable=	[KNL] Disable a particular controller
-			Format: {name of the controller(s) to disable}
+	cgroup_disable=	[KNL] Disable a particular controller or optional feature
+			Format: {name of the controller(s) or feature(s) to disable}
 			The effects of cgroup_disable=foo are:
 			- foo isn't auto-mounted if you mount all cgroups in
 			  a single hierarchy
 			- foo isn't visible as an individually mountable
 			  subsystem
+			- if foo is an optional feature then the feature is
+			  disabled and corresponding cgroup files are not
+			  created
 			{Currently only "memory" controller deal with this and
 			cut the overhead, others just disable the usage. So
 			only cgroup_disable=memory is actually worthy}
+			Specifying "pressure" disables per-cgroup pressure
+			stall information accounting feature
 
 	cgroup_no_v1=	[KNL] Disable cgroup controllers and named hierarchies in v1
 			Format: { { controller | "all" | "named" }
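
A hypothetical way to observe the effect of `cgroup_disable=pressure` from userspace (mount point is an assumption, not part of this patch): with the feature disabled the per-cgroup pressure files are simply not created, while the system-wide PSI interface under /proc/pressure/ keeps working.

/* Hypothetical probe, assuming cgroup2 is mounted at /sys/fs/cgroup.
 * With cgroup_disable=pressure, cpu/memory/io .pressure files are
 * not created in cgroups; /proc/pressure/ remains available.
 */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	if (access("/sys/fs/cgroup/cpu.pressure", F_OK) == 0)
		printf("per-cgroup PSI accounting is enabled\n");
	else
		printf("per-cgroup PSI is off (cgroup_disable=pressure?)\n");
	return 0;
}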

include/linux/cgroup-defs.h

@@ -71,6 +71,9 @@ enum {
 	/* Cgroup is frozen. */
 	CGRP_FROZEN,
+
+	/* Control group has to be killed. */
+	CGRP_KILL,
 };
 
 /* cgroup_root->flags */
@@ -110,6 +113,7 @@ enum {
 	CFTYPE_NO_PREFIX = (1 << 3),	/* (DON'T USE FOR NEW FILES) no subsys prefix */
 	CFTYPE_WORLD_WRITABLE = (1 << 4),	/* (DON'T USE FOR NEW FILES) S_IWUGO */
 	CFTYPE_DEBUG = (1 << 5),	/* create when cgroup_debug */
+	CFTYPE_PRESSURE = (1 << 6),	/* only if pressure feature is enabled */
 
 	/* internal flags, do not use outside cgroup core proper */
 	__CFTYPE_ONLY_ON_DFL = (1 << 16),	/* only on default hierarchy */

include/linux/cgroup.h

@@ -676,6 +676,8 @@ static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
 	return &cgrp->psi;
 }
 
+bool cgroup_psi_enabled(void);
+
 static inline void cgroup_init_kthreadd(void)
 {
 	/*
@@ -735,6 +737,11 @@ static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
 	return NULL;
 }
 
+static inline bool cgroup_psi_enabled(void)
+{
+	return false;
+}
+
 static inline bool task_under_cgroup_hierarchy(struct task_struct *task,
 					       struct cgroup *ancestor)
 {
@@ -906,20 +913,6 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze);
 void cgroup_freezer_migrate_task(struct task_struct *task, struct cgroup *src,
 				 struct cgroup *dst);
 
-static inline bool cgroup_task_freeze(struct task_struct *task)
-{
-	bool ret;
-
-	if (task->flags & PF_KTHREAD)
-		return false;
-
-	rcu_read_lock();
-	ret = test_bit(CGRP_FREEZE, &task_dfl_cgroup(task)->flags);
-	rcu_read_unlock();
-
-	return ret;
-}
-
 static inline bool cgroup_task_frozen(struct task_struct *task)
 {
 	return task->frozen;
@@ -929,10 +922,6 @@ static inline bool cgroup_task_frozen(struct task_struct *task)
 static inline void cgroup_enter_frozen(void) { }
 static inline void cgroup_leave_frozen(bool always_leave) { }
-static inline bool cgroup_task_freeze(struct task_struct *task)
-{
-	return false;
-}
 static inline bool cgroup_task_frozen(struct task_struct *task)
 {
 	return false;

kernel/cgroup/cgroup.c

@@ -209,6 +209,22 @@ struct cgroup_namespace init_cgroup_ns = {
 static struct file_system_type cgroup2_fs_type;
 static struct cftype cgroup_base_files[];
 
+/* cgroup optional features */
+enum cgroup_opt_features {
+#ifdef CONFIG_PSI
+	OPT_FEATURE_PRESSURE,
+#endif
+	OPT_FEATURE_COUNT
+};
+
+static const char *cgroup_opt_feature_names[OPT_FEATURE_COUNT] = {
+#ifdef CONFIG_PSI
+	"pressure",
+#endif
+};
+
+static u16 cgroup_feature_disable_mask __read_mostly;
+
 static int cgroup_apply_control(struct cgroup *cgrp);
 static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
 static void css_task_iter_skip(struct css_task_iter *it,
@@ -2390,7 +2406,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
 }
 
 /**
- * cgroup_taskset_migrate - migrate a taskset
+ * cgroup_migrate_execute - migrate a taskset
  * @mgctx: migration context
  *
  * Migrate tasks in @mgctx as setup by migration preparation functions.
@@ -3632,6 +3648,18 @@ static void cgroup_pressure_release(struct kernfs_open_file *of)
 {
 	psi_trigger_replace(&of->priv, NULL);
 }
+
+bool cgroup_psi_enabled(void)
+{
+	return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0;
+}
+
+#else /* CONFIG_PSI */
+bool cgroup_psi_enabled(void)
+{
+	return false;
+}
+
 #endif /* CONFIG_PSI */
 
 static int cgroup_freeze_show(struct seq_file *seq, void *v)
@@ -3668,6 +3696,80 @@ static ssize_t cgroup_freeze_write(struct kernfs_open_file *of,
 	return nbytes;
 }
 
+static void __cgroup_kill(struct cgroup *cgrp)
+{
+	struct css_task_iter it;
+	struct task_struct *task;
+
+	lockdep_assert_held(&cgroup_mutex);
+
+	spin_lock_irq(&css_set_lock);
+	set_bit(CGRP_KILL, &cgrp->flags);
+	spin_unlock_irq(&css_set_lock);
+
+	css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it);
+	while ((task = css_task_iter_next(&it))) {
+		/* Ignore kernel threads here. */
+		if (task->flags & PF_KTHREAD)
+			continue;
+
+		/* Skip tasks that are already dying. */
+		if (__fatal_signal_pending(task))
+			continue;
+
+		send_sig(SIGKILL, task, 0);
+	}
+	css_task_iter_end(&it);
+
+	spin_lock_irq(&css_set_lock);
+	clear_bit(CGRP_KILL, &cgrp->flags);
+	spin_unlock_irq(&css_set_lock);
+}
+
+static void cgroup_kill(struct cgroup *cgrp)
+{
+	struct cgroup_subsys_state *css;
+	struct cgroup *dsct;
+
+	lockdep_assert_held(&cgroup_mutex);
+
+	cgroup_for_each_live_descendant_pre(dsct, css, cgrp)
+		__cgroup_kill(dsct);
+}
+
+static ssize_t cgroup_kill_write(struct kernfs_open_file *of, char *buf,
+				 size_t nbytes, loff_t off)
+{
+	ssize_t ret = 0;
+	int kill;
+	struct cgroup *cgrp;
+
+	ret = kstrtoint(strstrip(buf), 0, &kill);
+	if (ret)
+		return ret;
+
+	if (kill != 1)
+		return -ERANGE;
+
+	cgrp = cgroup_kn_lock_live(of->kn, false);
+	if (!cgrp)
+		return -ENOENT;
+
+	/*
+	 * Killing is a process directed operation, i.e. the whole thread-group
+	 * is taken down so act like we do for cgroup.procs and only make this
+	 * writable in non-threaded cgroups.
+	 */
+	if (cgroup_is_threaded(cgrp))
+		ret = -EOPNOTSUPP;
+	else
+		cgroup_kill(cgrp);
+
+	cgroup_kn_unlock(of->kn);
+
+	return ret ?: nbytes;
+}
+
 static int cgroup_file_open(struct kernfs_open_file *of)
 {
 	struct cftype *cft = of_cft(of);
@@ -3882,6 +3984,8 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
 restart:
 	for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
 		/* does cft->flags tell us to skip this file on @cgrp? */
+		if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
+			continue;
 		if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
 			continue;
 		if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
@@ -3959,6 +4063,9 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
 
 		WARN_ON(cft->ss || cft->kf_ops);
 
+		if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
+			continue;
+
 		if (cft->seq_start)
 			kf_ops = &cgroup_kf_ops;
 		else
@@ -4860,6 +4967,11 @@ static struct cftype cgroup_base_files[] = {
 		.seq_show = cgroup_freeze_show,
 		.write = cgroup_freeze_write,
 	},
+	{
+		.name = "cgroup.kill",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.write = cgroup_kill_write,
+	},
 	{
 		.name = "cpu.stat",
 		.seq_show = cpu_stat_show,
@@ -4867,6 +4979,7 @@ static struct cftype cgroup_base_files[] = {
 #ifdef CONFIG_PSI
 	{
 		.name = "io.pressure",
+		.flags = CFTYPE_PRESSURE,
 		.seq_show = cgroup_io_pressure_show,
 		.write = cgroup_io_pressure_write,
 		.poll = cgroup_pressure_poll,
@@ -4874,6 +4987,7 @@ static struct cftype cgroup_base_files[] = {
 	},
 	{
 		.name = "memory.pressure",
+		.flags = CFTYPE_PRESSURE,
 		.seq_show = cgroup_memory_pressure_show,
 		.write = cgroup_memory_pressure_write,
 		.poll = cgroup_pressure_poll,
@@ -4881,6 +4995,7 @@ static struct cftype cgroup_base_files[] = {
 	},
 	{
 		.name = "cpu.pressure",
+		.flags = CFTYPE_PRESSURE,
 		.seq_show = cgroup_cpu_pressure_show,
 		.write = cgroup_cpu_pressure_write,
 		.poll = cgroup_pressure_poll,
@@ -6080,6 +6195,8 @@ void cgroup_post_fork(struct task_struct *child,
 		      struct kernel_clone_args *kargs)
 	__releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
 {
+	unsigned long cgrp_flags = 0;
+	bool kill = false;
 	struct cgroup_subsys *ss;
 	struct css_set *cset;
 	int i;
@@ -6091,6 +6208,11 @@ void cgroup_post_fork(struct task_struct *child,
 
 	/* init tasks are special, only link regular threads */
 	if (likely(child->pid)) {
+		if (kargs->cgrp)
+			cgrp_flags = kargs->cgrp->flags;
+		else
+			cgrp_flags = cset->dfl_cgrp->flags;
+
 		WARN_ON_ONCE(!list_empty(&child->cg_list));
 		cset->nr_tasks++;
 		css_set_move_task(child, NULL, cset, false);
@@ -6099,23 +6221,32 @@ void cgroup_post_fork(struct task_struct *child,
 		cset = NULL;
 	}
 
-	/*
-	 * If the cgroup has to be frozen, the new task has too. Let's set
-	 * the JOBCTL_TRAP_FREEZE jobctl bit to get the task into the
-	 * frozen state.
-	 */
-	if (unlikely(cgroup_task_freeze(child))) {
-		spin_lock(&child->sighand->siglock);
-		WARN_ON_ONCE(child->frozen);
-		child->jobctl |= JOBCTL_TRAP_FREEZE;
-		spin_unlock(&child->sighand->siglock);
-
-		/*
-		 * Calling cgroup_update_frozen() isn't required here,
-		 * because it will be called anyway a bit later from
-		 * do_freezer_trap(). So we avoid cgroup's transient switch
-		 * from the frozen state and back.
-		 */
+	if (!(child->flags & PF_KTHREAD)) {
+		if (unlikely(test_bit(CGRP_FREEZE, &cgrp_flags))) {
+			/*
+			 * If the cgroup has to be frozen, the new task has
+			 * too. Let's set the JOBCTL_TRAP_FREEZE jobctl bit to
+			 * get the task into the frozen state.
+			 */
+			spin_lock(&child->sighand->siglock);
+			WARN_ON_ONCE(child->frozen);
+			child->jobctl |= JOBCTL_TRAP_FREEZE;
+			spin_unlock(&child->sighand->siglock);
+
+			/*
+			 * Calling cgroup_update_frozen() isn't required here,
+			 * because it will be called anyway a bit later from
+			 * do_freezer_trap(). So we avoid cgroup's transient
+			 * switch from the frozen state and back.
+			 */
+		}
+
+		/*
+		 * If the cgroup is to be killed notice it now and take the
+		 * child down right after we finished preparing it for
+		 * userspace.
+		 */
+		kill = test_bit(CGRP_KILL, &cgrp_flags);
 	}
 
 	spin_unlock_irq(&css_set_lock);
@@ -6138,6 +6269,10 @@ void cgroup_post_fork(struct task_struct *child,
 			put_css_set(rcset);
 	}
 
+	/* Cgroup has to be killed so take down child immediately. */
+	if (unlikely(kill))
+		do_send_sig_info(SIGKILL, SEND_SIG_NOINFO, child, PIDTYPE_TGID);
+
 	cgroup_css_set_put_fork(kargs);
 }
@@ -6163,7 +6298,8 @@ void cgroup_exit(struct task_struct *tsk)
 	cset->nr_tasks--;
 
 	WARN_ON_ONCE(cgroup_task_frozen(tsk));
-	if (unlikely(cgroup_task_freeze(tsk)))
+	if (unlikely(!(tsk->flags & PF_KTHREAD) &&
+		     test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags)))
 		cgroup_update_frozen(task_dfl_cgroup(tsk));
 
 	spin_unlock_irq(&css_set_lock);
@@ -6214,6 +6350,15 @@ static int __init cgroup_disable(char *str)
 			pr_info("Disabling %s control group subsystem\n",
 				ss->name);
 		}
+
+		for (i = 0; i < OPT_FEATURE_COUNT; i++) {
+			if (strcmp(token, cgroup_opt_feature_names[i]))
+				continue;
+			cgroup_feature_disable_mask |= 1 << i;
+			pr_info("Disabling %s control group feature\n",
+				cgroup_opt_feature_names[i]);
+			break;
+		}
 	}
 	return 1;
 }
@@ -6512,6 +6657,9 @@ static ssize_t show_delegatable_files(struct cftype *files, char *buf,
 		if (!(cft->flags & CFTYPE_NS_DELEGATABLE))
 			continue;
 
+		if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
+			continue;
+
 		if (prefix)
 			ret += snprintf(buf + ret, size - ret, "%s.", prefix);

kernel/cgroup/rstat.c

@@ -220,7 +220,7 @@ void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp)
 }
 
 /**
- * cgroup_rstat_flush_begin - flush stats in @cgrp's subtree and hold
+ * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold
  * @cgrp: target cgroup
  *
  * Flush stats in @cgrp's subtree and prevent further flushes. Must be

kernel/sched/psi.c

@@ -148,6 +148,7 @@
 static int psi_bug __read_mostly;
 
 DEFINE_STATIC_KEY_FALSE(psi_disabled);
+DEFINE_STATIC_KEY_TRUE(psi_cgroups_enabled);
 
 #ifdef CONFIG_PSI_DEFAULT_DISABLED
 static bool psi_enable;
@@ -215,6 +216,9 @@ void __init psi_init(void)
 		return;
 	}
 
+	if (!cgroup_psi_enabled())
+		static_branch_disable(&psi_cgroups_enabled);
+
 	psi_period = jiffies_to_nsecs(PSI_FREQ);
 	group_init(&psi_system);
 }
@@ -748,23 +752,23 @@ static void psi_group_change(struct psi_group *group, int cpu,
 
 static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
 {
+	if (*iter == &psi_system)
+		return NULL;
+
 #ifdef CONFIG_CGROUPS
-	struct cgroup *cgroup = NULL;
+	if (static_branch_likely(&psi_cgroups_enabled)) {
+		struct cgroup *cgroup = NULL;
 
-	if (!*iter)
-		cgroup = task->cgroups->dfl_cgrp;
-	else if (*iter == &psi_system)
-		return NULL;
-	else
-		cgroup = cgroup_parent(*iter);
+		if (!*iter)
+			cgroup = task->cgroups->dfl_cgrp;
+		else
+			cgroup = cgroup_parent(*iter);
 
-	if (cgroup && cgroup_parent(cgroup)) {
-		*iter = cgroup;
-		return cgroup_psi(cgroup);
+		if (cgroup && cgroup_parent(cgroup)) {
+			*iter = cgroup;
+			return cgroup_psi(cgroup);
+		}
 	}
-#else
-	if (*iter)
-		return NULL;
 #endif
 	*iter = &psi_system;
 	return &psi_system;

tools/testing/selftests/cgroup/.gitignore

@@ -2,4 +2,5 @@
 test_memcontrol
 test_core
 test_freezer
 test_kmem
+test_kill

tools/testing/selftests/cgroup/Makefile

@@ -9,6 +9,7 @@ TEST_GEN_PROGS = test_memcontrol
 TEST_GEN_PROGS += test_kmem
 TEST_GEN_PROGS += test_core
 TEST_GEN_PROGS += test_freezer
+TEST_GEN_PROGS += test_kill
 
 include ../lib.mk
@@ -16,3 +17,4 @@ $(OUTPUT)/test_memcontrol: cgroup_util.c ../clone3/clone3_selftests.h
 $(OUTPUT)/test_kmem: cgroup_util.c ../clone3/clone3_selftests.h
 $(OUTPUT)/test_core: cgroup_util.c ../clone3/clone3_selftests.h
 $(OUTPUT)/test_freezer: cgroup_util.c ../clone3/clone3_selftests.h
+$(OUTPUT)/test_kill: cgroup_util.c ../clone3/clone3_selftests.h ../pidfd/pidfd.h

tools/testing/selftests/cgroup/cgroup_util.c

@@ -5,10 +5,12 @@
 #include <errno.h>
 #include <fcntl.h>
 #include <linux/limits.h>
+#include <poll.h>
 #include <signal.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <sys/inotify.h>
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <sys/wait.h>
@@ -252,6 +254,10 @@ int cg_killall(const char *cgroup)
 	char buf[PAGE_SIZE];
 	char *ptr = buf;
 
+	/* If cgroup.kill exists use it. */
+	if (!cg_write(cgroup, "cgroup.kill", "1"))
+		return 0;
+
 	if (cg_read(cgroup, "cgroup.procs", buf, sizeof(buf)))
 		return -1;
@@ -576,3 +582,48 @@ int clone_into_cgroup_run_wait(const char *cgroup)
 	(void)clone_reap(pid, WEXITED);
 	return 0;
 }
+
+int cg_prepare_for_wait(const char *cgroup)
+{
+	int fd, ret = -1;
+
+	fd = inotify_init1(0);
+	if (fd == -1)
+		return fd;
+
+	ret = inotify_add_watch(fd, cg_control(cgroup, "cgroup.events"),
+				IN_MODIFY);
+	if (ret == -1) {
+		close(fd);
+		fd = -1;
+	}
+
+	return fd;
+}
+
+int cg_wait_for(int fd)
+{
+	int ret = -1;
+	struct pollfd fds = {
+		.fd = fd,
+		.events = POLLIN,
+	};
+
+	while (true) {
+		ret = poll(&fds, 1, 10000);
+
+		if (ret == -1) {
+			if (errno == EINTR)
+				continue;
+
+			break;
+		}
+
+		if (ret > 0 && fds.revents & POLLIN) {
+			ret = 0;
+			break;
+		}
+	}
+
+	return ret;
+}

tools/testing/selftests/cgroup/cgroup_util.h

@@ -54,3 +54,5 @@ extern pid_t clone_into_cgroup(int cgroup_fd);
 extern int clone_reap(pid_t pid, int options);
 extern int clone_into_cgroup_run_wait(const char *cgroup);
 extern int dirfd_open_opath(const char *dir);
+extern int cg_prepare_for_wait(const char *cgroup);
+extern int cg_wait_for(int fd);

tools/testing/selftests/cgroup/test_freezer.c

@@ -7,9 +7,7 @@
 #include <unistd.h>
 #include <stdio.h>
 #include <errno.h>
-#include <poll.h>
 #include <stdlib.h>
-#include <sys/inotify.h>
 #include <string.h>
 #include <sys/wait.h>
@@ -54,61 +52,6 @@ static int cg_freeze_nowait(const char *cgroup, bool freeze)
 	return cg_write(cgroup, "cgroup.freeze", freeze ? "1" : "0");
 }
 
-/*
- * Prepare for waiting on cgroup.events file.
- */
-static int cg_prepare_for_wait(const char *cgroup)
-{
-	int fd, ret = -1;
-
-	fd = inotify_init1(0);
-	if (fd == -1) {
-		debug("Error: inotify_init1() failed\n");
-		return fd;
-	}
-
-	ret = inotify_add_watch(fd, cg_control(cgroup, "cgroup.events"),
-				IN_MODIFY);
-	if (ret == -1) {
-		debug("Error: inotify_add_watch() failed\n");
-		close(fd);
-		fd = -1;
-	}
-
-	return fd;
-}
-
-/*
- * Wait for an event. If there are no events for 10 seconds,
- * treat this an error.
- */
-static int cg_wait_for(int fd)
-{
-	int ret = -1;
-	struct pollfd fds = {
-		.fd = fd,
-		.events = POLLIN,
-	};
-
-	while (true) {
-		ret = poll(&fds, 1, 10000);
-
-		if (ret == -1) {
-			if (errno == EINTR)
-				continue;
-
-			debug("Error: poll() failed\n");
-			break;
-		}
-
-		if (ret > 0 && fds.revents & POLLIN) {
-			ret = 0;
-			break;
-		}
-	}
-
-	return ret;
-}
-
 /*
  * Attach a task to the given cgroup and wait for a cgroup frozen event.
  * All transient events (e.g. populated) are ignored.

View File

@@ -0,0 +1,297 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <errno.h>
#include <linux/limits.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

#include "../kselftest.h"
#include "../pidfd/pidfd.h"
#include "cgroup_util.h"

/*
 * Kill the given cgroup and wait for the inotify signal.
 * If there are no events in 10 seconds, treat this as an error.
 * Then check that the cgroup is in the desired state.
 */
static int cg_kill_wait(const char *cgroup)
{
	int fd, ret = -1;

	fd = cg_prepare_for_wait(cgroup);
	if (fd < 0)
		return fd;

	ret = cg_write(cgroup, "cgroup.kill", "1");
	if (ret)
		goto out;

	ret = cg_wait_for(fd);
	if (ret)
		goto out;

out:
	close(fd);
	return ret;
}

/*
 * A simple process running in a sleep loop until being
 * re-parented.
 */
static int child_fn(const char *cgroup, void *arg)
{
	int ppid = getppid();

	while (getppid() == ppid)
		usleep(1000);

	return getppid() == ppid;
}

static int test_cgkill_simple(const char *root)
{
	pid_t pids[100];
	int ret = KSFT_FAIL;
	char *cgroup = NULL;
	int i;

	cgroup = cg_name(root, "cg_test_simple");
	if (!cgroup)
		goto cleanup;

	if (cg_create(cgroup))
		goto cleanup;

	for (i = 0; i < 100; i++)
		pids[i] = cg_run_nowait(cgroup, child_fn, NULL);

	if (cg_wait_for_proc_count(cgroup, 100))
		goto cleanup;

	if (cg_read_strcmp(cgroup, "cgroup.events", "populated 1\n"))
		goto cleanup;

	if (cg_kill_wait(cgroup))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	for (i = 0; i < 100; i++)
		wait_for_pid(pids[i]);

	if (ret == KSFT_PASS &&
	    cg_read_strcmp(cgroup, "cgroup.events", "populated 0\n"))
		ret = KSFT_FAIL;

	if (cgroup)
		cg_destroy(cgroup);
	free(cgroup);
	return ret;
}

/*
 * The test creates the following hierarchy:
 *       A
 *    / / \ \
 *   B  E  I K
 *  /\  |
 * C  D F
 *      |
 *      G
 *      |
 *      H
 *
 * with a process in C, H and 3 processes in K.
 * Then it tries to kill the whole tree.
 */
static int test_cgkill_tree(const char *root)
{
	pid_t pids[5];
	char *cgroup[10] = {0};
	int ret = KSFT_FAIL;
	int i;

	cgroup[0] = cg_name(root, "cg_test_tree_A");
	if (!cgroup[0])
		goto cleanup;

	cgroup[1] = cg_name(cgroup[0], "B");
	if (!cgroup[1])
		goto cleanup;

	cgroup[2] = cg_name(cgroup[1], "C");
	if (!cgroup[2])
		goto cleanup;

	cgroup[3] = cg_name(cgroup[1], "D");
	if (!cgroup[3])
		goto cleanup;

	cgroup[4] = cg_name(cgroup[0], "E");
	if (!cgroup[4])
		goto cleanup;

	cgroup[5] = cg_name(cgroup[4], "F");
	if (!cgroup[5])
		goto cleanup;

	cgroup[6] = cg_name(cgroup[5], "G");
	if (!cgroup[6])
		goto cleanup;

	cgroup[7] = cg_name(cgroup[6], "H");
	if (!cgroup[7])
		goto cleanup;

	cgroup[8] = cg_name(cgroup[0], "I");
	if (!cgroup[8])
		goto cleanup;

	cgroup[9] = cg_name(cgroup[0], "K");
	if (!cgroup[9])
		goto cleanup;

	for (i = 0; i < 10; i++)
		if (cg_create(cgroup[i]))
			goto cleanup;

	pids[0] = cg_run_nowait(cgroup[2], child_fn, NULL);
	pids[1] = cg_run_nowait(cgroup[7], child_fn, NULL);
	pids[2] = cg_run_nowait(cgroup[9], child_fn, NULL);
	pids[3] = cg_run_nowait(cgroup[9], child_fn, NULL);
	pids[4] = cg_run_nowait(cgroup[9], child_fn, NULL);

	/*
	 * Wait until all child processes will enter
	 * corresponding cgroups.
	 */
	if (cg_wait_for_proc_count(cgroup[2], 1) ||
	    cg_wait_for_proc_count(cgroup[7], 1) ||
	    cg_wait_for_proc_count(cgroup[9], 3))
		goto cleanup;

	/*
	 * Kill A and check that we get an empty notification.
	 */
	if (cg_kill_wait(cgroup[0]))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	for (i = 0; i < 5; i++)
		wait_for_pid(pids[i]);

	if (ret == KSFT_PASS &&
	    cg_read_strcmp(cgroup[0], "cgroup.events", "populated 0\n"))
		ret = KSFT_FAIL;

	for (i = 9; i >= 0 && cgroup[i]; i--) {
		cg_destroy(cgroup[i]);
		free(cgroup[i]);
	}

	return ret;
}

static int forkbomb_fn(const char *cgroup, void *arg)
{
	int ppid;

	fork();
	fork();

	ppid = getppid();
	while (getppid() == ppid)
		usleep(1000);

	return getppid() == ppid;
}

/*
 * The test runs a fork bomb in a cgroup and tries to kill it.
 */
static int test_cgkill_forkbomb(const char *root)
{
	int ret = KSFT_FAIL;
	char *cgroup = NULL;
	pid_t pid = -ESRCH;

	cgroup = cg_name(root, "cg_forkbomb_test");
	if (!cgroup)
		goto cleanup;

	if (cg_create(cgroup))
		goto cleanup;

	pid = cg_run_nowait(cgroup, forkbomb_fn, NULL);
	if (pid < 0)
		goto cleanup;

	usleep(100000);

	if (cg_kill_wait(cgroup))
		goto cleanup;

	if (cg_wait_for_proc_count(cgroup, 0))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (pid > 0)
		wait_for_pid(pid);

	if (ret == KSFT_PASS &&
	    cg_read_strcmp(cgroup, "cgroup.events", "populated 0\n"))
		ret = KSFT_FAIL;

	if (cgroup)
		cg_destroy(cgroup);
	free(cgroup);
	return ret;
}

#define T(x) { x, #x }
struct cgkill_test {
	int (*fn)(const char *root);
	const char *name;
} tests[] = {
	T(test_cgkill_simple),
	T(test_cgkill_tree),
	T(test_cgkill_forkbomb),
};
#undef T

int main(int argc, char *argv[])
{
	char root[PATH_MAX];
	int i, ret = EXIT_SUCCESS;

	if (cg_find_unified_root(root, sizeof(root)))
		ksft_exit_skip("cgroup v2 isn't mounted\n");
	for (i = 0; i < ARRAY_SIZE(tests); i++) {
		switch (tests[i].fn(root)) {
		case KSFT_PASS:
			ksft_test_result_pass("%s\n", tests[i].name);
			break;
		case KSFT_SKIP:
			ksft_test_result_skip("%s\n", tests[i].name);
			break;
		default:
			ret = EXIT_FAILURE;
			ksft_test_result_fail("%s\n", tests[i].name);
			break;
		}
	}

	return ret;
}