memcg, oom: move out_of_memory back to the charge path

Commit 3812c8c8f3 ("mm: memcg: do not trap chargers with full
callstack on OOM") has changed the ENOMEM semantic of memcg charges.
Rather than invoking the oom killer from the charging context it delays
the oom killer to the page fault path (pagefault_out_of_memory).  This
in turn means that many users (e.g.  slab or g-u-p) will get ENOMEM when
the corresponding memcg hits the hard limit and the memcg is OOM.
This behavior is inconsistent with the !memcg case where the oom killer
is invoked from the allocation context and the allocator keeps retrying
until it succeeds.

The difference in the behavior is user visible.  mmap(MAP_POPULATE)
might result in not fully populated ranges while the mmap return code
doesn't tell that to the userspace.  Random syscalls might fail with
ENOMEM etc.

The primary motivation of the different memcg oom semantic was the
deadlock avoidance.  Things have changed since then, though.  We have an
async oom teardown by the oom reaper now and so we do not have to rely
on the victim to tear down its memory anymore.  Therefore we can return
to the original semantic as long as the memcg oom killer is not handed
over to userspace.

There is still one thing to be careful about here though.  If the oom
killer is not able to make any forward progress - e.g.  because there is
no eligible task to kill - then we have to bail out of the charge path
to prevent the same class of deadlocks.  We have basically two options
here.  Either we fail the charge with ENOMEM or force the charge and
allow overcharge.  The first option has been considered more harmful
than useful because rare inconsistencies in the ENOMEM behavior are hard
to test for and error prone.  Basically the same reason why the page
allocator doesn't fail allocations under such conditions.  The latter
might allow runaways but those should be really unlikely unless somebody
misconfigures the system.  E.g.  allowing to migrate tasks away from the
memcg to a different unlimited memcg with move_charge_at_immigrate
disabled.

Link: http://lkml.kernel.org/r/20180628151101.25307-1-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Greg Thelen <gthelen@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Shakeel Butt <shakeelb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
Michal Hocko 2018-08-17 15:47:11 -07:00 committed by Linus Torvalds
parent d39f8fb4b7
commit 29ef680ae7
4 changed files with 71 additions and 26 deletions

View File

@ -507,16 +507,16 @@ unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg);
void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
struct task_struct *p); struct task_struct *p);
static inline void mem_cgroup_oom_enable(void) static inline void mem_cgroup_enter_user_fault(void)
{ {
WARN_ON(current->memcg_may_oom); WARN_ON(current->in_user_fault);
current->memcg_may_oom = 1; current->in_user_fault = 1;
} }
static inline void mem_cgroup_oom_disable(void) static inline void mem_cgroup_exit_user_fault(void)
{ {
WARN_ON(!current->memcg_may_oom); WARN_ON(!current->in_user_fault);
current->memcg_may_oom = 0; current->in_user_fault = 0;
} }
static inline bool task_in_memcg_oom(struct task_struct *p) static inline bool task_in_memcg_oom(struct task_struct *p)
@ -961,11 +961,11 @@ static inline void mem_cgroup_handle_over_high(void)
{ {
} }
static inline void mem_cgroup_oom_enable(void) static inline void mem_cgroup_enter_user_fault(void)
{ {
} }
static inline void mem_cgroup_oom_disable(void) static inline void mem_cgroup_exit_user_fault(void)
{ {
} }

View File

@ -722,7 +722,7 @@ struct task_struct {
unsigned restore_sigmask:1; unsigned restore_sigmask:1;
#endif #endif
#ifdef CONFIG_MEMCG #ifdef CONFIG_MEMCG
unsigned memcg_may_oom:1; unsigned in_user_fault:1;
#ifndef CONFIG_SLOB #ifndef CONFIG_SLOB
unsigned memcg_kmem_skip_account:1; unsigned memcg_kmem_skip_account:1;
#endif #endif

View File

@ -1534,28 +1534,53 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
} }
static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) enum oom_status {
OOM_SUCCESS,
OOM_FAILED,
OOM_ASYNC,
OOM_SKIPPED
};
static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
{ {
if (!current->memcg_may_oom || order > PAGE_ALLOC_COSTLY_ORDER) if (order > PAGE_ALLOC_COSTLY_ORDER)
return; return OOM_SKIPPED;
/* /*
* We are in the middle of the charge context here, so we * We are in the middle of the charge context here, so we
* don't want to block when potentially sitting on a callstack * don't want to block when potentially sitting on a callstack
* that holds all kinds of filesystem and mm locks. * that holds all kinds of filesystem and mm locks.
* *
* Also, the caller may handle a failed allocation gracefully * cgroup1 allows disabling the OOM killer and waiting for outside
* (like optional page cache readahead) and so an OOM killer * handling until the charge can succeed; remember the context and put
* invocation might not even be necessary. * the task to sleep at the end of the page fault when all locks are
* released.
* *
* That's why we don't do anything here except remember the * On the other hand, in-kernel OOM killer allows for an async victim
* OOM context and then deal with it at the end of the page * memory reclaim (oom_reaper) and that means that we are not solely
* fault when the stack is unwound, the locks are released, * relying on the oom victim to make a forward progress and we can
* and when we know whether the fault was overall successful. * invoke the oom killer here.
*
* Please note that mem_cgroup_out_of_memory might fail to find a
* victim and then we have to bail out from the charge path.
*/ */
if (memcg->oom_kill_disable) {
if (!current->in_user_fault)
return OOM_SKIPPED;
css_get(&memcg->css); css_get(&memcg->css);
current->memcg_in_oom = memcg; current->memcg_in_oom = memcg;
current->memcg_oom_gfp_mask = mask; current->memcg_oom_gfp_mask = mask;
current->memcg_oom_order = order; current->memcg_oom_order = order;
return OOM_ASYNC;
}
if (mem_cgroup_out_of_memory(memcg, mask, order))
return OOM_SUCCESS;
WARN(1,"Memory cgroup charge failed because of no reclaimable memory! "
"This looks like a misconfiguration or a kernel bug.");
return OOM_FAILED;
} }
/** /**
@ -1950,6 +1975,8 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
unsigned long nr_reclaimed; unsigned long nr_reclaimed;
bool may_swap = true; bool may_swap = true;
bool drained = false; bool drained = false;
bool oomed = false;
enum oom_status oom_status;
if (mem_cgroup_is_root(memcg)) if (mem_cgroup_is_root(memcg))
return 0; return 0;
@ -2037,6 +2064,9 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
if (nr_retries--) if (nr_retries--)
goto retry; goto retry;
if (gfp_mask & __GFP_RETRY_MAYFAIL && oomed)
goto nomem;
if (gfp_mask & __GFP_NOFAIL) if (gfp_mask & __GFP_NOFAIL)
goto force; goto force;
@ -2045,8 +2075,23 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
memcg_memory_event(mem_over_limit, MEMCG_OOM); memcg_memory_event(mem_over_limit, MEMCG_OOM);
mem_cgroup_oom(mem_over_limit, gfp_mask, /*
* keep retrying as long as the memcg oom killer is able to make
* a forward progress or bypass the charge if the oom killer
* couldn't make any progress.
*/
oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask,
get_order(nr_pages * PAGE_SIZE)); get_order(nr_pages * PAGE_SIZE));
switch (oom_status) {
case OOM_SUCCESS:
nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
oomed = true;
goto retry;
case OOM_FAILED:
goto force;
default:
goto nomem;
}
nomem: nomem:
if (!(gfp_mask & __GFP_NOFAIL)) if (!(gfp_mask & __GFP_NOFAIL))
return -ENOMEM; return -ENOMEM;

View File

@ -4153,7 +4153,7 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
* space. Kernel faults are handled more gracefully. * space. Kernel faults are handled more gracefully.
*/ */
if (flags & FAULT_FLAG_USER) if (flags & FAULT_FLAG_USER)
mem_cgroup_oom_enable(); mem_cgroup_enter_user_fault();
if (unlikely(is_vm_hugetlb_page(vma))) if (unlikely(is_vm_hugetlb_page(vma)))
ret = hugetlb_fault(vma->vm_mm, vma, address, flags); ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
@ -4161,7 +4161,7 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
ret = __handle_mm_fault(vma, address, flags); ret = __handle_mm_fault(vma, address, flags);
if (flags & FAULT_FLAG_USER) { if (flags & FAULT_FLAG_USER) {
mem_cgroup_oom_disable(); mem_cgroup_exit_user_fault();
/* /*
* The task may have entered a memcg OOM situation but * The task may have entered a memcg OOM situation but
* if the allocation error was handled gracefully (no * if the allocation error was handled gracefully (no