Merge branch 'akpm' (patches from Andrew)
Merge misc fixes from Andrew Morton:
 "The usual shower of hotfixes.

  Chris's memcg patches aren't actually fixes - they're mature but a few
  niggling review issues were late to arrive.

  The ocfs2 fixes are quite old - those took some time to get reviewer
  attention.

  Subsystems affected by this patch series: ocfs2, hotfixes, mm/memcg,
  mm/slab-generic"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
  mm, sl[aou]b: guarantee natural alignment for kmalloc(power-of-two)
  mm, sl[ou]b: improve memory accounting
  mm, memcg: make scan aggression always exclude protection
  mm, memcg: make memory.emin the baseline for utilisation determination
  mm, memcg: proportional memory.{low,min} reclaim
  mm/vmpressure.c: fix a signedness bug in vmpressure_register_event()
  mm/page_alloc.c: fix a crash in free_pages_prepare()
  mm/z3fold.c: claim page in the beginning of free
  kernel/sysctl.c: do not override max_threads provided by userspace
  memcg: only record foreign writebacks with dirty pages when memcg is not disabled
  mm: fix -Wmissing-prototypes warnings
  writeback: fix use-after-free in finish_writeback_work()
  mm/memremap: drop unused SECTION_SIZE and SECTION_MASK
  panic: ensure preemption is disabled during panic()
  fs: ocfs2: fix a possible null-pointer dereference in ocfs2_info_scan_inode_alloc()
  fs: ocfs2: fix a possible null-pointer dereference in ocfs2_write_end_nolock()
  fs: ocfs2: fix possible null-pointer dereferences in ocfs2_xa_prepare_entry()
  ocfs2: clear zero in unaligned direct IO
commit eda57a0e42
@@ -615,8 +615,8 @@ on an IO device and is an example of this type.
 Protections
 -----------
 
-A cgroup is protected to be allocated upto the configured amount of
-the resource if the usages of all its ancestors are under their
+A cgroup is protected upto the configured amount of the resource
+as long as the usages of all its ancestors are under their
 protected levels. Protections can be hard guarantees or best effort
 soft boundaries. Protections can also be over-committed in which case
 only upto the amount available to the parent is protected among
@@ -1096,7 +1096,10 @@ PAGE_SIZE multiple when read back.
 	is within its effective min boundary, the cgroup's memory
 	won't be reclaimed under any conditions. If there is no
 	unprotected reclaimable memory available, OOM killer
-	is invoked.
+	is invoked. Above the effective min boundary (or
+	effective low boundary if it is higher), pages are reclaimed
+	proportionally to the overage, reducing reclaim pressure for
+	smaller overages.
 
 	Effective min boundary is limited by memory.min values of
 	all ancestor cgroups. If there is memory.min overcommitment
@@ -1118,7 +1121,10 @@ PAGE_SIZE multiple when read back.
 	Best-effort memory protection. If the memory usage of a
 	cgroup is within its effective low boundary, the cgroup's
 	memory won't be reclaimed unless memory can be reclaimed
-	from unprotected cgroups.
+	from unprotected cgroups. Above the effective low boundary (or
+	effective min boundary if it is higher), pages are reclaimed
+	proportionally to the overage, reducing reclaim pressure for
+	smaller overages.
 
 	Effective low boundary is limited by memory.low values of
 	all ancestor cgroups. If there is memory.low overcommitment
@@ -2482,8 +2488,10 @@ system performance due to overreclaim, to the point where the feature
 becomes self-defeating.
 
 The memory.low boundary on the other hand is a top-down allocated
-reserve. A cgroup enjoys reclaim protection when it's within its low,
-which makes delegation of subtrees possible.
+reserve. A cgroup enjoys reclaim protection when it's within its
+effective low, which makes delegation of subtrees possible. It also
+enjoys having reclaim pressure proportional to its overage when
+above its effective low.
 
 The original high boundary, the hard limit, is defined as a strict
 limit that can not budge, even if the OOM killer has to be called.
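A rough worked example of the proportional behaviour documented above, with hypothetical numbers (the scaling itself is the mm/vmscan get_scan_count() change later in this series): a cgroup with memory.min=100M that is currently using 150M is 50M over its protection, so it is scanned at roughly 1 - 100/150, about a third of its normal reclaim target, while a cgroup using 1G against the same 100M protection is scanned at about 1 - 100/1024, roughly 90% of normal. The further a group is over its protection, the closer its reclaim pressure gets to nominal.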
@@ -98,6 +98,10 @@ limited. The actual limit depends on the hardware and the kernel
 configuration, but it is a good practice to use `kmalloc` for objects
 smaller than page size.
 
+The address of a chunk allocated with `kmalloc` is aligned to at least
+ARCH_KMALLOC_MINALIGN bytes. For sizes which are a power of two, the
+alignment is also guaranteed to be at least the respective size.
+
 For large allocations you can use :c:func:`vmalloc` and
 :c:func:`vzalloc`, or directly request pages from the page
 allocator. The memory allocated by `vmalloc` and related functions is
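A minimal sketch of what the documented guarantee means for a caller; the helper below is hypothetical, while kmalloc(), IS_ALIGNED() and ARCH_KMALLOC_MINALIGN are the kernel interfaces the text refers to:

#include <linux/kernel.h>
#include <linux/slab.h>

/* For a power-of-two size such as 512, the returned address is expected
 * to be aligned both to ARCH_KMALLOC_MINALIGN and to the size itself. */
static void kmalloc_alignment_demo(void)
{
	void *p = kmalloc(512, GFP_KERNEL);

	if (!p)
		return;

	WARN_ON(!IS_ALIGNED((unsigned long)p, ARCH_KMALLOC_MINALIGN));
	WARN_ON(!IS_ALIGNED((unsigned long)p, 512));

	kfree(p);
}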
@@ -164,8 +164,13 @@ static void finish_writeback_work(struct bdi_writeback *wb,
 
 	if (work->auto_free)
 		kfree(work);
-	if (done && atomic_dec_and_test(&done->cnt))
-		wake_up_all(done->waitq);
+	if (done) {
+		wait_queue_head_t *waitq = done->waitq;
+
+		/* @done can't be accessed after the following dec */
+		if (atomic_dec_and_test(&done->cnt))
+			wake_up_all(waitq);
+	}
 }
 
 static void wb_queue_work(struct bdi_writeback *wb,
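The hunk above caches done->waitq in a local before the final atomic_dec_and_test() because the waiter owns the completion (it typically lives on the waiter's stack) and may free it the moment the count hits zero. A simplified sketch of both sides of that contract; the types and names are illustrative, not the real fs-writeback structures:

#include <linux/atomic.h>
#include <linux/wait.h>

struct demo_completion {
	atomic_t cnt;
	wait_queue_head_t *waitq;
};

/* waker side: mirrors the fixed ordering above */
static void demo_complete(struct demo_completion *done)
{
	wait_queue_head_t *waitq = done->waitq;	/* copy before the final dec */

	if (atomic_dec_and_test(&done->cnt))
		wake_up_all(waitq);		/* *done may already be gone here */
}

/* waiter side: returns (and releases *done) as soon as cnt reaches zero */
static void demo_wait(struct demo_completion *done)
{
	wait_event(*done->waitq, !atomic_read(&done->cnt));
}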
@@ -2049,7 +2049,8 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
 		inode->i_mtime = inode->i_ctime = current_time(inode);
 		di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
 		di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
-		ocfs2_update_inode_fsync_trans(handle, inode, 1);
+		if (handle)
+			ocfs2_update_inode_fsync_trans(handle, inode, 1);
 	}
 	if (handle)
 		ocfs2_journal_dirty(handle, wc->w_di_bh);
@@ -2146,13 +2147,30 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock,
 	struct ocfs2_dio_write_ctxt *dwc = NULL;
 	struct buffer_head *di_bh = NULL;
 	u64 p_blkno;
-	loff_t pos = iblock << inode->i_sb->s_blocksize_bits;
+	unsigned int i_blkbits = inode->i_sb->s_blocksize_bits;
+	loff_t pos = iblock << i_blkbits;
+	sector_t endblk = (i_size_read(inode) - 1) >> i_blkbits;
 	unsigned len, total_len = bh_result->b_size;
 	int ret = 0, first_get_block = 0;
 
 	len = osb->s_clustersize - (pos & (osb->s_clustersize - 1));
 	len = min(total_len, len);
 
+	/*
+	 * bh_result->b_size is count in get_more_blocks according to write
+	 * "pos" and "end", we need map twice to return different buffer state:
+	 * 1. area in file size, not set NEW;
+	 * 2. area out file size, set NEW.
+	 *
+	 *		   iblock    endblk
+	 * |--------|---------|---------|---------
+	 * |<-------area in file------->|
+	 */
+
+	if ((iblock <= endblk) &&
+	    ((iblock + ((len - 1) >> i_blkbits)) > endblk))
+		len = (endblk - iblock + 1) << i_blkbits;
+
 	mlog(0, "get block of %lu at %llu:%u req %u\n",
 	     inode->i_ino, pos, len, total_len);
 
@@ -2236,6 +2254,9 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock,
 	if (desc->c_needs_zero)
 		set_buffer_new(bh_result);
 
+	if (iblock > endblk)
+		set_buffer_new(bh_result);
+
 	/* May sleep in end_io. It should not happen in a irq context. So defer
 	 * it to dio work queue. */
 	set_buffer_defer_completion(bh_result);
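A hypothetical walk-through of the two-pass mapping set up in the two hunks above: with 4K blocks (i_blkbits = 12) and an i_size that puts endblk at block 10, a request starting at iblock = 8 whose len initially covers 5 blocks crosses end-of-file, so the first mapping is clamped to (endblk - iblock + 1) << i_blkbits = 3 blocks and returned without the NEW bit; a later call for iblock = 11 (> endblk) then maps the remainder with set_buffer_new(), matching cases 1 and 2 in the comment.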
@@ -283,7 +283,7 @@ static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb,
 	if (inode_alloc)
 		inode_lock(inode_alloc);
 
-	if (o2info_coherent(&fi->ifi_req)) {
+	if (inode_alloc && o2info_coherent(&fi->ifi_req)) {
 		status = ocfs2_inode_lock(inode_alloc, &bh, 0);
 		if (status < 0) {
 			mlog_errno(status);
@@ -1490,18 +1490,6 @@ static int ocfs2_xa_check_space(struct ocfs2_xa_loc *loc,
 	return loc->xl_ops->xlo_check_space(loc, xi);
 }
 
-static void ocfs2_xa_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash)
-{
-	loc->xl_ops->xlo_add_entry(loc, name_hash);
-	loc->xl_entry->xe_name_hash = cpu_to_le32(name_hash);
-	/*
-	 * We can't leave the new entry's xe_name_offset at zero or
-	 * add_namevalue() will go nuts. We set it to the size of our
-	 * storage so that it can never be less than any other entry.
-	 */
-	loc->xl_entry->xe_name_offset = cpu_to_le16(loc->xl_size);
-}
-
 static void ocfs2_xa_add_namevalue(struct ocfs2_xa_loc *loc,
 				   struct ocfs2_xattr_info *xi)
 {
@@ -2133,29 +2121,31 @@ static int ocfs2_xa_prepare_entry(struct ocfs2_xa_loc *loc,
 	if (rc)
 		goto out;
 
-	if (loc->xl_entry) {
-		if (ocfs2_xa_can_reuse_entry(loc, xi)) {
-			orig_value_size = loc->xl_entry->xe_value_size;
-			rc = ocfs2_xa_reuse_entry(loc, xi, ctxt);
-			if (rc)
-				goto out;
-			goto alloc_value;
-		}
-
-		if (!ocfs2_xattr_is_local(loc->xl_entry)) {
-			orig_clusters = ocfs2_xa_value_clusters(loc);
-			rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
-			if (rc) {
-				mlog_errno(rc);
-				ocfs2_xa_cleanup_value_truncate(loc,
-								"overwriting",
-								orig_clusters);
-				goto out;
-			}
-		}
-		ocfs2_xa_wipe_namevalue(loc);
-	} else
-		ocfs2_xa_add_entry(loc, name_hash);
+	if (!loc->xl_entry) {
+		rc = -EINVAL;
+		goto out;
+	}
+
+	if (ocfs2_xa_can_reuse_entry(loc, xi)) {
+		orig_value_size = loc->xl_entry->xe_value_size;
+		rc = ocfs2_xa_reuse_entry(loc, xi, ctxt);
+		if (rc)
+			goto out;
+		goto alloc_value;
+	}
+
+	if (!ocfs2_xattr_is_local(loc->xl_entry)) {
+		orig_clusters = ocfs2_xa_value_clusters(loc);
+		rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
+		if (rc) {
+			mlog_errno(rc);
+			ocfs2_xa_cleanup_value_truncate(loc,
+							"overwriting",
+							orig_clusters);
+			goto out;
+		}
+	}
+	ocfs2_xa_wipe_namevalue(loc);
 
 	/*
 	 * If we get here, we have a blank entry. Fill it. We grow our
@@ -356,6 +356,19 @@ static inline bool mem_cgroup_disabled(void)
 	return !cgroup_subsys_enabled(memory_cgrp_subsys);
 }
 
+static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg,
+						  bool in_low_reclaim)
+{
+	if (mem_cgroup_disabled())
+		return 0;
+
+	if (in_low_reclaim)
+		return READ_ONCE(memcg->memory.emin);
+
+	return max(READ_ONCE(memcg->memory.emin),
+		   READ_ONCE(memcg->memory.elow));
+}
+
 enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
 						struct mem_cgroup *memcg);
 
@@ -537,6 +550,8 @@ void mem_cgroup_handle_over_high(void);
 
 unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg);
 
+unsigned long mem_cgroup_size(struct mem_cgroup *memcg);
+
 void mem_cgroup_print_oom_context(struct mem_cgroup *memcg,
 				  struct task_struct *p);
 
@@ -829,6 +844,12 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm,
 {
 }
 
+static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg,
+						  bool in_low_reclaim)
+{
+	return 0;
+}
+
 static inline enum mem_cgroup_protection mem_cgroup_protected(
 	struct mem_cgroup *root, struct mem_cgroup *memcg)
 {
@@ -968,6 +989,11 @@ static inline unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
 	return 0;
 }
 
+static inline unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
+{
+	return 0;
+}
+
 static inline void
 mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
 {
@@ -1264,6 +1290,9 @@ void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
 static inline void mem_cgroup_track_foreign_dirty(struct page *page,
 						  struct bdi_writeback *wb)
 {
+	if (mem_cgroup_disabled())
+		return;
+
 	if (unlikely(&page->mem_cgroup->css != wb->memcg_css))
 		mem_cgroup_track_foreign_dirty_slowpath(page, wb);
 }
@@ -493,6 +493,10 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags)
  * kmalloc is the normal method of allocating memory
  * for objects smaller than page size in the kernel.
  *
+ * The allocated object address is aligned to at least ARCH_KMALLOC_MINALIGN
+ * bytes. For @size of power of two bytes, the alignment is also guaranteed
+ * to be at least to the size.
+ *
  * The @flags argument may be one of the GFP flags defined at
  * include/linux/gfp.h and described at
  * :ref:`Documentation/core-api/mm-api.rst <mm-api-gfp-flags>`
@@ -2925,7 +2925,7 @@ int sysctl_max_threads(struct ctl_table *table, int write,
 	struct ctl_table t;
 	int ret;
 	int threads = max_threads;
-	int min = MIN_THREADS;
+	int min = 1;
 	int max = MAX_THREADS;
 
 	t = *table;
@@ -2937,7 +2937,7 @@ int sysctl_max_threads(struct ctl_table *table, int write,
 	if (ret || !write)
 		return ret;
 
-	set_max_threads(threads);
+	max_threads = threads;
 
 	return 0;
 }
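With this change an administrator-supplied value only has to fall in the range [1, MAX_THREADS] and is then stored verbatim, instead of being re-derived by set_max_threads() from the amount of memory. Assuming the usual name of this knob, writing it via `sysctl kernel.threads-max=<n>` (or /proc/sys/kernel/threads-max) should therefore read back exactly the value that was written, as long as it is at least 1.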
@@ -180,6 +180,7 @@ void panic(const char *fmt, ...)
 	 * after setting panic_cpu) from invoking panic() again.
 	 */
 	local_irq_disable();
+	preempt_disable_notrace();
 
 	/*
 	 * It's possible to come here directly from a panic-assertion and
@@ -1567,6 +1567,11 @@ unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
 	return max;
 }
 
+unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
+{
+	return page_counter_read(&memcg->memory);
+}
+
 static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 				     int order)
 {
@@ -13,8 +13,6 @@
 #include <linux/xarray.h>
 
 static DEFINE_XARRAY(pgmap_array);
-#define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1)
-#define SECTION_SIZE (1UL << PA_SECTION_SHIFT)
 
 #ifdef CONFIG_DEV_PAGEMAP_OPS
 DEFINE_STATIC_KEY_FALSE(devmap_managed_key);
@@ -1175,11 +1175,17 @@ static __always_inline bool free_pages_prepare(struct page *page,
 		debug_check_no_obj_freed(page_address(page),
 					   PAGE_SIZE << order);
 	}
-	arch_free_page(page, order);
 	if (want_init_on_free())
 		kernel_init_free_pages(page, 1 << order);
 
 	kernel_poison_pages(page, 1 << order, 0);
+	/*
+	 * arch_free_page() can make the page's contents inaccessible. s390
+	 * does this. So nothing which can access the page's contents should
+	 * happen after this.
+	 */
+	arch_free_page(page, order);
+
 	if (debug_pagealloc_enabled())
 		kernel_map_pages(page, 1 << order, 0);
 
|
|||
}
|
||||
|
||||
static bool shuffle_param;
|
||||
extern int shuffle_show(char *buffer, const struct kernel_param *kp)
|
||||
static int shuffle_show(char *buffer, const struct kernel_param *kp)
|
||||
{
|
||||
return sprintf(buffer, "%c\n", test_bit(SHUFFLE_ENABLE, &shuffle_state)
|
||||
? 'Y' : 'N');
|
||||
|
|
|
@ -1030,10 +1030,19 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name,
|
|||
unsigned int useroffset, unsigned int usersize)
|
||||
{
|
||||
int err;
|
||||
unsigned int align = ARCH_KMALLOC_MINALIGN;
|
||||
|
||||
s->name = name;
|
||||
s->size = s->object_size = size;
|
||||
s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size);
|
||||
|
||||
/*
|
||||
* For power of two sizes, guarantee natural alignment for kmalloc
|
||||
* caches, regardless of SL*B debugging options.
|
||||
*/
|
||||
if (is_power_of_2(size))
|
||||
align = max(align, size);
|
||||
s->align = calculate_alignment(flags, align, size);
|
||||
|
||||
s->useroffset = useroffset;
|
||||
s->usersize = usersize;
|
||||
|
||||
|
@ -1287,12 +1296,16 @@ void __init create_kmalloc_caches(slab_flags_t flags)
|
|||
*/
|
||||
void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
|
||||
{
|
||||
void *ret;
|
||||
void *ret = NULL;
|
||||
struct page *page;
|
||||
|
||||
flags |= __GFP_COMP;
|
||||
page = alloc_pages(flags, order);
|
||||
ret = page ? page_address(page) : NULL;
|
||||
if (likely(page)) {
|
||||
ret = page_address(page);
|
||||
mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
|
||||
1 << order);
|
||||
}
|
||||
ret = kasan_kmalloc_large(ret, size, flags);
|
||||
/* As ret might get tagged, call kmemleak hook after KASAN. */
|
||||
kmemleak_alloc(ret, size, 1, flags);
|
||||
|
|
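Together with the mm/slob.c and mm/slub.c hunks below, this charges page-allocator-backed kmalloc() allocations to NR_SLAB_UNRECLAIMABLE, so they should now show up in the usual counters, e.g. nr_slab_unreclaimable in /proc/vmstat and SUnreclaim in /proc/meminfo, instead of appearing only as unexplained free-memory loss (those counter names are the standard ones; the hunks themselves only update the node stat).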
mm/slob.c (62 lines changed)
@@ -190,7 +190,7 @@ static int slob_last(slob_t *s)
 
 static void *slob_new_pages(gfp_t gfp, int order, int node)
 {
-	void *page;
+	struct page *page;
 
 #ifdef CONFIG_NUMA
 	if (node != NUMA_NO_NODE)
@@ -202,14 +202,21 @@ static void *slob_new_pages(gfp_t gfp, int order, int node)
 	if (!page)
 		return NULL;
 
+	mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
+			    1 << order);
 	return page_address(page);
 }
 
 static void slob_free_pages(void *b, int order)
 {
+	struct page *sp = virt_to_page(b);
+
 	if (current->reclaim_state)
 		current->reclaim_state->reclaimed_slab += 1 << order;
-	free_pages((unsigned long)b, order);
+
+	mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE,
+			    -(1 << order));
+	__free_pages(sp, order);
 }
 
 /*
@@ -217,6 +224,7 @@ static void slob_free_pages(void *b, int order)
  * @sp: Page to look in.
  * @size: Size of the allocation.
  * @align: Allocation alignment.
+ * @align_offset: Offset in the allocated block that will be aligned.
 * @page_removed_from_list: Return parameter.
 *
 * Tries to find a chunk of memory at least @size bytes big within @page.
@@ -227,7 +235,7 @@ static void slob_free_pages(void *b, int order)
 * true (set to false otherwise).
 */
 static void *slob_page_alloc(struct page *sp, size_t size, int align,
-			      bool *page_removed_from_list)
+			      int align_offset, bool *page_removed_from_list)
 {
 	slob_t *prev, *cur, *aligned = NULL;
 	int delta = 0, units = SLOB_UNITS(size);
@@ -236,8 +244,17 @@ static void *slob_page_alloc(struct page *sp, size_t size, int align,
 	for (prev = NULL, cur = sp->freelist; ; prev = cur, cur = slob_next(cur)) {
 		slobidx_t avail = slob_units(cur);
 
+		/*
+		 * 'aligned' will hold the address of the slob block so that the
+		 * address 'aligned'+'align_offset' is aligned according to the
+		 * 'align' parameter. This is for kmalloc() which prepends the
+		 * allocated block with its size, so that the block itself is
+		 * aligned when needed.
+		 */
 		if (align) {
-			aligned = (slob_t *)ALIGN((unsigned long)cur, align);
+			aligned = (slob_t *)
+				(ALIGN((unsigned long)cur + align_offset, align)
+				 - align_offset);
 			delta = aligned - cur;
 		}
 		if (avail >= units + delta) { /* room enough? */
@@ -281,7 +298,8 @@ static void *slob_page_alloc(struct page *sp, size_t size, int align,
 /*
  * slob_alloc: entry point into the slob allocator.
  */
-static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
+static void *slob_alloc(size_t size, gfp_t gfp, int align, int node,
+							int align_offset)
 {
 	struct page *sp;
 	struct list_head *slob_list;
@@ -312,7 +330,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
 		if (sp->units < SLOB_UNITS(size))
 			continue;
 
-		b = slob_page_alloc(sp, size, align, &page_removed_from_list);
+		b = slob_page_alloc(sp, size, align, align_offset, &page_removed_from_list);
 		if (!b)
 			continue;
 
@@ -349,7 +367,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
 		INIT_LIST_HEAD(&sp->slab_list);
 		set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE));
 		set_slob_page_free(sp, slob_list);
-		b = slob_page_alloc(sp, size, align, &_unused);
+		b = slob_page_alloc(sp, size, align, align_offset, &_unused);
 		BUG_ON(!b);
 		spin_unlock_irqrestore(&slob_lock, flags);
 	}
@@ -451,7 +469,7 @@ static __always_inline void *
 __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
 {
 	unsigned int *m;
-	int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
+	int minalign = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
 	void *ret;
 
 	gfp &= gfp_allowed_mask;
@@ -459,19 +477,28 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
 	fs_reclaim_acquire(gfp);
 	fs_reclaim_release(gfp);
 
-	if (size < PAGE_SIZE - align) {
+	if (size < PAGE_SIZE - minalign) {
+		int align = minalign;
+
+		/*
+		 * For power of two sizes, guarantee natural alignment for
+		 * kmalloc()'d objects.
+		 */
+		if (is_power_of_2(size))
+			align = max(minalign, (int) size);
+
 		if (!size)
 			return ZERO_SIZE_PTR;
 
-		m = slob_alloc(size + align, gfp, align, node);
+		m = slob_alloc(size + minalign, gfp, align, node, minalign);
 
 		if (!m)
 			return NULL;
 		*m = size;
-		ret = (void *)m + align;
+		ret = (void *)m + minalign;
 
 		trace_kmalloc_node(caller, ret,
-				   size, size + align, gfp, node);
+				   size, size + minalign, gfp, node);
 	} else {
 		unsigned int order = get_order(size);
 
@@ -521,8 +548,13 @@ void kfree(const void *block)
 		int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
 		unsigned int *m = (unsigned int *)(block - align);
 		slob_free(m, *m + align);
-	} else
-		__free_pages(sp, compound_order(sp));
+	} else {
+		unsigned int order = compound_order(sp);
+
+		mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE,
+				    -(1 << order));
+		__free_pages(sp, order);
+
+	}
 }
 EXPORT_SYMBOL(kfree);
 
@@ -567,7 +599,7 @@ static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
 	fs_reclaim_release(flags);
 
 	if (c->size < PAGE_SIZE) {
-		b = slob_alloc(c->size, flags, c->align, node);
+		b = slob_alloc(c->size, flags, c->align, node, 0);
 		trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size,
 					    SLOB_UNITS(c->size) * SLOB_UNIT,
 					    flags, node);
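A small worked example of the align_offset arithmetic used in the slob_page_alloc() and __do_kmalloc_node() hunks above; the numbers are made up, and ALIGN() is the kernel macro:

#include <linux/kernel.h>

/* kmalloc() on SLOB prepends a size header of 'minalign' bytes, so the
 * free block must start 'align_offset' bytes before an aligned address
 * for the returned payload to end up aligned. */
static unsigned long align_offset_demo(void)
{
	unsigned long cur = 0x1008;	/* candidate free block */
	unsigned long align = 256;	/* natural alignment for a 256-byte kmalloc */
	unsigned long align_offset = 8;	/* size header prepended by kmalloc */

	/* result is 0x10f8, so result + 8 == 0x1100 is 256-byte aligned */
	return ALIGN(cur + align_offset, align) - align_offset;
}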
mm/slub.c (14 lines changed)
@@ -3821,11 +3821,15 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
 {
 	struct page *page;
 	void *ptr = NULL;
+	unsigned int order = get_order(size);
 
 	flags |= __GFP_COMP;
-	page = alloc_pages_node(node, flags, get_order(size));
-	if (page)
+	page = alloc_pages_node(node, flags, order);
+	if (page) {
 		ptr = page_address(page);
+		mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
+				    1 << order);
+	}
 
 	return kmalloc_large_node_hook(ptr, size, flags);
 }
@@ -3951,9 +3955,13 @@ void kfree(const void *x)
 
 	page = virt_to_head_page(x);
 	if (unlikely(!PageSlab(page))) {
+		unsigned int order = compound_order(page);
+
 		BUG_ON(!PageCompound(page));
 		kfree_hook(object);
-		__free_pages(page, compound_order(page));
+		mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
+				    -(1 << order));
+		__free_pages(page, order);
 		return;
 	}
 	slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_);
@@ -219,7 +219,7 @@ static inline unsigned long first_present_section_nr(void)
 	return next_present_section_nr(-1);
 }
 
-void subsection_mask_set(unsigned long *map, unsigned long pfn,
+static void subsection_mask_set(unsigned long *map, unsigned long pfn,
 		unsigned long nr_pages)
 {
 	int idx = subsection_map_index(pfn);
@@ -355,6 +355,9 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
  * "hierarchy" or "local").
  *
  * To be used as memcg event method.
+ *
+ * Return: 0 on success, -ENOMEM on memory failure or -EINVAL if @args could
+ * not be parsed.
  */
 int vmpressure_register_event(struct mem_cgroup *memcg,
 			      struct eventfd_ctx *eventfd, const char *args)
@@ -362,7 +365,7 @@ int vmpressure_register_event(struct mem_cgroup *memcg,
 	struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
 	struct vmpressure_event *ev;
 	enum vmpressure_modes mode = VMPRESSURE_NO_PASSTHROUGH;
-	enum vmpressure_levels level = -1;
+	enum vmpressure_levels level;
 	char *spec, *spec_orig;
 	char *token;
 	int ret = 0;
@@ -375,20 +378,18 @@ int vmpressure_register_event(struct mem_cgroup *memcg,
 
 	/* Find required level */
 	token = strsep(&spec, ",");
-	level = match_string(vmpressure_str_levels, VMPRESSURE_NUM_LEVELS, token);
-	if (level < 0) {
-		ret = level;
+	ret = match_string(vmpressure_str_levels, VMPRESSURE_NUM_LEVELS, token);
+	if (ret < 0)
 		goto out;
-	}
+	level = ret;
 
 	/* Find optional mode */
 	token = strsep(&spec, ",");
 	if (token) {
-		mode = match_string(vmpressure_str_modes, VMPRESSURE_NUM_MODES, token);
-		if (mode < 0) {
-			ret = mode;
+		ret = match_string(vmpressure_str_modes, VMPRESSURE_NUM_MODES, token);
+		if (ret < 0)
 			goto out;
-		}
+		mode = ret;
 	}
 
 	ev = kzalloc(sizeof(*ev), GFP_KERNEL);
@@ -404,6 +405,7 @@ int vmpressure_register_event(struct mem_cgroup *memcg,
 	mutex_lock(&vmpr->events_lock);
 	list_add(&ev->node, &vmpr->events);
 	mutex_unlock(&vmpr->events_lock);
+	ret = 0;
 out:
 	kfree(spec_orig);
 	return ret;
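The signedness bug fixed above is easy to reproduce in miniature: match_string() returns an int, and storing it straight into an enum whose values are all non-negative lets the compiler treat the `< 0` check as dead code. A minimal sketch of the anti-pattern and the safe shape, using a hypothetical enum and the kernel's match_string():

#include <linux/string.h>

enum demo_level { DEMO_LOW, DEMO_MEDIUM, DEMO_CRITICAL, DEMO_NUM_LEVELS };

static const char * const demo_level_names[] = { "low", "medium", "critical" };

static int demo_parse_level(const char *token, enum demo_level *level)
{
	int ret;

	/*
	 * Buggy shape: "*level = match_string(...); if (*level < 0)" can be
	 * optimised away because the enum may be unsigned. Keeping the int
	 * return value separate, as the patch does, is safe.
	 */
	ret = match_string(demo_level_names, DEMO_NUM_LEVELS, token);
	if (ret < 0)
		return ret;

	*level = ret;
	return 0;
}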
mm/vmscan.c (72 lines changed)
@@ -2459,17 +2459,70 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 	*lru_pages = 0;
 	for_each_evictable_lru(lru) {
 		int file = is_file_lru(lru);
-		unsigned long size;
+		unsigned long lruvec_size;
 		unsigned long scan;
+		unsigned long protection;
+
+		lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
+		protection = mem_cgroup_protection(memcg,
+						   sc->memcg_low_reclaim);
+
+		if (protection) {
+			/*
+			 * Scale a cgroup's reclaim pressure by proportioning
+			 * its current usage to its memory.low or memory.min
+			 * setting.
+			 *
+			 * This is important, as otherwise scanning aggression
+			 * becomes extremely binary -- from nothing as we
+			 * approach the memory protection threshold, to totally
+			 * nominal as we exceed it. This results in requiring
+			 * setting extremely liberal protection thresholds. It
+			 * also means we simply get no protection at all if we
+			 * set it too low, which is not ideal.
+			 *
+			 * If there is any protection in place, we reduce scan
+			 * pressure by how much of the total memory used is
+			 * within protection thresholds.
+			 *
+			 * There is one special case: in the first reclaim pass,
+			 * we skip over all groups that are within their low
+			 * protection. If that fails to reclaim enough pages to
+			 * satisfy the reclaim goal, we come back and override
+			 * the best-effort low protection. However, we still
+			 * ideally want to honor how well-behaved groups are in
+			 * that case instead of simply punishing them all
+			 * equally. As such, we reclaim them based on how much
+			 * memory they are using, reducing the scan pressure
+			 * again by how much of the total memory used is under
+			 * hard protection.
+			 */
+			unsigned long cgroup_size = mem_cgroup_size(memcg);
+
+			/* Avoid TOCTOU with earlier protection check */
+			cgroup_size = max(cgroup_size, protection);
+
+			scan = lruvec_size - lruvec_size * protection /
+				cgroup_size;
+
+			/*
+			 * Minimally target SWAP_CLUSTER_MAX pages to keep
+			 * reclaim moving forwards, avoiding decremeting
+			 * sc->priority further than desirable.
+			 */
+			scan = max(scan, SWAP_CLUSTER_MAX);
+		} else {
+			scan = lruvec_size;
+		}
+
+		scan >>= sc->priority;
 
-		size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
-		scan = size >> sc->priority;
 		/*
 		 * If the cgroup's already been deleted, make sure to
 		 * scrape out the remaining cache.
 		 */
 		if (!scan && !mem_cgroup_online(memcg))
-			scan = min(size, SWAP_CLUSTER_MAX);
+			scan = min(lruvec_size, SWAP_CLUSTER_MAX);
 
 		switch (scan_balance) {
 		case SCAN_EQUAL:
@@ -2489,7 +2542,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 		case SCAN_ANON:
 			/* Scan one type exclusively */
 			if ((scan_balance == SCAN_FILE) != file) {
-				size = 0;
+				lruvec_size = 0;
 				scan = 0;
 			}
 			break;
@@ -2498,7 +2551,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 			BUG();
 		}
 
-		*lru_pages += size;
+		*lru_pages += lruvec_size;
 		nr[lru] = scan;
 	}
 }
@@ -2742,6 +2795,13 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 			memcg_memory_event(memcg, MEMCG_LOW);
 			break;
 		case MEMCG_PROT_NONE:
+			/*
+			 * All protection thresholds breached. We may
+			 * still choose to vary the scan pressure
+			 * applied based on by how much the cgroup in
+			 * question has exceeded its protection
+			 * thresholds (see get_scan_count).
+			 */
 			break;
 		}
 
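A standalone sketch of the scan-target arithmetic added to get_scan_count() above; the numbers in the comments are hypothetical, and the real code additionally applies sc->priority and the cgroup-online handling shown in the hunk:

#include <linux/kernel.h>

#define DEMO_SWAP_CLUSTER_MAX 32UL

static unsigned long demo_scan_target(unsigned long lruvec_size,
				      unsigned long protection,
				      unsigned long usage)
{
	unsigned long scan;

	if (!protection)
		return lruvec_size;

	/* mirror the TOCTOU clamp: usage is never treated as below protection */
	usage = max(usage, protection);
	scan = lruvec_size - lruvec_size * protection / usage;

	/* e.g. protection = 100 MiB, usage = 150 MiB: scan is about a third of
	 * lruvec_size; at usage = 1 GiB it is about 90% of lruvec_size. */
	return max(scan, DEMO_SWAP_CLUSTER_MAX);
}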
mm/z3fold.c (10 lines changed)
@@ -998,9 +998,11 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
 	struct z3fold_header *zhdr;
 	struct page *page;
 	enum buddy bud;
+	bool page_claimed;
 
 	zhdr = handle_to_z3fold_header(handle);
 	page = virt_to_page(zhdr);
+	page_claimed = test_and_set_bit(PAGE_CLAIMED, &page->private);
 
 	if (test_bit(PAGE_HEADLESS, &page->private)) {
 		/* if a headless page is under reclaim, just leave.
@@ -1008,7 +1010,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
 		 * has not been set before, we release this page
 		 * immediately so we don't care about its value any more.
 		 */
-		if (!test_and_set_bit(PAGE_CLAIMED, &page->private)) {
+		if (!page_claimed) {
 			spin_lock(&pool->lock);
 			list_del(&page->lru);
 			spin_unlock(&pool->lock);
@@ -1044,13 +1046,15 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
 		atomic64_dec(&pool->pages_nr);
 		return;
 	}
-	if (test_bit(PAGE_CLAIMED, &page->private)) {
+	if (page_claimed) {
+		/* the page has not been claimed by us */
 		z3fold_page_unlock(zhdr);
 		return;
 	}
 	if (unlikely(PageIsolated(page)) ||
 	    test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
 		z3fold_page_unlock(zhdr);
+		clear_bit(PAGE_CLAIMED, &page->private);
 		return;
 	}
 	if (zhdr->cpu < 0 || !cpu_online(zhdr->cpu)) {
@@ -1060,10 +1064,12 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
 		zhdr->cpu = -1;
 		kref_get(&zhdr->refcount);
 		do_compact_page(zhdr, true);
+		clear_bit(PAGE_CLAIMED, &page->private);
 		return;
 	}
 	kref_get(&zhdr->refcount);
 	queue_work_on(zhdr->cpu, pool->compact_wq, &zhdr->work);
+	clear_bit(PAGE_CLAIMED, &page->private);
 	z3fold_page_unlock(zhdr);
 }
 
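The z3fold hunks above move the PAGE_CLAIMED test-and-set to the very start of z3fold_free() and remember its previous value, so the free path and the reclaim path (which also works with this bit) race for ownership exactly once; the loser backs off, and the winner clears the bit on the paths where it does not end up freeing the page. A hedged sketch of that claim-at-entry idiom, with made-up names:

#include <linux/bitops.h>

/* Whoever flips the bit from 0 to 1 owns the object; everyone else backs off. */
static bool demo_try_claim(unsigned long *flags_word, int claim_bit)
{
	return !test_and_set_bit(claim_bit, flags_word);
}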