Merge branch 'akpm' (patches from Andrew)
Merge misc fixes from Andrew Morton: "14 fixes"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
  mm, page_alloc: do not wake kswapd with zone lock held
  hugetlbfs: revert "use i_mmap_rwsem for more pmd sharing synchronization"
  hugetlbfs: revert "Use i_mmap_rwsem to fix page fault/truncate race"
  mm: page_mapped: don't assume compound page is huge or THP
  mm/memory.c: initialise mmu_notifier_range correctly
  tools/vm/page_owner: use page_owner_sort in the use example
  kasan: fix krealloc handling for tag-based mode
  kasan: make tag based mode work with CONFIG_HARDENED_USERCOPY
  kasan, arm64: use ARCH_SLAB_MINALIGN instead of manual aligning
  mm, memcg: fix reclaim deadlock with writeback
  mm/usercopy.c: no check page span for stack objects
  slab: alien caches must not be initialized if the allocation of the alien cache failed
  fork, memcg: fix cached_stacks case
  zram: idle writeback fixes and cleanup
commit a88cc8da02
@@ -122,11 +122,18 @@ Description:
		statistics (bd_count, bd_reads, bd_writes) in a format
		similar to block layer statistics file format.

What:		/sys/block/zram<id>/writeback_limit_enable
Date:		November 2018
Contact:	Minchan Kim <minchan@kernel.org>
Description:
		The writeback_limit_enable file is read-write and specifies
		whether the writeback_limit feature is enabled. "1" means
		enable the feature. "0" (no limit) is the initial state.

What:		/sys/block/zram<id>/writeback_limit
Date:		November 2018
Contact:	Minchan Kim <minchan@kernel.org>
Description:
		The writeback_limit file is read-write and specifies the maximum
		amount of writeback ZRAM can do. The limit could be changed
		in run time and "0" means disable the limit.
		No limit is the initial state.
		in run time.
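For reference, a minimal usage sketch of the two attributes described above; the device name zram0 and the budget value are illustrative and not taken from the commit itself:

    # enable the budget feature, then set a budget in 4KB units
    $ echo 1 > /sys/block/zram0/writeback_limit_enable
    $ echo 102400 > /sys/block/zram0/writeback_limit    # 102400 x 4KB = 400MB
    # remaining budget; "0" while the feature is enabled means no
    # further writeback is allowed until a new budget is written
    $ cat /sys/block/zram0/writeback_limit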
@@ -156,22 +156,23 @@ Per-device statistics are exported as various nodes under /sys/block/zram<id>/
A brief description of exported device attributes. For more details please
read Documentation/ABI/testing/sysfs-block-zram.

Name              access  description
----              ------  -----------
disksize          RW      show and set the device's disk size
initstate         RO      shows the initialization state of the device
reset             WO      trigger device reset
mem_used_max      WO      reset the `mem_used_max' counter (see later)
mem_limit         WO      specifies the maximum amount of memory ZRAM can use
                          to store the compressed data
writeback_limit   WO      specifies the maximum amount of write IO zram can
                          write out to backing device as 4KB unit
max_comp_streams  RW      the number of possible concurrent compress operations
comp_algorithm    RW      show and change the compression algorithm
compact           WO      trigger memory compaction
debug_stat        RO      this file is used for zram debugging purposes
backing_dev       RW      set up backend storage for zram to write out
idle              WO      mark allocated slot as idle
Name                    access  description
----                    ------  -----------
disksize                RW      show and set the device's disk size
initstate               RO      shows the initialization state of the device
reset                   WO      trigger device reset
mem_used_max            WO      reset the `mem_used_max' counter (see later)
mem_limit               WO      specifies the maximum amount of memory ZRAM can use
                                to store the compressed data
writeback_limit         WO      specifies the maximum amount of write IO zram can
                                write out to backing device as 4KB unit
writeback_limit_enable  RW      show and set writeback_limit feature
max_comp_streams        RW      the number of possible concurrent compress operations
comp_algorithm          RW      show and change the compression algorithm
compact                 WO      trigger memory compaction
debug_stat              RO      this file is used for zram debugging purposes
backing_dev             RW      set up backend storage for zram to write out
idle                    WO      mark allocated slot as idle

User space is advised to use the following files to read the device statistics.
@@ -280,32 +281,51 @@ With the command, zram writeback idle pages from memory to the storage.
If there is a lot of write IO to a flash device, it potentially has a
flash wearout problem, so the admin needs to design a write limitation
to guarantee storage health for the entire product life.
To overcome the concern, zram supports "writeback_limit".
The "writeback_limit"'s default value is 0 so that it doesn't limit
any writeback. If the admin wants to measure the writeback count in a certain
period, he could know it via /sys/block/zram0/bd_stat's 3rd column.

To overcome the concern, zram supports the "writeback_limit" feature.
The "writeback_limit_enable"'s default value is 0 so that it doesn't limit
any writeback. IOW, if the admin wants to apply a writeback budget, he should
enable writeback_limit_enable via

	$ echo 1 > /sys/block/zramX/writeback_limit_enable

Once writeback_limit_enable is set, zram doesn't allow any writeback
until the admin sets the budget via /sys/block/zramX/writeback_limit.

(If the admin doesn't enable writeback_limit_enable, the value
assigned via /sys/block/zramX/writeback_limit is meaningless.)

If the admin wants to limit writeback to 400M per day, he could do it
like below.

	MB_SHIFT=20
	4K_SHIFT=12
	echo $((400<<MB_SHIFT>>4K_SHIFT)) > \
		/sys/block/zram0/writeback_limit.
	$ MB_SHIFT=20
	$ 4K_SHIFT=12
	$ echo $((400<<MB_SHIFT>>4K_SHIFT)) > \
		/sys/block/zram0/writeback_limit.
	$ echo 1 > /sys/block/zram0/writeback_limit_enable

If the admin wants to allow further writes again, he could do it like below
If the admin wants to allow further writes again once the budget is exhausted,
he could do it like below

	echo 0 > /sys/block/zram0/writeback_limit
	$ echo $((400<<MB_SHIFT>>4K_SHIFT)) > \
		/sys/block/zram0/writeback_limit

If the admin wants to see the remaining writeback budget since it was set,

	cat /sys/block/zram0/writeback_limit
	$ cat /sys/block/zramX/writeback_limit

If the admin wants to disable the writeback limit, he could do

	$ echo 0 > /sys/block/zramX/writeback_limit_enable

The writeback_limit count will reset whenever you reset zram (e.g.,
system reboot, echo 1 > /sys/block/zramX/reset), so keeping track of how much
writeback happened until you reset the zram, in order to allocate extra
writeback budget in the next setting, is the user's job.

If the admin wants to measure the writeback count in a certain period, he could
know it via /sys/block/zram0/bd_stat's 3rd column.

= memory tracking

With CONFIG_ZRAM_MEMORY_TRACKING, user can know information of the
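A worked version of the 400M-per-day budget example above: 400 << 20 >> 12 = 102400, i.e. 102400 units of 4KB. This is only a sketch of the documented interface; note that 4K_SHIFT is not a valid shell variable name, so the sketch uses FOURK_SHIFT instead, which is this rewrite's assumption rather than part of the commit:

	$ MB_SHIFT=20
	$ FOURK_SHIFT=12
	$ echo $((400<<MB_SHIFT>>FOURK_SHIFT)) > /sys/block/zram0/writeback_limit
	$ cat /sys/block/zram0/writeback_limit
	102400
	$ echo 1 > /sys/block/zram0/writeback_limit_enable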
@@ -58,6 +58,12 @@
 */
#define ARCH_DMA_MINALIGN	(128)

#ifdef CONFIG_KASAN_SW_TAGS
#define ARCH_SLAB_MINALIGN	(1ULL << KASAN_SHADOW_SCALE_SHIFT)
#else
#define ARCH_SLAB_MINALIGN	__alignof__(unsigned long long)
#endif

#ifndef __ASSEMBLY__

#include <linux/bitops.h>
@@ -316,11 +316,9 @@ static ssize_t idle_store(struct device *dev,
	 * See the comment in writeback_store.
	 */
	zram_slot_lock(zram, index);
	if (!zram_allocated(zram, index) ||
			zram_test_flag(zram, index, ZRAM_UNDER_WB))
		goto next;
	zram_set_flag(zram, index, ZRAM_IDLE);
next:
	if (zram_allocated(zram, index) &&
			!zram_test_flag(zram, index, ZRAM_UNDER_WB))
		zram_set_flag(zram, index, ZRAM_IDLE);
	zram_slot_unlock(zram, index);
}

@@ -330,6 +328,41 @@ static ssize_t idle_store(struct device *dev,
}

#ifdef CONFIG_ZRAM_WRITEBACK
static ssize_t writeback_limit_enable_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t len)
{
	struct zram *zram = dev_to_zram(dev);
	u64 val;
	ssize_t ret = -EINVAL;

	if (kstrtoull(buf, 10, &val))
		return ret;

	down_read(&zram->init_lock);
	spin_lock(&zram->wb_limit_lock);
	zram->wb_limit_enable = val;
	spin_unlock(&zram->wb_limit_lock);
	up_read(&zram->init_lock);
	ret = len;

	return ret;
}

static ssize_t writeback_limit_enable_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	bool val;
	struct zram *zram = dev_to_zram(dev);

	down_read(&zram->init_lock);
	spin_lock(&zram->wb_limit_lock);
	val = zram->wb_limit_enable;
	spin_unlock(&zram->wb_limit_lock);
	up_read(&zram->init_lock);

	return scnprintf(buf, PAGE_SIZE, "%d\n", val);
}

static ssize_t writeback_limit_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t len)
{

@@ -341,9 +374,9 @@ static ssize_t writeback_limit_store(struct device *dev,
		return ret;

	down_read(&zram->init_lock);
	atomic64_set(&zram->stats.bd_wb_limit, val);
	if (val == 0)
		zram->stop_writeback = false;
	spin_lock(&zram->wb_limit_lock);
	zram->bd_wb_limit = val;
	spin_unlock(&zram->wb_limit_lock);
	up_read(&zram->init_lock);
	ret = len;

@@ -357,7 +390,9 @@ static ssize_t writeback_limit_show(struct device *dev,
	struct zram *zram = dev_to_zram(dev);

	down_read(&zram->init_lock);
	val = atomic64_read(&zram->stats.bd_wb_limit);
	spin_lock(&zram->wb_limit_lock);
	val = zram->bd_wb_limit;
	spin_unlock(&zram->wb_limit_lock);
	up_read(&zram->init_lock);

	return scnprintf(buf, PAGE_SIZE, "%llu\n", val);

@@ -588,8 +623,8 @@ static int read_from_bdev_async(struct zram *zram, struct bio_vec *bvec,
	return 1;
}

#define HUGE_WRITEBACK 0x1
#define IDLE_WRITEBACK 0x2
#define HUGE_WRITEBACK 1
#define IDLE_WRITEBACK 2

static ssize_t writeback_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t len)

@@ -602,7 +637,7 @@ static ssize_t writeback_store(struct device *dev,
	struct page *page;
	ssize_t ret, sz;
	char mode_buf[8];
	unsigned long mode = -1UL;
	int mode = -1;
	unsigned long blk_idx = 0;

	sz = strscpy(mode_buf, buf, sizeof(mode_buf));

@@ -618,7 +653,7 @@ static ssize_t writeback_store(struct device *dev,
	else if (!strcmp(mode_buf, "huge"))
		mode = HUGE_WRITEBACK;

	if (mode == -1UL)
	if (mode == -1)
		return -EINVAL;

	down_read(&zram->init_lock);

@@ -645,10 +680,13 @@ static ssize_t writeback_store(struct device *dev,
		bvec.bv_len = PAGE_SIZE;
		bvec.bv_offset = 0;

		if (zram->stop_writeback) {
		spin_lock(&zram->wb_limit_lock);
		if (zram->wb_limit_enable && !zram->bd_wb_limit) {
			spin_unlock(&zram->wb_limit_lock);
			ret = -EIO;
			break;
		}
		spin_unlock(&zram->wb_limit_lock);

		if (!blk_idx) {
			blk_idx = alloc_block_bdev(zram);

@@ -667,10 +705,11 @@ static ssize_t writeback_store(struct device *dev,
				zram_test_flag(zram, index, ZRAM_UNDER_WB))
			goto next;

		if ((mode & IDLE_WRITEBACK &&
			  !zram_test_flag(zram, index, ZRAM_IDLE)) &&
		    (mode & HUGE_WRITEBACK &&
			  !zram_test_flag(zram, index, ZRAM_HUGE)))
		if (mode == IDLE_WRITEBACK &&
			  !zram_test_flag(zram, index, ZRAM_IDLE))
			goto next;
		if (mode == HUGE_WRITEBACK &&
			  !zram_test_flag(zram, index, ZRAM_HUGE))
			goto next;
		/*
		 * Clearing ZRAM_UNDER_WB is duty of caller.

@@ -732,11 +771,10 @@ static ssize_t writeback_store(struct device *dev,
		zram_set_element(zram, index, blk_idx);
		blk_idx = 0;
		atomic64_inc(&zram->stats.pages_stored);
		if (atomic64_add_unless(&zram->stats.bd_wb_limit,
					-1 << (PAGE_SHIFT - 12), 0)) {
			if (atomic64_read(&zram->stats.bd_wb_limit) == 0)
				zram->stop_writeback = true;
		}
		spin_lock(&zram->wb_limit_lock);
		if (zram->wb_limit_enable && zram->bd_wb_limit > 0)
			zram->bd_wb_limit -=  1UL << (PAGE_SHIFT - 12);
		spin_unlock(&zram->wb_limit_lock);
next:
		zram_slot_unlock(zram, index);
	}

@@ -1812,6 +1850,7 @@ static DEVICE_ATTR_RW(comp_algorithm);
static DEVICE_ATTR_RW(backing_dev);
static DEVICE_ATTR_WO(writeback);
static DEVICE_ATTR_RW(writeback_limit);
static DEVICE_ATTR_RW(writeback_limit_enable);
#endif

static struct attribute *zram_disk_attrs[] = {

@@ -1828,6 +1867,7 @@ static struct attribute *zram_disk_attrs[] = {
	&dev_attr_backing_dev.attr,
	&dev_attr_writeback.attr,
	&dev_attr_writeback_limit.attr,
	&dev_attr_writeback_limit_enable.attr,
#endif
	&dev_attr_io_stat.attr,
	&dev_attr_mm_stat.attr,

@@ -1867,7 +1907,9 @@ static int zram_add(void)
	device_id = ret;

	init_rwsem(&zram->init_lock);

#ifdef CONFIG_ZRAM_WRITEBACK
	spin_lock_init(&zram->wb_limit_lock);
#endif
	queue = blk_alloc_queue(GFP_KERNEL);
	if (!queue) {
		pr_err("Error allocating disk queue for device %d\n",
@@ -86,7 +86,6 @@ struct zram_stats {
	atomic64_t bd_count;		/* no. of pages in backing device */
	atomic64_t bd_reads;		/* no. of reads from backing device */
	atomic64_t bd_writes;		/* no. of writes from backing device */
	atomic64_t bd_wb_limit;		/* writeback limit of backing device */
#endif
};

@@ -114,8 +113,10 @@ struct zram {
	 */
	bool claim; /* Protected by bdev->bd_mutex */
	struct file *backing_dev;
	bool stop_writeback;
#ifdef CONFIG_ZRAM_WRITEBACK
	spinlock_t wb_limit_lock;
	bool wb_limit_enable;
	u64 bd_wb_limit;
	struct block_device *bdev;
	unsigned int old_block_size;
	unsigned long *bitmap;
@@ -383,16 +383,17 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end)
 * truncation is indicated by end of range being LLONG_MAX
 *	In this case, we first scan the range and release found pages.
 *	After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
 *	maps and global counts.
 *	maps and global counts.  Page faults can not race with truncation
 *	in this routine.  hugetlb_no_page() prevents page faults in the
 *	truncated range.  It checks i_size before allocation, and again after
 *	with the page table lock for the page held.  The same lock must be
 *	acquired to unmap a page.
 * hole punch is indicated if end is not LLONG_MAX
 *	In the hole punch case we scan the range and release found pages.
 *	Only when releasing a page is the associated region/reserv map
 *	deleted.  The region/reserv map for ranges without associated
 *	pages are not modified.
 *
 * Callers of this routine must hold the i_mmap_rwsem in write mode to prevent
 * races with page faults.
 *
 *	pages are not modified.  Page faults can race with hole punch.
 *	This is indicated if we find a mapped page.
 * Note: If the passed end of range value is beyond the end of file, but
 * not LLONG_MAX this routine still performs a hole punch operation.
 */

@@ -422,14 +423,32 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,

		for (i = 0; i < pagevec_count(&pvec); ++i) {
			struct page *page = pvec.pages[i];
			u32 hash;

			index = page->index;
			hash = hugetlb_fault_mutex_hash(h, current->mm,
							&pseudo_vma,
							mapping, index, 0);
			mutex_lock(&hugetlb_fault_mutex_table[hash]);

			/*
			 * A mapped page is impossible as callers should unmap
			 * all references before calling.  And, i_mmap_rwsem
			 * prevents the creation of additional mappings.
			 * If page is mapped, it was faulted in after being
			 * unmapped in caller.  Unmap (again) now after taking
			 * the fault mutex.  The mutex will prevent faults
			 * until we finish removing the page.
			 *
			 * This race can only happen in the hole punch case.
			 * Getting here in a truncate operation is a bug.
			 */
			VM_BUG_ON(page_mapped(page));
			if (unlikely(page_mapped(page))) {
				BUG_ON(truncate_op);

				i_mmap_lock_write(mapping);
				hugetlb_vmdelete_list(&mapping->i_mmap,
					index * pages_per_huge_page(h),
					(index + 1) * pages_per_huge_page(h));
				i_mmap_unlock_write(mapping);
			}

			lock_page(page);
			/*

@@ -451,6 +470,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
			}

			unlock_page(page);
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
		}
		huge_pagevec_release(&pvec);
		cond_resched();

@@ -462,20 +482,9 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,

static void hugetlbfs_evict_inode(struct inode *inode)
{
	struct address_space *mapping = inode->i_mapping;
	struct resv_map *resv_map;

	/*
	 * The vfs layer guarantees that there are no other users of this
	 * inode.  Therefore, it would be safe to call remove_inode_hugepages
	 * without holding i_mmap_rwsem.  We acquire and hold here to be
	 * consistent with other callers.  Since there will be no contention
	 * on the semaphore, overhead is negligible.
	 */
	i_mmap_lock_write(mapping);
	remove_inode_hugepages(inode, 0, LLONG_MAX);
	i_mmap_unlock_write(mapping);

	resv_map = (struct resv_map *)inode->i_mapping->private_data;
	/* root inode doesn't have the resv_map, so we should check it */
	if (resv_map)

@@ -496,8 +505,8 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
	i_mmap_lock_write(mapping);
	if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
		hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
	remove_inode_hugepages(inode, offset, LLONG_MAX);
	i_mmap_unlock_write(mapping);
	remove_inode_hugepages(inode, offset, LLONG_MAX);
	return 0;
}

@@ -531,8 +540,8 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
			hugetlb_vmdelete_list(&mapping->i_mmap,
						hole_start >> PAGE_SHIFT,
						hole_end  >> PAGE_SHIFT);
		remove_inode_hugepages(inode, hole_start, hole_end);
		i_mmap_unlock_write(mapping);
		remove_inode_hugepages(inode, hole_start, hole_end);
		inode_unlock(inode);
	}

@@ -615,11 +624,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
		/* addr is the offset within the file (zero based) */
		addr = index * hpage_size;

		/*
		 * fault mutex taken here, protects against fault path
		 * and hole punch.  inode_lock previously taken protects
		 * against truncation.
		 */
		/* mutex taken here, fault path and hole punch */
		hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping,
						index, addr);
		mutex_lock(&hugetlb_fault_mutex_table[hash]);
@@ -520,6 +520,12 @@ enum pgdat_flags {
	PGDAT_RECLAIM_LOCKED,		/* prevents concurrent reclaim */
};

enum zone_flags {
	ZONE_BOOSTED_WATERMARK,		/* zone recently boosted watermarks.
					 * Cleared when kswapd is woken.
					 */
};

static inline unsigned long zone_managed_pages(struct zone *zone)
{
	return (unsigned long)atomic_long_read(&zone->managed_pages);
@@ -217,6 +217,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
		memset(s->addr, 0, THREAD_SIZE);

		tsk->stack_vm_area = s;
		tsk->stack = s->addr;
		return s->addr;
	}
mm/hugetlb.c
@@ -3238,7 +3238,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
	struct page *ptepage;
	unsigned long addr;
	int cow;
	struct address_space *mapping = vma->vm_file->f_mapping;
	struct hstate *h = hstate_vma(vma);
	unsigned long sz = huge_page_size(h);
	struct mmu_notifier_range range;

@@ -3250,23 +3249,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
		mmu_notifier_range_init(&range, src, vma->vm_start,
					vma->vm_end);
		mmu_notifier_invalidate_range_start(&range);
	} else {
		/*
		 * For shared mappings i_mmap_rwsem must be held to call
		 * huge_pte_alloc, otherwise the returned ptep could go
		 * away if part of a shared pmd and another thread calls
		 * huge_pmd_unshare.
		 */
		i_mmap_lock_read(mapping);
	}

	for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
		spinlock_t *src_ptl, *dst_ptl;

		src_pte = huge_pte_offset(src, addr, sz);
		if (!src_pte)
			continue;

		dst_pte = huge_pte_alloc(dst, addr, sz);
		if (!dst_pte) {
			ret = -ENOMEM;

@@ -3337,8 +3326,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,

	if (cow)
		mmu_notifier_invalidate_range_end(&range);
	else
		i_mmap_unlock_read(mapping);

	return ret;
}

@@ -3755,16 +3742,16 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
	}

	/*
	 * We can not race with truncation due to holding i_mmap_rwsem.
	 * Check once here for faults beyond end of file.
	 * Use page lock to guard against racing truncation
	 * before we get page_table_lock.
	 */
	size = i_size_read(mapping->host) >> huge_page_shift(h);
	if (idx >= size)
		goto out;

retry:
	page = find_lock_page(mapping, idx);
	if (!page) {
		size = i_size_read(mapping->host) >> huge_page_shift(h);
		if (idx >= size)
			goto out;

		/*
		 * Check for page in userfault range
		 */

@@ -3784,18 +3771,14 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
			};

			/*
			 * hugetlb_fault_mutex and i_mmap_rwsem must be
			 * dropped before handling userfault.  Reacquire
			 * after handling fault to make calling code simpler.
			 * hugetlb_fault_mutex must be dropped before
			 * handling userfault.  Reacquire after handling
			 * fault to make calling code simpler.
			 */
			hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping,
							idx, haddr);
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			i_mmap_unlock_read(mapping);

			ret = handle_userfault(&vmf, VM_UFFD_MISSING);

			i_mmap_lock_read(mapping);
			mutex_lock(&hugetlb_fault_mutex_table[hash]);
			goto out;
		}

@@ -3854,6 +3837,9 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
	}

	ptl = huge_pte_lock(h, mm, ptep);
	size = i_size_read(mapping->host) >> huge_page_shift(h);
	if (idx >= size)
		goto backout;

	ret = 0;
	if (!huge_pte_none(huge_ptep_get(ptep)))

@@ -3940,11 +3926,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,

	ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
	if (ptep) {
		/*
		 * Since we hold no locks, ptep could be stale.  That is
		 * OK as we are only making decisions based on content and
		 * not actually modifying content here.
		 */
		entry = huge_ptep_get(ptep);
		if (unlikely(is_hugetlb_entry_migration(entry))) {
			migration_entry_wait_huge(vma, mm, ptep);

@@ -3952,33 +3933,20 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
		} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
			return VM_FAULT_HWPOISON_LARGE |
				VM_FAULT_SET_HINDEX(hstate_index(h));
	} else {
		ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
		if (!ptep)
			return VM_FAULT_OOM;
	}

	/*
	 * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
	 * until finished with ptep.  This serves two purposes:
	 * 1) It prevents huge_pmd_unshare from being called elsewhere
	 *    and making the ptep no longer valid.
	 * 2) It synchronizes us with file truncation.
	 *
	 * ptep could have already be assigned via huge_pte_offset.  That
	 * is OK, as huge_pte_alloc will return the same value unless
	 * something changed.
	 */
	mapping = vma->vm_file->f_mapping;
	i_mmap_lock_read(mapping);
	ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
	if (!ptep) {
		i_mmap_unlock_read(mapping);
		return VM_FAULT_OOM;
	}
	idx = vma_hugecache_offset(h, vma, haddr);

	/*
	 * Serialize hugepage allocation and instantiation, so that we don't
	 * get spurious allocation failures if two CPUs race to instantiate
	 * the same page in the page cache.
	 */
	idx = vma_hugecache_offset(h, vma, haddr);
	hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr);
	mutex_lock(&hugetlb_fault_mutex_table[hash]);

@@ -4066,7 +4034,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
	}
out_mutex:
	mutex_unlock(&hugetlb_fault_mutex_table[hash]);
	i_mmap_unlock_read(mapping);
	/*
	 * Generally it's safe to hold refcount during waiting page lock. But
	 * here we just wait to defer the next page fault to avoid busy loop and

@@ -4671,12 +4638,10 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
 * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
 * and returns the corresponding pte. While this is not necessary for the
 * !shared pmd case because we can allocate the pmd later as well, it makes the
 * code much cleaner.
 *
 * This routine must be called with i_mmap_rwsem held in at least read mode.
 * For hugetlbfs, this prevents removal of any page table entries associated
 * with the address space.  This is important as we are setting up sharing
 * based on existing page table entries (mappings).
 * code much cleaner. pmd allocation is essential for the shared case because
 * pud has to be populated inside the same i_mmap_rwsem section - otherwise
 * racing tasks could either miss the sharing (see huge_pte_offset) or select a
 * bad pmd for sharing.
 */
pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
{

@@ -4693,6 +4658,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
	if (!vma_shareable(vma, addr))
		return (pte_t *)pmd_alloc(mm, pud, addr);

	i_mmap_lock_write(mapping);
	vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
		if (svma == vma)
			continue;

@@ -4722,6 +4688,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
	spin_unlock(ptl);
out:
	pte = (pte_t *)pmd_alloc(mm, pud, addr);
	i_mmap_unlock_write(mapping);
	return pte;
}

@@ -4732,7 +4699,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 * indicated by page_count > 1, unmap is achieved by clearing pud and
 * decrementing the ref count. If count == 1, the pte page is not shared.
 *
 * Called with page table lock held and i_mmap_rwsem held in write mode.
 * called with page table lock held.
 *
 * returns: 1 successfully unmapped a shared pte page
 *	    0 the underlying pte page is not shared, or it is the last user
@@ -298,8 +298,6 @@ void kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
		return;
	}

	cache->align = round_up(cache->align, KASAN_SHADOW_SCALE_SIZE);

	*flags |= SLAB_KASAN;
}

@@ -349,28 +347,43 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object)
}

/*
 * Since it's desirable to only call object contructors once during slab
 * allocation, we preassign tags to all such objects. Also preassign tags for
 * SLAB_TYPESAFE_BY_RCU slabs to avoid use-after-free reports.
 * For SLAB allocator we can't preassign tags randomly since the freelist is
 * stored as an array of indexes instead of a linked list. Assign tags based
 * on objects indexes, so that objects that are next to each other get
 * different tags.
 * After a tag is assigned, the object always gets allocated with the same tag.
 * The reason is that we can't change tags for objects with constructors on
 * reallocation (even for non-SLAB_TYPESAFE_BY_RCU), because the constructor
 * code can save the pointer to the object somewhere (e.g. in the object
 * itself). Then if we retag it, the old saved pointer will become invalid.
 * This function assigns a tag to an object considering the following:
 * 1. A cache might have a constructor, which might save a pointer to a slab
 *    object somewhere (e.g. in the object itself). We preassign a tag for
 *    each object in caches with constructors during slab creation and reuse
 *    the same tag each time a particular object is allocated.
 * 2. A cache might be SLAB_TYPESAFE_BY_RCU, which means objects can be
 *    accessed after being freed. We preassign tags for objects in these
 *    caches as well.
 * 3. For SLAB allocator we can't preassign tags randomly since the freelist
 *    is stored as an array of indexes instead of a linked list. Assign tags
 *    based on objects indexes, so that objects that are next to each other
 *    get different tags.
 */
static u8 assign_tag(struct kmem_cache *cache, const void *object, bool new)
static u8 assign_tag(struct kmem_cache *cache, const void *object,
			bool init, bool krealloc)
{
	if (!cache->ctor && !(cache->flags & SLAB_TYPESAFE_BY_RCU))
		return new ? KASAN_TAG_KERNEL : random_tag();
	/* Reuse the same tag for krealloc'ed objects. */
	if (krealloc)
		return get_tag(object);

	/*
	 * If the cache neither has a constructor nor has SLAB_TYPESAFE_BY_RCU
	 * set, assign a tag when the object is being allocated (init == false).
	 */
	if (!cache->ctor && !(cache->flags & SLAB_TYPESAFE_BY_RCU))
		return init ? KASAN_TAG_KERNEL : random_tag();

	/* For caches that either have a constructor or SLAB_TYPESAFE_BY_RCU: */
#ifdef CONFIG_SLAB
	/* For SLAB assign tags based on the object index in the freelist. */
	return (u8)obj_to_index(cache, virt_to_page(object), (void *)object);
#else
	return new ? random_tag() : get_tag(object);
	/*
	 * For SLUB assign a random tag during slab creation, otherwise reuse
	 * the already assigned tag.
	 */
	return init ? random_tag() : get_tag(object);
#endif
}

@@ -386,7 +399,8 @@ void * __must_check kasan_init_slab_obj(struct kmem_cache *cache,
	__memset(alloc_info, 0, sizeof(*alloc_info));

	if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
		object = set_tag(object, assign_tag(cache, object, true));
		object = set_tag(object,
				assign_tag(cache, object, true, false));

	return (void *)object;
}

@@ -452,8 +466,8 @@ bool kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip)
	return __kasan_slab_free(cache, object, ip, true);
}

void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object,
				size_t size, gfp_t flags)
static void *__kasan_kmalloc(struct kmem_cache *cache, const void *object,
				size_t size, gfp_t flags, bool krealloc)
{
	unsigned long redzone_start;
	unsigned long redzone_end;

@@ -471,7 +485,7 @@ void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object,
				KASAN_SHADOW_SCALE_SIZE);

	if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
		tag = assign_tag(cache, object, false);
		tag = assign_tag(cache, object, false, krealloc);

	/* Tag is ignored in set_tag without CONFIG_KASAN_SW_TAGS */
	kasan_unpoison_shadow(set_tag(object, tag), size);

@@ -483,6 +497,12 @@ void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object,

	return set_tag(object, tag);
}

void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object,
				size_t size, gfp_t flags)
{
	return __kasan_kmalloc(cache, object, size, flags, false);
}
EXPORT_SYMBOL(kasan_kmalloc);

void * __must_check kasan_kmalloc_large(const void *ptr, size_t size,

@@ -522,7 +542,8 @@ void * __must_check kasan_krealloc(const void *object, size_t size, gfp_t flags)
	if (unlikely(!PageSlab(page)))
		return kasan_kmalloc_large(object, size, flags);
	else
		return kasan_kmalloc(page->slab_cache, object, size, flags);
		return __kasan_kmalloc(page->slab_cache, object, size,
						flags, true);
}

void kasan_poison_kfree(void *ptr, unsigned long ip)
@@ -966,7 +966,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
	enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
	struct address_space *mapping;
	LIST_HEAD(tokill);
	bool unmap_success = true;
	bool unmap_success;
	int kill = 1, forcekill;
	struct page *hpage = *hpagep;
	bool mlocked = PageMlocked(hpage);

@@ -1028,19 +1028,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
	if (kill)
		collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);

	if (!PageHuge(hpage)) {
		unmap_success = try_to_unmap(hpage, ttu);
	} else if (mapping) {
		/*
		 * For hugetlb pages, try_to_unmap could potentially call
		 * huge_pmd_unshare.  Because of this, take semaphore in
		 * write mode here and set TTU_RMAP_LOCKED to indicate we
		 * have taken the lock at this higer level.
		 */
		i_mmap_lock_write(mapping);
		unmap_success = try_to_unmap(hpage, ttu|TTU_RMAP_LOCKED);
		i_mmap_unlock_write(mapping);
	}
	unmap_success = try_to_unmap(hpage, ttu);
	if (!unmap_success)
		pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
		       pfn, page_mapcount(hpage));
mm/memory.c
@@ -2994,6 +2994,28 @@ static vm_fault_t __do_fault(struct vm_fault *vmf)
	struct vm_area_struct *vma = vmf->vma;
	vm_fault_t ret;

	/*
	 * Preallocate pte before we take page_lock because this might lead to
	 * deadlocks for memcg reclaim which waits for pages under writeback:
	 *				lock_page(A)
	 *				SetPageWriteback(A)
	 *				unlock_page(A)
	 * lock_page(B)
	 *				lock_page(B)
	 * pte_alloc_pne
	 *   shrink_page_list
	 *     wait_on_page_writeback(A)
	 *				SetPageWriteback(B)
	 *				unlock_page(B)
	 *				# flush A, B to clear the writeback
	 */
	if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
		vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
		if (!vmf->prealloc_pte)
			return VM_FAULT_OOM;
		smp_wmb(); /* See comment in __pte_alloc() */
	}

	ret = vma->vm_ops->fault(vmf);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
			    VM_FAULT_DONE_COW)))

@@ -4077,8 +4099,8 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
		goto out;

	if (range) {
		range->start = address & PAGE_MASK;
		range->end = range->start + PAGE_SIZE;
		mmu_notifier_range_init(range, mm, address & PAGE_MASK,
				     (address & PAGE_MASK) + PAGE_SIZE);
		mmu_notifier_invalidate_range_start(range);
	}
	ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
mm/migrate.c
@@ -1324,19 +1324,8 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
		goto put_anon;

	if (page_mapped(hpage)) {
		struct address_space *mapping = page_mapping(hpage);

		/*
		 * try_to_unmap could potentially call huge_pmd_unshare.
		 * Because of this, take semaphore in write mode here and
		 * set TTU_RMAP_LOCKED to let lower levels know we have
		 * taken the lock.
		 */
		i_mmap_lock_write(mapping);
		try_to_unmap(hpage,
			TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS|
			TTU_RMAP_LOCKED);
		i_mmap_unlock_write(mapping);
			TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
		page_was_mapped = 1;
	}
@@ -2214,7 +2214,7 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page,
	 */
	boost_watermark(zone);
	if (alloc_flags & ALLOC_KSWAPD)
		wakeup_kswapd(zone, 0, 0, zone_idx(zone));
		set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);

	/* We are not allowed to try stealing from the whole block */
	if (!whole_block)

@@ -3102,6 +3102,12 @@ struct page *rmqueue(struct zone *preferred_zone,
	local_irq_restore(flags);

out:
	/* Separate test+clear to avoid unnecessary atomics */
	if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) {
		clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
		wakeup_kswapd(zone, 0, 0, zone_idx(zone));
	}

	VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
	return page;
@@ -25,7 +25,6 @@
 *     page->flags PG_locked (lock_page)
 *       hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
 *         mapping->i_mmap_rwsem
 *           hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
 *           anon_vma->rwsem
 *             mm->page_table_lock or pte_lock
 *               zone_lru_lock (in mark_page_accessed, isolate_lru_page)

@@ -1379,9 +1378,6 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
		/*
		 * If sharing is possible, start and end will be adjusted
		 * accordingly.
		 *
		 * If called for a huge page, caller must hold i_mmap_rwsem
		 * in write mode as it is possible to call huge_pmd_unshare.
		 */
		adjust_range_if_pmd_sharing_possible(vma, &range.start,
						     &range.end);
@@ -666,8 +666,10 @@ static struct alien_cache *__alloc_alien_cache(int node, int entries,
	struct alien_cache *alc = NULL;

	alc = kmalloc_node(memsize, gfp, node);
	init_arraycache(&alc->ac, entries, batch);
	spin_lock_init(&alc->lock);
	if (alc) {
		init_arraycache(&alc->ac, entries, batch);
		spin_lock_init(&alc->lock);
	}
	return alc;
}
@@ -3846,6 +3846,8 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
	unsigned int offset;
	size_t object_size;

	ptr = kasan_reset_tag(ptr);

	/* Find object and usable object size. */
	s = page->slab_cache;
@@ -247,7 +247,8 @@ static DEFINE_STATIC_KEY_FALSE_RO(bypass_usercopy_checks);
/*
 * Validates that the given object is:
 * - not bogus address
 * - known-safe heap or stack object
 * - fully contained by stack (or stack frame, when available)
 * - fully within SLAB object (or object whitelist area, when available)
 * - not in kernel text
 */
void __check_object_size(const void *ptr, unsigned long n, bool to_user)

@@ -262,9 +263,6 @@ void __check_object_size(const void *ptr, unsigned long n, bool to_user)
	/* Check for invalid addresses. */
	check_bogus_address((const unsigned long)ptr, n, to_user);

	/* Check for bad heap object. */
	check_heap_object(ptr, n, to_user);

	/* Check for bad stack object. */
	switch (check_stack_object(ptr, n)) {
	case NOT_STACK:

@@ -282,6 +280,9 @@ void __check_object_size(const void *ptr, unsigned long n, bool to_user)
		usercopy_abort("process stack", NULL, to_user, 0, n);
	}

	/* Check for bad heap object. */
	check_heap_object(ptr, n, to_user);

	/* Check for object in kernel to avoid text exposure. */
	check_kernel_text_object((const unsigned long)ptr, n, to_user);
}
@@ -267,14 +267,10 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
		VM_BUG_ON(dst_addr & ~huge_page_mask(h));

		/*
		 * Serialize via i_mmap_rwsem and hugetlb_fault_mutex.
		 * i_mmap_rwsem ensures the dst_pte remains valid even
		 * in the case of shared pmds.  fault mutex prevents
		 * races with other faulting threads.
		 * Serialize via hugetlb_fault_mutex
		 */
		mapping = dst_vma->vm_file->f_mapping;
		i_mmap_lock_read(mapping);
		idx = linear_page_index(dst_vma, dst_addr);
		mapping = dst_vma->vm_file->f_mapping;
		hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping,
								idx, dst_addr);
		mutex_lock(&hugetlb_fault_mutex_table[hash]);

@@ -283,7 +279,6 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
		dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h));
		if (!dst_pte) {
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			i_mmap_unlock_read(mapping);
			goto out_unlock;
		}

@@ -291,7 +286,6 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
		dst_pteval = huge_ptep_get(dst_pte);
		if (!huge_pte_none(dst_pteval)) {
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			i_mmap_unlock_read(mapping);
			goto out_unlock;
		}

@@ -299,7 +293,6 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
						dst_addr, src_addr, &page);

		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
		i_mmap_unlock_read(mapping);
		vm_alloc_shared = vm_shared;

		cond_resched();
@@ -478,7 +478,7 @@ bool page_mapped(struct page *page)
		return true;
	if (PageHuge(page))
		return false;
	for (i = 0; i < hpage_nr_pages(page); i++) {
	for (i = 0; i < (1 << compound_order(page)); i++) {
		if (atomic_read(&page[i]._mapcount) >= 0)
			return true;
	}
@@ -5,7 +5,9 @@
 * Example use:
 * cat /sys/kernel/debug/page_owner > page_owner_full.txt
 * grep -v ^PFN page_owner_full.txt > page_owner.txt
 * ./sort page_owner.txt sorted_page_owner.txt
 * ./page_owner_sort page_owner.txt sorted_page_owner.txt
 *
 * See Documentation/vm/page_owner.rst
 */

#include <stdio.h>
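For context, the corrected workflow from the comment above, as the commands an admin would actually run; the debugfs mount point /sys/kernel/debug is the usual default and is an assumption here, not something stated by the commit:

	$ cat /sys/kernel/debug/page_owner > page_owner_full.txt
	$ grep -v ^PFN page_owner_full.txt > page_owner.txt
	$ ./page_owner_sort page_owner.txt sorted_page_owner.txt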