Merge branch 'akpm' (patches from Andrew)
Merge misc fixes from Andrew Morton:
 "14 fixes"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
  mm, page_alloc: do not wake kswapd with zone lock held
  hugetlbfs: revert "use i_mmap_rwsem for more pmd sharing synchronization"
  hugetlbfs: revert "Use i_mmap_rwsem to fix page fault/truncate race"
  mm: page_mapped: don't assume compound page is huge or THP
  mm/memory.c: initialise mmu_notifier_range correctly
  tools/vm/page_owner: use page_owner_sort in the use example
  kasan: fix krealloc handling for tag-based mode
  kasan: make tag based mode work with CONFIG_HARDENED_USERCOPY
  kasan, arm64: use ARCH_SLAB_MINALIGN instead of manual aligning
  mm, memcg: fix reclaim deadlock with writeback
  mm/usercopy.c: no check page span for stack objects
  slab: alien caches must not be initialized if the allocation of the alien cache failed
  fork, memcg: fix cached_stacks case
  zram: idle writeback fixes and cleanup
commit a88cc8da02
@@ -122,11 +122,18 @@ Description:
 		statistics (bd_count, bd_reads, bd_writes) in a format
 		similar to block layer statistics file format.
 
+What:		/sys/block/zram<id>/writeback_limit_enable
+Date:		November 2018
+Contact:	Minchan Kim <minchan@kernel.org>
+Description:
+		The writeback_limit_enable file is read-write and specifies
+		whether the writeback_limit feature is enabled. "1" means the
+		feature is enabled. No limit ("0") is the initial state.
+
 What:		/sys/block/zram<id>/writeback_limit
 Date:		November 2018
 Contact:	Minchan Kim <minchan@kernel.org>
 Description:
 		The writeback_limit file is read-write and specifies the maximum
 		amount of writeback ZRAM can do. The limit could be changed
-		in run time and "0" means disable the limit.
-		No limit is the initial state.
+		in run time.
 
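For context, the two sysfs knobs documented above are meant to be used together. A minimal shell sketch (assuming a zram0 device that already has a backing device configured; the 400 MB budget is only an example, expressed in 4KB units as the ABI requires):

	$ echo 1 > /sys/block/zram0/writeback_limit_enable        # turn the budget mechanism on
	$ echo $((400 << 20 >> 12)) > /sys/block/zram0/writeback_limit   # 400 MB budget in 4KB units
	$ cat /sys/block/zram0/writeback_limit                    # remaining budget, also in 4KB units
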
@@ -166,6 +166,7 @@ mem_limit         WO    specifies the maximum amount of memory ZRAM can use
                         to store the compressed data
 writeback_limit   WO    specifies the maximum amount of write IO zram can
                         write out to backing device as 4KB unit
+writeback_limit_enable  RW    show and set writeback_limit feature
 max_comp_streams  RW    the number of possible concurrent compress operations
 comp_algorithm    RW    show and change the compression algorithm
 compact           WO    trigger memory compaction
 
@@ -280,32 +281,51 @@ With the command, zram writeback idle pages from memory to the storage.
 If there are lots of write IO with flash device, potentially, it has
 flash wearout problem so that admin needs to design write limitation
 to guarantee storage health for entire product life.
-To overcome the concern, zram supports "writeback_limit".
-The "writeback_limit"'s default value is 0 so that it doesn't limit
-any writeback. If admin want to measure writeback count in a certain
-period, he could know it via /sys/block/zram0/bd_stat's 3rd column.
+To overcome the concern, zram supports "writeback_limit" feature.
+The "writeback_limit_enable"'s default value is 0 so that it doesn't limit
+any writeback. IOW, if admin want to apply writeback budget, he should
+enable writeback_limit_enable via
+
+	$ echo 1 > /sys/block/zramX/writeback_limit_enable
+
+Once writeback_limit_enable is set, zram doesn't allow any writeback
+until admin set the budget via /sys/block/zramX/writeback_limit.
+
+(If admin doesn't enable writeback_limit_enable, writeback_limit's value
+assigned via /sys/block/zramX/writeback_limit is meaningless.)
 
 If admin want to limit writeback as per-day 400M, he could do it
 like below.
 
-	MB_SHIFT=20
-	4K_SHIFT=12
-	echo $((400<<MB_SHIFT>>4K_SHIFT)) > \
+	$ MB_SHIFT=20
+	$ 4K_SHIFT=12
+	$ echo $((400<<MB_SHIFT>>4K_SHIFT)) > \
 		/sys/block/zram0/writeback_limit.
+	$ echo 1 > /sys/block/zram0/writeback_limit_enable
 
-If admin want to allow further write again, he could do it like below
+If admin want to allow further write again once the budget is exhausted,
+he could do it like below
 
-	echo 0 > /sys/block/zram0/writeback_limit
+	$ echo $((400<<MB_SHIFT>>4K_SHIFT)) > \
+		/sys/block/zram0/writeback_limit
 
 If admin want to see remaining writeback budget since he set,
 
-	cat /sys/block/zram0/writeback_limit
+	$ cat /sys/block/zramX/writeback_limit
+
+If admin want to disable writeback limit, he could do
+
+	$ echo 0 > /sys/block/zramX/writeback_limit_enable
 
 The writeback_limit count will reset whenever you reset zram(e.g.,
 system reboot, echo 1 > /sys/block/zramX/reset) so keeping how many of
 writeback happened until you reset the zram to allocate extra writeback
 budget in next setting is user's job.
 
+If admin want to measure writeback count in a certain period, he could
+know it via /sys/block/zram0/bd_stat's 3rd column.
+
 = memory tracking
 
 With CONFIG_ZRAM_MEMORY_TRACKING, user can know information of the
 
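As a usage note on the budget workflow in the hunk above: a small, hypothetical helper script (not part of this patch; it assumes zram0 and a 400 MB per-day budget) that an admin could run once a day, for example from cron, to refresh the budget and log how many pages have been written back so far via bd_stat's third column:

	#!/bin/sh
	# Hypothetical daily refresh of the zram writeback budget.
	ZRAM=/sys/block/zram0
	BUDGET=$((400 << 20 >> 12))          # 400 MB expressed in 4KB units

	# bd_stat columns: bd_count bd_reads bd_writes (all in 4KB units)
	written=$(awk '{ print $3 }' "$ZRAM"/bd_stat)
	echo "pages written back so far: $written"

	echo 1 > "$ZRAM"/writeback_limit_enable
	echo "$BUDGET" > "$ZRAM"/writeback_limit
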
@@ -58,6 +58,12 @@
  */
 #define ARCH_DMA_MINALIGN	(128)
 
+#ifdef CONFIG_KASAN_SW_TAGS
+#define ARCH_SLAB_MINALIGN	(1ULL << KASAN_SHADOW_SCALE_SHIFT)
+#else
+#define ARCH_SLAB_MINALIGN	__alignof__(unsigned long long)
+#endif
+
 #ifndef __ASSEMBLY__
 
 #include <linux/bitops.h>
 
@@ -316,11 +316,9 @@ static ssize_t idle_store(struct device *dev,
 		 * See the comment in writeback_store.
 		 */
 		zram_slot_lock(zram, index);
-		if (!zram_allocated(zram, index) ||
-			zram_test_flag(zram, index, ZRAM_UNDER_WB))
-			goto next;
+		if (zram_allocated(zram, index) &&
+				!zram_test_flag(zram, index, ZRAM_UNDER_WB))
 			zram_set_flag(zram, index, ZRAM_IDLE);
-next:
 		zram_slot_unlock(zram, index);
 	}
 
@@ -330,6 +328,41 @@ static ssize_t idle_store(struct device *dev,
 }
 
 #ifdef CONFIG_ZRAM_WRITEBACK
+static ssize_t writeback_limit_enable_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t len)
+{
+	struct zram *zram = dev_to_zram(dev);
+	u64 val;
+	ssize_t ret = -EINVAL;
+
+	if (kstrtoull(buf, 10, &val))
+		return ret;
+
+	down_read(&zram->init_lock);
+	spin_lock(&zram->wb_limit_lock);
+	zram->wb_limit_enable = val;
+	spin_unlock(&zram->wb_limit_lock);
+	up_read(&zram->init_lock);
+	ret = len;
+
+	return ret;
+}
+
+static ssize_t writeback_limit_enable_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	bool val;
+	struct zram *zram = dev_to_zram(dev);
+
+	down_read(&zram->init_lock);
+	spin_lock(&zram->wb_limit_lock);
+	val = zram->wb_limit_enable;
+	spin_unlock(&zram->wb_limit_lock);
+	up_read(&zram->init_lock);
+
+	return scnprintf(buf, PAGE_SIZE, "%d\n", val);
+}
+
 static ssize_t writeback_limit_store(struct device *dev,
 		struct device_attribute *attr, const char *buf, size_t len)
 {
 
@@ -341,9 +374,9 @@ static ssize_t writeback_limit_store(struct device *dev,
 		return ret;
 
 	down_read(&zram->init_lock);
-	atomic64_set(&zram->stats.bd_wb_limit, val);
-	if (val == 0)
-		zram->stop_writeback = false;
+	spin_lock(&zram->wb_limit_lock);
+	zram->bd_wb_limit = val;
+	spin_unlock(&zram->wb_limit_lock);
 	up_read(&zram->init_lock);
 	ret = len;
 
@@ -357,7 +390,9 @@ static ssize_t writeback_limit_show(struct device *dev,
 	struct zram *zram = dev_to_zram(dev);
 
 	down_read(&zram->init_lock);
-	val = atomic64_read(&zram->stats.bd_wb_limit);
+	spin_lock(&zram->wb_limit_lock);
+	val = zram->bd_wb_limit;
+	spin_unlock(&zram->wb_limit_lock);
 	up_read(&zram->init_lock);
 
 	return scnprintf(buf, PAGE_SIZE, "%llu\n", val);
 
@@ -588,8 +623,8 @@ static int read_from_bdev_async(struct zram *zram, struct bio_vec *bvec,
 	return 1;
 }
 
-#define HUGE_WRITEBACK 0x1
-#define IDLE_WRITEBACK 0x2
+#define HUGE_WRITEBACK 1
+#define IDLE_WRITEBACK 2
 
 static ssize_t writeback_store(struct device *dev,
 		struct device_attribute *attr, const char *buf, size_t len)
 
@@ -602,7 +637,7 @@ static ssize_t writeback_store(struct device *dev,
 	struct page *page;
 	ssize_t ret, sz;
 	char mode_buf[8];
-	unsigned long mode = -1UL;
+	int mode = -1;
 	unsigned long blk_idx = 0;
 
 	sz = strscpy(mode_buf, buf, sizeof(mode_buf));
 
@@ -618,7 +653,7 @@ static ssize_t writeback_store(struct device *dev,
 	else if (!strcmp(mode_buf, "huge"))
 		mode = HUGE_WRITEBACK;
 
-	if (mode == -1UL)
+	if (mode == -1)
 		return -EINVAL;
 
 	down_read(&zram->init_lock);
 
@@ -645,10 +680,13 @@ static ssize_t writeback_store(struct device *dev,
 		bvec.bv_len = PAGE_SIZE;
 		bvec.bv_offset = 0;
 
-		if (zram->stop_writeback) {
+		spin_lock(&zram->wb_limit_lock);
+		if (zram->wb_limit_enable && !zram->bd_wb_limit) {
+			spin_unlock(&zram->wb_limit_lock);
 			ret = -EIO;
 			break;
 		}
+		spin_unlock(&zram->wb_limit_lock);
 
 		if (!blk_idx) {
 			blk_idx = alloc_block_bdev(zram);
 
@@ -667,10 +705,11 @@ static ssize_t writeback_store(struct device *dev,
 				zram_test_flag(zram, index, ZRAM_UNDER_WB))
 			goto next;
 
-		if ((mode & IDLE_WRITEBACK &&
-			!zram_test_flag(zram, index, ZRAM_IDLE)) &&
-			(mode & HUGE_WRITEBACK &&
-			!zram_test_flag(zram, index, ZRAM_HUGE)))
+		if (mode == IDLE_WRITEBACK &&
+			  !zram_test_flag(zram, index, ZRAM_IDLE))
+			goto next;
+		if (mode == HUGE_WRITEBACK &&
+			  !zram_test_flag(zram, index, ZRAM_HUGE))
 			goto next;
 		/*
 		 * Clearing ZRAM_UNDER_WB is duty of caller.
 
@@ -732,11 +771,10 @@ static ssize_t writeback_store(struct device *dev,
 		zram_set_element(zram, index, blk_idx);
 		blk_idx = 0;
 		atomic64_inc(&zram->stats.pages_stored);
-		if (atomic64_add_unless(&zram->stats.bd_wb_limit,
-					-1 << (PAGE_SHIFT - 12), 0)) {
-			if (atomic64_read(&zram->stats.bd_wb_limit) == 0)
-				zram->stop_writeback = true;
-		}
+		spin_lock(&zram->wb_limit_lock);
+		if (zram->wb_limit_enable && zram->bd_wb_limit > 0)
+			zram->bd_wb_limit -= 1UL << (PAGE_SHIFT - 12);
+		spin_unlock(&zram->wb_limit_lock);
 next:
 		zram_slot_unlock(zram, index);
 	}
 
@@ -1812,6 +1850,7 @@ static DEVICE_ATTR_RW(comp_algorithm);
 static DEVICE_ATTR_RW(backing_dev);
 static DEVICE_ATTR_WO(writeback);
 static DEVICE_ATTR_RW(writeback_limit);
+static DEVICE_ATTR_RW(writeback_limit_enable);
 #endif
 
 static struct attribute *zram_disk_attrs[] = {
 
@@ -1828,6 +1867,7 @@ static struct attribute *zram_disk_attrs[] = {
 	&dev_attr_backing_dev.attr,
 	&dev_attr_writeback.attr,
 	&dev_attr_writeback_limit.attr,
+	&dev_attr_writeback_limit_enable.attr,
 #endif
 	&dev_attr_io_stat.attr,
 	&dev_attr_mm_stat.attr,
 
@@ -1867,7 +1907,9 @@ static int zram_add(void)
 	device_id = ret;
 
 	init_rwsem(&zram->init_lock);
-
+#ifdef CONFIG_ZRAM_WRITEBACK
+	spin_lock_init(&zram->wb_limit_lock);
+#endif
 	queue = blk_alloc_queue(GFP_KERNEL);
 	if (!queue) {
 		pr_err("Error allocating disk queue for device %d\n",
 
@@ -86,7 +86,6 @@ struct zram_stats {
 	atomic64_t bd_count;	/* no. of pages in backing device */
 	atomic64_t bd_reads;	/* no. of reads from backing device */
 	atomic64_t bd_writes;	/* no. of writes from backing device */
-	atomic64_t bd_wb_limit;	/* writeback limit of backing device */
 #endif
 };
 
@@ -114,8 +113,10 @@ struct zram {
 	 */
 	bool claim; /* Protected by bdev->bd_mutex */
 	struct file *backing_dev;
-	bool stop_writeback;
 #ifdef CONFIG_ZRAM_WRITEBACK
+	spinlock_t wb_limit_lock;
+	bool wb_limit_enable;
+	u64 bd_wb_limit;
 	struct block_device *bdev;
 	unsigned int old_block_size;
 	unsigned long *bitmap;
 
@@ -383,16 +383,17 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end)
  * truncation is indicated by end of range being LLONG_MAX
  *	In this case, we first scan the range and release found pages.
  *	After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
- *	maps and global counts.
+ *	maps and global counts.  Page faults can not race with truncation
+ *	in this routine.  hugetlb_no_page() prevents page faults in the
+ *	truncated range.  It checks i_size before allocation, and again after
+ *	with the page table lock for the page held.  The same lock must be
+ *	acquired to unmap a page.
  * hole punch is indicated if end is not LLONG_MAX
  *	In the hole punch case we scan the range and release found pages.
  *	Only when releasing a page is the associated region/reserv map
  *	deleted.  The region/reserv map for ranges without associated
- *	pages are not modified.
- *
- * Callers of this routine must hold the i_mmap_rwsem in write mode to prevent
- * races with page faults.
- *
+ *	pages are not modified.  Page faults can race with hole punch.
+ *	This is indicated if we find a mapped page.
  * Note: If the passed end of range value is beyond the end of file, but
  * not LLONG_MAX this routine still performs a hole punch operation.
  */
 
@@ -422,14 +423,32 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 
 		for (i = 0; i < pagevec_count(&pvec); ++i) {
 			struct page *page = pvec.pages[i];
+			u32 hash;
 
 			index = page->index;
+			hash = hugetlb_fault_mutex_hash(h, current->mm,
+							&pseudo_vma,
+							mapping, index, 0);
+			mutex_lock(&hugetlb_fault_mutex_table[hash]);
+
 			/*
-			 * A mapped page is impossible as callers should unmap
-			 * all references before calling.  And, i_mmap_rwsem
-			 * prevents the creation of additional mappings.
+			 * If page is mapped, it was faulted in after being
+			 * unmapped in caller.  Unmap (again) now after taking
+			 * the fault mutex.  The mutex will prevent faults
+			 * until we finish removing the page.
+			 *
+			 * This race can only happen in the hole punch case.
+			 * Getting here in a truncate operation is a bug.
 			 */
-			VM_BUG_ON(page_mapped(page));
+			if (unlikely(page_mapped(page))) {
+				BUG_ON(truncate_op);
+
+				i_mmap_lock_write(mapping);
+				hugetlb_vmdelete_list(&mapping->i_mmap,
+					index * pages_per_huge_page(h),
+					(index + 1) * pages_per_huge_page(h));
+				i_mmap_unlock_write(mapping);
+			}
 
 			lock_page(page);
 			/*
 
@@ -451,6 +470,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 			}
 
 			unlock_page(page);
+			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 		}
 		huge_pagevec_release(&pvec);
 		cond_resched();
 
@@ -462,20 +482,9 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 
 static void hugetlbfs_evict_inode(struct inode *inode)
 {
-	struct address_space *mapping = inode->i_mapping;
 	struct resv_map *resv_map;
 
-	/*
-	 * The vfs layer guarantees that there are no other users of this
-	 * inode.  Therefore, it would be safe to call remove_inode_hugepages
-	 * without holding i_mmap_rwsem.  We acquire and hold here to be
-	 * consistent with other callers.  Since there will be no contention
-	 * on the semaphore, overhead is negligible.
-	 */
-	i_mmap_lock_write(mapping);
 	remove_inode_hugepages(inode, 0, LLONG_MAX);
-	i_mmap_unlock_write(mapping);
-
 	resv_map = (struct resv_map *)inode->i_mapping->private_data;
 	/* root inode doesn't have the resv_map, so we should check it */
 	if (resv_map)
 
@@ -496,8 +505,8 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
 	i_mmap_lock_write(mapping);
 	if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
 		hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
-	remove_inode_hugepages(inode, offset, LLONG_MAX);
 	i_mmap_unlock_write(mapping);
+	remove_inode_hugepages(inode, offset, LLONG_MAX);
 	return 0;
 }
 
@@ -531,8 +540,8 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 			hugetlb_vmdelete_list(&mapping->i_mmap,
 						hole_start >> PAGE_SHIFT,
 						hole_end  >> PAGE_SHIFT);
-		remove_inode_hugepages(inode, hole_start, hole_end);
 		i_mmap_unlock_write(mapping);
+		remove_inode_hugepages(inode, hole_start, hole_end);
 		inode_unlock(inode);
 	}
 
@@ -615,11 +624,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
 		/* addr is the offset within the file (zero based) */
 		addr = index * hpage_size;
 
-		/*
-		 * fault mutex taken here, protects against fault path
-		 * and hole punch.  inode_lock previously taken protects
-		 * against truncation.
-		 */
+		/* mutex taken here, fault path and hole punch */
 		hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping,
 						index, addr);
 		mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
@@ -520,6 +520,12 @@
 	PGDAT_RECLAIM_LOCKED,		/* prevents concurrent reclaim */
 };
 
+enum zone_flags {
+	ZONE_BOOSTED_WATERMARK,		/* zone recently boosted watermarks.
+					 * Cleared when kswapd is woken.
+					 */
+};
+
 static inline unsigned long zone_managed_pages(struct zone *zone)
 {
 	return (unsigned long)atomic_long_read(&zone->managed_pages);
 
@@ -217,6 +217,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
 		memset(s->addr, 0, THREAD_SIZE);
 
 		tsk->stack_vm_area = s;
+		tsk->stack = s->addr;
 		return s->addr;
 	}
 
mm/hugetlb.c (79 changed lines)

@@ -3238,7 +3238,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 	struct page *ptepage;
 	unsigned long addr;
 	int cow;
-	struct address_space *mapping = vma->vm_file->f_mapping;
 	struct hstate *h = hstate_vma(vma);
 	unsigned long sz = huge_page_size(h);
 	struct mmu_notifier_range range;
 
@@ -3250,23 +3249,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 		mmu_notifier_range_init(&range, src, vma->vm_start,
 					vma->vm_end);
 		mmu_notifier_invalidate_range_start(&range);
-	} else {
-		/*
-		 * For shared mappings i_mmap_rwsem must be held to call
-		 * huge_pte_alloc, otherwise the returned ptep could go
-		 * away if part of a shared pmd and another thread calls
-		 * huge_pmd_unshare.
-		 */
-		i_mmap_lock_read(mapping);
 	}
 
 	for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
 		spinlock_t *src_ptl, *dst_ptl;
 
 		src_pte = huge_pte_offset(src, addr, sz);
 		if (!src_pte)
 			continue;
 
 		dst_pte = huge_pte_alloc(dst, addr, sz);
 		if (!dst_pte) {
 			ret = -ENOMEM;
 
@@ -3337,8 +3326,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 
 	if (cow)
 		mmu_notifier_invalidate_range_end(&range);
-	else
-		i_mmap_unlock_read(mapping);
 
 	return ret;
 }
 
@@ -3755,16 +3742,16 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 	}
 
 	/*
-	 * We can not race with truncation due to holding i_mmap_rwsem.
-	 * Check once here for faults beyond end of file.
+	 * Use page lock to guard against racing truncation
+	 * before we get page_table_lock.
 	 */
+retry:
+	page = find_lock_page(mapping, idx);
+	if (!page) {
 		size = i_size_read(mapping->host) >> huge_page_shift(h);
 		if (idx >= size)
 			goto out;
 
-retry:
-	page = find_lock_page(mapping, idx);
-	if (!page) {
 		/*
 		 * Check for page in userfault range
 		 */
 
@@ -3784,18 +3771,14 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 			};
 
 			/*
-			 * hugetlb_fault_mutex and i_mmap_rwsem must be
-			 * dropped before handling userfault.  Reacquire
-			 * after handling fault to make calling code simpler.
+			 * hugetlb_fault_mutex must be dropped before
+			 * handling userfault.  Reacquire after handling
+			 * fault to make calling code simpler.
 			 */
 			hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping,
 							idx, haddr);
 			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-			i_mmap_unlock_read(mapping);
-
 			ret = handle_userfault(&vmf, VM_UFFD_MISSING);
-
-			i_mmap_lock_read(mapping);
 			mutex_lock(&hugetlb_fault_mutex_table[hash]);
 			goto out;
 		}
 
@@ -3854,6 +3837,9 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 	}
 
 	ptl = huge_pte_lock(h, mm, ptep);
+	size = i_size_read(mapping->host) >> huge_page_shift(h);
+	if (idx >= size)
+		goto backout;
 
 	ret = 0;
 	if (!huge_pte_none(huge_ptep_get(ptep)))
 
@@ -3940,11 +3926,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
 	if (ptep) {
-		/*
-		 * Since we hold no locks, ptep could be stale.  That is
-		 * OK as we are only making decisions based on content and
-		 * not actually modifying content here.
-		 */
 		entry = huge_ptep_get(ptep);
 		if (unlikely(is_hugetlb_entry_migration(entry))) {
 			migration_entry_wait_huge(vma, mm, ptep);
 
@@ -3952,33 +3933,20 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
 		return VM_FAULT_HWPOISON_LARGE |
 			VM_FAULT_SET_HINDEX(hstate_index(h));
-	}
-
-	/*
-	 * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
-	 * until finished with ptep.  This serves two purposes:
-	 * 1) It prevents huge_pmd_unshare from being called elsewhere
-	 *    and making the ptep no longer valid.
-	 * 2) It synchronizes us with file truncation.
-	 *
-	 * ptep could have already be assigned via huge_pte_offset.  That
-	 * is OK, as huge_pte_alloc will return the same value unless
-	 * something changed.
-	 */
-	mapping = vma->vm_file->f_mapping;
-	i_mmap_lock_read(mapping);
-	ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
-	if (!ptep) {
-		i_mmap_unlock_read(mapping);
-		return VM_FAULT_OOM;
+	} else {
+		ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
+		if (!ptep)
+			return VM_FAULT_OOM;
 	}
 
+	mapping = vma->vm_file->f_mapping;
+	idx = vma_hugecache_offset(h, vma, haddr);
+
 	/*
 	 * Serialize hugepage allocation and instantiation, so that we don't
 	 * get spurious allocation failures if two CPUs race to instantiate
 	 * the same page in the page cache.
 	 */
-	idx = vma_hugecache_offset(h, vma, haddr);
 	hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr);
 	mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
@@ -4066,7 +4034,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 out_mutex:
 	mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-	i_mmap_unlock_read(mapping);
 	/*
 	 * Generally it's safe to hold refcount during waiting page lock. But
 	 * here we just wait to defer the next page fault to avoid busy loop and
 
@@ -4671,12 +4638,10 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
 * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
 * and returns the corresponding pte. While this is not necessary for the
 * !shared pmd case because we can allocate the pmd later as well, it makes the
-* code much cleaner.
-*
-* This routine must be called with i_mmap_rwsem held in at least read mode.
-* For hugetlbfs, this prevents removal of any page table entries associated
-* with the address space.  This is important as we are setting up sharing
-* based on existing page table entries (mappings).
+* code much cleaner. pmd allocation is essential for the shared case because
+* pud has to be populated inside the same i_mmap_rwsem section - otherwise
+* racing tasks could either miss the sharing (see huge_pte_offset) or select a
+* bad pmd for sharing.
 */
 pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 {
 
@@ -4693,6 +4658,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 	if (!vma_shareable(vma, addr))
 		return (pte_t *)pmd_alloc(mm, pud, addr);
 
+	i_mmap_lock_write(mapping);
 	vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
 		if (svma == vma)
 			continue;
 
@@ -4722,6 +4688,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 	spin_unlock(ptl);
 out:
 	pte = (pte_t *)pmd_alloc(mm, pud, addr);
+	i_mmap_unlock_write(mapping);
 	return pte;
 }
 
@@ -4732,7 +4699,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 * indicated by page_count > 1, unmap is achieved by clearing pud and
 * decrementing the ref count. If count == 1, the pte page is not shared.
 *
-* Called with page table lock held and i_mmap_rwsem held in write mode.
+* called with page table lock held.
 *
 * returns: 1 successfully unmapped a shared pte page
 *	    0 the underlying pte page is not shared, or it is the last user
 
@@ -298,8 +298,6 @@ void kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
 		return;
 	}
 
-	cache->align = round_up(cache->align, KASAN_SHADOW_SCALE_SIZE);
-
 	*flags |= SLAB_KASAN;
 }
 
@@ -349,28 +347,43 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object)
 }
 
 /*
- * Since it's desirable to only call object contructors once during slab
- * allocation, we preassign tags to all such objects. Also preassign tags for
- * SLAB_TYPESAFE_BY_RCU slabs to avoid use-after-free reports.
- * For SLAB allocator we can't preassign tags randomly since the freelist is
- * stored as an array of indexes instead of a linked list. Assign tags based
- * on objects indexes, so that objects that are next to each other get
- * different tags.
- * After a tag is assigned, the object always gets allocated with the same tag.
- * The reason is that we can't change tags for objects with constructors on
- * reallocation (even for non-SLAB_TYPESAFE_BY_RCU), because the constructor
- * code can save the pointer to the object somewhere (e.g. in the object
- * itself). Then if we retag it, the old saved pointer will become invalid.
+ * This function assigns a tag to an object considering the following:
+ * 1. A cache might have a constructor, which might save a pointer to a slab
+ *    object somewhere (e.g. in the object itself). We preassign a tag for
+ *    each object in caches with constructors during slab creation and reuse
+ *    the same tag each time a particular object is allocated.
+ * 2. A cache might be SLAB_TYPESAFE_BY_RCU, which means objects can be
+ *    accessed after being freed. We preassign tags for objects in these
+ *    caches as well.
+ * 3. For SLAB allocator we can't preassign tags randomly since the freelist
+ *    is stored as an array of indexes instead of a linked list. Assign tags
+ *    based on objects indexes, so that objects that are next to each other
+ *    get different tags.
 */
-static u8 assign_tag(struct kmem_cache *cache, const void *object, bool new)
+static u8 assign_tag(struct kmem_cache *cache, const void *object,
+			bool init, bool krealloc)
 {
-	if (!cache->ctor && !(cache->flags & SLAB_TYPESAFE_BY_RCU))
-		return new ? KASAN_TAG_KERNEL : random_tag();
+	/* Reuse the same tag for krealloc'ed objects. */
+	if (krealloc)
+		return get_tag(object);
 
+	/*
+	 * If the cache neither has a constructor nor has SLAB_TYPESAFE_BY_RCU
+	 * set, assign a tag when the object is being allocated (init == false).
+	 */
+	if (!cache->ctor && !(cache->flags & SLAB_TYPESAFE_BY_RCU))
+		return init ? KASAN_TAG_KERNEL : random_tag();
+
+	/* For caches that either have a constructor or SLAB_TYPESAFE_BY_RCU: */
 #ifdef CONFIG_SLAB
+	/* For SLAB assign tags based on the object index in the freelist. */
 	return (u8)obj_to_index(cache, virt_to_page(object), (void *)object);
 #else
-	return new ? random_tag() : get_tag(object);
+	/*
+	 * For SLUB assign a random tag during slab creation, otherwise reuse
+	 * the already assigned tag.
+	 */
+	return init ? random_tag() : get_tag(object);
 #endif
 }
 
@@ -386,7 +399,8 @@ void * __must_check kasan_init_slab_obj(struct kmem_cache *cache,
 	__memset(alloc_info, 0, sizeof(*alloc_info));
 
 	if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
-		object = set_tag(object, assign_tag(cache, object, true));
+		object = set_tag(object,
+				assign_tag(cache, object, true, false));
 
 	return (void *)object;
 }
 
@@ -452,8 +466,8 @@ bool kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip)
 	return __kasan_slab_free(cache, object, ip, true);
 }
 
-void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object,
-				size_t size, gfp_t flags)
+static void *__kasan_kmalloc(struct kmem_cache *cache, const void *object,
+				size_t size, gfp_t flags, bool krealloc)
 {
 	unsigned long redzone_start;
 	unsigned long redzone_end;
 
@@ -471,7 +485,7 @@ void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object,
 				KASAN_SHADOW_SCALE_SIZE);
 
 	if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
-		tag = assign_tag(cache, object, false);
+		tag = assign_tag(cache, object, false, krealloc);
 
 	/* Tag is ignored in set_tag without CONFIG_KASAN_SW_TAGS */
 	kasan_unpoison_shadow(set_tag(object, tag), size);
 
@@ -483,6 +497,12 @@ void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object,
 
 	return set_tag(object, tag);
 }
+
+void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object,
+				size_t size, gfp_t flags)
+{
+	return __kasan_kmalloc(cache, object, size, flags, false);
+}
 EXPORT_SYMBOL(kasan_kmalloc);
 
 void * __must_check kasan_kmalloc_large(const void *ptr, size_t size,
 
|
||||||
if (unlikely(!PageSlab(page)))
|
if (unlikely(!PageSlab(page)))
|
||||||
return kasan_kmalloc_large(object, size, flags);
|
return kasan_kmalloc_large(object, size, flags);
|
||||||
else
|
else
|
||||||
return kasan_kmalloc(page->slab_cache, object, size, flags);
|
return __kasan_kmalloc(page->slab_cache, object, size,
|
||||||
|
flags, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
void kasan_poison_kfree(void *ptr, unsigned long ip)
|
void kasan_poison_kfree(void *ptr, unsigned long ip)
|
||||||
|
|
|
@ -966,7 +966,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
|
||||||
enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
|
enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
|
||||||
struct address_space *mapping;
|
struct address_space *mapping;
|
||||||
LIST_HEAD(tokill);
|
LIST_HEAD(tokill);
|
||||||
bool unmap_success = true;
|
bool unmap_success;
|
||||||
int kill = 1, forcekill;
|
int kill = 1, forcekill;
|
||||||
struct page *hpage = *hpagep;
|
struct page *hpage = *hpagep;
|
||||||
bool mlocked = PageMlocked(hpage);
|
bool mlocked = PageMlocked(hpage);
|
||||||
|
@@ -1028,19 +1028,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	if (kill)
 		collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
 
-	if (!PageHuge(hpage)) {
 	unmap_success = try_to_unmap(hpage, ttu);
-	} else if (mapping) {
-		/*
-		 * For hugetlb pages, try_to_unmap could potentially call
-		 * huge_pmd_unshare.  Because of this, take semaphore in
-		 * write mode here and set TTU_RMAP_LOCKED to indicate we
-		 * have taken the lock at this higer level.
-		 */
-		i_mmap_lock_write(mapping);
-		unmap_success = try_to_unmap(hpage, ttu|TTU_RMAP_LOCKED);
-		i_mmap_unlock_write(mapping);
-	}
 	if (!unmap_success)
 		pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
 		       pfn, page_mapcount(hpage));
 
mm/memory.c (26 changed lines)

@@ -2994,6 +2994,28 @@ static vm_fault_t __do_fault(struct vm_fault *vmf)
 	struct vm_area_struct *vma = vmf->vma;
 	vm_fault_t ret;
 
+	/*
+	 * Preallocate pte before we take page_lock because this might lead to
+	 * deadlocks for memcg reclaim which waits for pages under writeback:
+	 *				lock_page(A)
+	 *				SetPageWriteback(A)
+	 *				unlock_page(A)
+	 * lock_page(B)
+	 *				lock_page(B)
+	 * pte_alloc_pne
+	 *   shrink_page_list
+	 *     wait_on_page_writeback(A)
+	 *				SetPageWriteback(B)
+	 *				unlock_page(B)
+	 *				# flush A, B to clear the writeback
+	 */
+	if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
+		vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
+		if (!vmf->prealloc_pte)
+			return VM_FAULT_OOM;
+		smp_wmb(); /* See comment in __pte_alloc() */
+	}
+
 	ret = vma->vm_ops->fault(vmf);
 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
 			    VM_FAULT_DONE_COW)))
 
@@ -4077,8 +4099,8 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
 		goto out;
 
 	if (range) {
-		range->start = address & PAGE_MASK;
-		range->end = range->start + PAGE_SIZE;
+		mmu_notifier_range_init(range, mm, address & PAGE_MASK,
+					(address & PAGE_MASK) + PAGE_SIZE);
 		mmu_notifier_invalidate_range_start(range);
 	}
 	ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
 
mm/migrate.c (13 changed lines)

@@ -1324,19 +1324,8 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
 		goto put_anon;
 
 	if (page_mapped(hpage)) {
-		struct address_space *mapping = page_mapping(hpage);
-
-		/*
-		 * try_to_unmap could potentially call huge_pmd_unshare.
-		 * Because of this, take semaphore in write mode here and
-		 * set TTU_RMAP_LOCKED to let lower levels know we have
-		 * taken the lock.
-		 */
-		i_mmap_lock_write(mapping);
 		try_to_unmap(hpage,
-			TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS|
-			TTU_RMAP_LOCKED);
-		i_mmap_unlock_write(mapping);
+			TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
 		page_was_mapped = 1;
 	}
 
@@ -2214,7 +2214,7 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page,
 	 */
 	boost_watermark(zone);
 	if (alloc_flags & ALLOC_KSWAPD)
-		wakeup_kswapd(zone, 0, 0, zone_idx(zone));
+		set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
 
 	/* We are not allowed to try stealing from the whole block */
 	if (!whole_block)
 
@@ -3102,6 +3102,12 @@ struct page *rmqueue(struct zone *preferred_zone,
 	local_irq_restore(flags);
 
 out:
+	/* Separate test+clear to avoid unnecessary atomics */
+	if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) {
+		clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
+		wakeup_kswapd(zone, 0, 0, zone_idx(zone));
+	}
+
 	VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
 	return page;
 
@@ -25,7 +25,6 @@
 *         page->flags PG_locked (lock_page)
 *           hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
 *             mapping->i_mmap_rwsem
-*               hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
 *               anon_vma->rwsem
 *                 mm->page_table_lock or pte_lock
 *                   zone_lru_lock (in mark_page_accessed, isolate_lru_page)
 
@@ -1379,9 +1378,6 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 		/*
 		 * If sharing is possible, start and end will be adjusted
 		 * accordingly.
-		 *
-		 * If called for a huge page, caller must hold i_mmap_rwsem
-		 * in write mode as it is possible to call huge_pmd_unshare.
 		 */
 		adjust_range_if_pmd_sharing_possible(vma, &range.start,
 						     &range.end);
 
@@ -666,8 +666,10 @@ static struct alien_cache *__alloc_alien_cache(int node, int entries,
 	struct alien_cache *alc = NULL;
 
 	alc = kmalloc_node(memsize, gfp, node);
+	if (alc) {
 		init_arraycache(&alc->ac, entries, batch);
 		spin_lock_init(&alc->lock);
+	}
 	return alc;
 }
 
@@ -3846,6 +3846,8 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
 	unsigned int offset;
 	size_t object_size;
 
+	ptr = kasan_reset_tag(ptr);
+
 	/* Find object and usable object size. */
 	s = page->slab_cache;
 
@@ -247,7 +247,8 @@ static DEFINE_STATIC_KEY_FALSE_RO(bypass_usercopy_checks);
 /*
 * Validates that the given object is:
 * - not bogus address
-* - known-safe heap or stack object
+* - fully contained by stack (or stack frame, when available)
+* - fully within SLAB object (or object whitelist area, when available)
 * - not in kernel text
 */
 void __check_object_size(const void *ptr, unsigned long n, bool to_user)
 
|
||||||
/* Check for invalid addresses. */
|
/* Check for invalid addresses. */
|
||||||
check_bogus_address((const unsigned long)ptr, n, to_user);
|
check_bogus_address((const unsigned long)ptr, n, to_user);
|
||||||
|
|
||||||
/* Check for bad heap object. */
|
|
||||||
check_heap_object(ptr, n, to_user);
|
|
||||||
|
|
||||||
/* Check for bad stack object. */
|
/* Check for bad stack object. */
|
||||||
switch (check_stack_object(ptr, n)) {
|
switch (check_stack_object(ptr, n)) {
|
||||||
case NOT_STACK:
|
case NOT_STACK:
|
||||||
|
@@ -282,6 +280,9 @@ void __check_object_size(const void *ptr, unsigned long n, bool to_user)
 		usercopy_abort("process stack", NULL, to_user, 0, n);
 	}
 
+	/* Check for bad heap object. */
+	check_heap_object(ptr, n, to_user);
+
 	/* Check for object in kernel to avoid text exposure. */
 	check_kernel_text_object((const unsigned long)ptr, n, to_user);
 }
 
@@ -267,14 +267,10 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
 		VM_BUG_ON(dst_addr & ~huge_page_mask(h));
 
 		/*
-		 * Serialize via i_mmap_rwsem and hugetlb_fault_mutex.
-		 * i_mmap_rwsem ensures the dst_pte remains valid even
-		 * in the case of shared pmds.  fault mutex prevents
-		 * races with other faulting threads.
+		 * Serialize via hugetlb_fault_mutex
 		 */
-		mapping = dst_vma->vm_file->f_mapping;
-		i_mmap_lock_read(mapping);
 		idx = linear_page_index(dst_vma, dst_addr);
+		mapping = dst_vma->vm_file->f_mapping;
 		hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping,
 						idx, dst_addr);
 		mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
@@ -283,7 +279,6 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
 		dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h));
 		if (!dst_pte) {
 			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-			i_mmap_unlock_read(mapping);
 			goto out_unlock;
 		}
 
@@ -291,7 +286,6 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
 		dst_pteval = huge_ptep_get(dst_pte);
 		if (!huge_pte_none(dst_pteval)) {
 			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-			i_mmap_unlock_read(mapping);
 			goto out_unlock;
 		}
 
@@ -299,7 +293,6 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
 						dst_addr, src_addr, &page);
 
 		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-		i_mmap_unlock_read(mapping);
 		vm_alloc_shared = vm_shared;
 
 		cond_resched();
 
@@ -478,7 +478,7 @@ bool page_mapped(struct page *page)
 		return true;
 	if (PageHuge(page))
 		return false;
-	for (i = 0; i < hpage_nr_pages(page); i++) {
+	for (i = 0; i < (1 << compound_order(page)); i++) {
 		if (atomic_read(&page[i]._mapcount) >= 0)
 			return true;
 	}
 
@@ -5,7 +5,9 @@
 * Example use:
 * cat /sys/kernel/debug/page_owner > page_owner_full.txt
 * grep -v ^PFN page_owner_full.txt > page_owner.txt
-* ./sort page_owner.txt sorted_page_owner.txt
+* ./page_owner_sort page_owner.txt sorted_page_owner.txt
+*
+* See Documentation/vm/page_owner.rst
 */
 
 #include <stdio.h>