hugetlbfs: revert "Use i_mmap_rwsem to fix page fault/truncate race"
This reverts c86aa7bbfd
The reverted commit caused ABBA deadlocks when file migration raced with
file eviction for specific hugetlbfs files. This was discovered with a
modified version of the LTP move_pages12 test.
The purpose of the reverted patch was to close a long existing race
between hugetlbfs file truncation and page faults. After more analysis
of the patch and impacted code, it was determined that i_mmap_rwsem
cannot be used for all required synchronization. Therefore, revert this
patch while working on another approach to the underlying issue.
Link: http://lkml.kernel.org/r/20190103235452.29335-1-mike.kravetz@oracle.com
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reported-by: Jan Stancek <jstancek@redhat.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: "Aneesh Kumar K . V" <aneesh.kumar@linux.vnet.ibm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Prakash Sangappa <prakash.sangappa@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
parent
8ab88c7169
commit
e7c5809779
|
@ -383,16 +383,17 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end)
|
||||||
* truncation is indicated by end of range being LLONG_MAX
|
* truncation is indicated by end of range being LLONG_MAX
|
||||||
* In this case, we first scan the range and release found pages.
|
* In this case, we first scan the range and release found pages.
|
||||||
* After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
|
* After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
|
||||||
* maps and global counts.
|
* maps and global counts. Page faults can not race with truncation
|
||||||
|
* in this routine. hugetlb_no_page() prevents page faults in the
|
||||||
|
* truncated range. It checks i_size before allocation, and again after
|
||||||
|
* with the page table lock for the page held. The same lock must be
|
||||||
|
* acquired to unmap a page.
|
||||||
* hole punch is indicated if end is not LLONG_MAX
|
* hole punch is indicated if end is not LLONG_MAX
|
||||||
* In the hole punch case we scan the range and release found pages.
|
* In the hole punch case we scan the range and release found pages.
|
||||||
* Only when releasing a page is the associated region/reserv map
|
* Only when releasing a page is the associated region/reserv map
|
||||||
* deleted. The region/reserv map for ranges without associated
|
* deleted. The region/reserv map for ranges without associated
|
||||||
* pages are not modified.
|
* pages are not modified. Page faults can race with hole punch.
|
||||||
*
|
* This is indicated if we find a mapped page.
|
||||||
* Callers of this routine must hold the i_mmap_rwsem in write mode to prevent
|
|
||||||
* races with page faults.
|
|
||||||
*
|
|
||||||
* Note: If the passed end of range value is beyond the end of file, but
|
* Note: If the passed end of range value is beyond the end of file, but
|
||||||
* not LLONG_MAX this routine still performs a hole punch operation.
|
* not LLONG_MAX this routine still performs a hole punch operation.
|
||||||
*/
|
*/
|
||||||
|
@ -422,14 +423,32 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
|
||||||
|
|
||||||
for (i = 0; i < pagevec_count(&pvec); ++i) {
|
for (i = 0; i < pagevec_count(&pvec); ++i) {
|
||||||
struct page *page = pvec.pages[i];
|
struct page *page = pvec.pages[i];
|
||||||
|
u32 hash;
|
||||||
|
|
||||||
index = page->index;
|
index = page->index;
|
||||||
|
hash = hugetlb_fault_mutex_hash(h, current->mm,
|
||||||
|
&pseudo_vma,
|
||||||
|
mapping, index, 0);
|
||||||
|
mutex_lock(&hugetlb_fault_mutex_table[hash]);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* A mapped page is impossible as callers should unmap
|
* If page is mapped, it was faulted in after being
|
||||||
* all references before calling. And, i_mmap_rwsem
|
* unmapped in caller. Unmap (again) now after taking
|
||||||
* prevents the creation of additional mappings.
|
* the fault mutex. The mutex will prevent faults
|
||||||
|
* until we finish removing the page.
|
||||||
|
*
|
||||||
|
* This race can only happen in the hole punch case.
|
||||||
|
* Getting here in a truncate operation is a bug.
|
||||||
*/
|
*/
|
||||||
VM_BUG_ON(page_mapped(page));
|
if (unlikely(page_mapped(page))) {
|
||||||
|
BUG_ON(truncate_op);
|
||||||
|
|
||||||
|
i_mmap_lock_write(mapping);
|
||||||
|
hugetlb_vmdelete_list(&mapping->i_mmap,
|
||||||
|
index * pages_per_huge_page(h),
|
||||||
|
(index + 1) * pages_per_huge_page(h));
|
||||||
|
i_mmap_unlock_write(mapping);
|
||||||
|
}
|
||||||
|
|
||||||
lock_page(page);
|
lock_page(page);
|
||||||
/*
|
/*
|
||||||
|
@ -451,6 +470,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
|
||||||
}
|
}
|
||||||
|
|
||||||
unlock_page(page);
|
unlock_page(page);
|
||||||
|
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
|
||||||
}
|
}
|
||||||
huge_pagevec_release(&pvec);
|
huge_pagevec_release(&pvec);
|
||||||
cond_resched();
|
cond_resched();
|
||||||
|
@ -462,20 +482,9 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
|
||||||
|
|
||||||
static void hugetlbfs_evict_inode(struct inode *inode)
|
static void hugetlbfs_evict_inode(struct inode *inode)
|
||||||
{
|
{
|
||||||
struct address_space *mapping = inode->i_mapping;
|
|
||||||
struct resv_map *resv_map;
|
struct resv_map *resv_map;
|
||||||
|
|
||||||
/*
|
|
||||||
* The vfs layer guarantees that there are no other users of this
|
|
||||||
* inode. Therefore, it would be safe to call remove_inode_hugepages
|
|
||||||
* without holding i_mmap_rwsem. We acquire and hold here to be
|
|
||||||
* consistent with other callers. Since there will be no contention
|
|
||||||
* on the semaphore, overhead is negligible.
|
|
||||||
*/
|
|
||||||
i_mmap_lock_write(mapping);
|
|
||||||
remove_inode_hugepages(inode, 0, LLONG_MAX);
|
remove_inode_hugepages(inode, 0, LLONG_MAX);
|
||||||
i_mmap_unlock_write(mapping);
|
|
||||||
|
|
||||||
resv_map = (struct resv_map *)inode->i_mapping->private_data;
|
resv_map = (struct resv_map *)inode->i_mapping->private_data;
|
||||||
/* root inode doesn't have the resv_map, so we should check it */
|
/* root inode doesn't have the resv_map, so we should check it */
|
||||||
if (resv_map)
|
if (resv_map)
|
||||||
|
@ -496,8 +505,8 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
|
||||||
i_mmap_lock_write(mapping);
|
i_mmap_lock_write(mapping);
|
||||||
if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
|
if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
|
||||||
hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
|
hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
|
||||||
remove_inode_hugepages(inode, offset, LLONG_MAX);
|
|
||||||
i_mmap_unlock_write(mapping);
|
i_mmap_unlock_write(mapping);
|
||||||
|
remove_inode_hugepages(inode, offset, LLONG_MAX);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -531,8 +540,8 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
|
||||||
hugetlb_vmdelete_list(&mapping->i_mmap,
|
hugetlb_vmdelete_list(&mapping->i_mmap,
|
||||||
hole_start >> PAGE_SHIFT,
|
hole_start >> PAGE_SHIFT,
|
||||||
hole_end >> PAGE_SHIFT);
|
hole_end >> PAGE_SHIFT);
|
||||||
remove_inode_hugepages(inode, hole_start, hole_end);
|
|
||||||
i_mmap_unlock_write(mapping);
|
i_mmap_unlock_write(mapping);
|
||||||
|
remove_inode_hugepages(inode, hole_start, hole_end);
|
||||||
inode_unlock(inode);
|
inode_unlock(inode);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -615,11 +624,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
|
||||||
/* addr is the offset within the file (zero based) */
|
/* addr is the offset within the file (zero based) */
|
||||||
addr = index * hpage_size;
|
addr = index * hpage_size;
|
||||||
|
|
||||||
/*
|
/* mutex taken here, fault path and hole punch */
|
||||||
* fault mutex taken here, protects against fault path
|
|
||||||
* and hole punch. inode_lock previously taken protects
|
|
||||||
* against truncation.
|
|
||||||
*/
|
|
||||||
hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping,
|
hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping,
|
||||||
index, addr);
|
index, addr);
|
||||||
mutex_lock(&hugetlb_fault_mutex_table[hash]);
|
mutex_lock(&hugetlb_fault_mutex_table[hash]);
|
||||||
|
|
21
mm/hugetlb.c
21
mm/hugetlb.c
|
@ -3755,16 +3755,16 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* We can not race with truncation due to holding i_mmap_rwsem.
|
* Use page lock to guard against racing truncation
|
||||||
* Check once here for faults beyond end of file.
|
* before we get page_table_lock.
|
||||||
*/
|
*/
|
||||||
size = i_size_read(mapping->host) >> huge_page_shift(h);
|
|
||||||
if (idx >= size)
|
|
||||||
goto out;
|
|
||||||
|
|
||||||
retry:
|
retry:
|
||||||
page = find_lock_page(mapping, idx);
|
page = find_lock_page(mapping, idx);
|
||||||
if (!page) {
|
if (!page) {
|
||||||
|
size = i_size_read(mapping->host) >> huge_page_shift(h);
|
||||||
|
if (idx >= size)
|
||||||
|
goto out;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Check for page in userfault range
|
* Check for page in userfault range
|
||||||
*/
|
*/
|
||||||
|
@ -3854,6 +3854,9 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
|
||||||
}
|
}
|
||||||
|
|
||||||
ptl = huge_pte_lock(h, mm, ptep);
|
ptl = huge_pte_lock(h, mm, ptep);
|
||||||
|
size = i_size_read(mapping->host) >> huge_page_shift(h);
|
||||||
|
if (idx >= size)
|
||||||
|
goto backout;
|
||||||
|
|
||||||
ret = 0;
|
ret = 0;
|
||||||
if (!huge_pte_none(huge_ptep_get(ptep)))
|
if (!huge_pte_none(huge_ptep_get(ptep)))
|
||||||
|
@ -3956,10 +3959,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
|
* Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
|
||||||
* until finished with ptep. This serves two purposes:
|
* until finished with ptep. This prevents huge_pmd_unshare from
|
||||||
* 1) It prevents huge_pmd_unshare from being called elsewhere
|
* being called elsewhere and making the ptep no longer valid.
|
||||||
* and making the ptep no longer valid.
|
|
||||||
* 2) It synchronizes us with file truncation.
|
|
||||||
*
|
*
|
||||||
* ptep could have already be assigned via huge_pte_offset. That
|
* ptep could have already be assigned via huge_pte_offset. That
|
||||||
* is OK, as huge_pte_alloc will return the same value unless
|
* is OK, as huge_pte_alloc will return the same value unless
|
||||||
|
|
Loading…
Reference in New Issue