hmm related patches for 5.5
Merge tag 'for-linus-hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma

Pull hmm updates from Jason Gunthorpe:
 "This is another round of bug fixing and cleanup. This time the focus
  is on the driver pattern to use mmu notifiers to monitor a VA range.
  This code is lifted out of many drivers and hmm_mirror directly into
  the mmu_notifier core and written using the best ideas from all the
  driver implementations. This removes many bugs from the drivers and
  has a very pleasing diffstat. More drivers can still be converted,
  but that is for another cycle.

   - A shared branch with RDMA reworking the RDMA ODP implementation

   - New mmu_interval_notifier API. This is focused on the use case of
     monitoring a VA and simplifies the process for drivers

   - A common seq-count locking scheme built into the
     mmu_interval_notifier API usable by drivers that call
     get_user_pages() or hmm_range_fault() with the VA range

   - Conversion of mlx5 ODP, hfi1, radeon, nouveau, AMD GPU, and Xen
     GntDev drivers to the new API. This deletes a lot of wonky driver
     code.

   - Two improvements for hmm_range_fault(), from testing done by Ralph"

* tag 'for-linus-hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma:
  mm/hmm: remove hmm_range_dma_map and hmm_range_dma_unmap
  mm/hmm: make full use of walk_page_range()
  xen/gntdev: use mmu_interval_notifier_insert
  mm/hmm: remove hmm_mirror and related
  drm/amdgpu: Use mmu_interval_notifier instead of hmm_mirror
  drm/amdgpu: Use mmu_interval_insert instead of hmm_mirror
  drm/amdgpu: Call find_vma under mmap_sem
  nouveau: use mmu_interval_notifier instead of hmm_mirror
  nouveau: use mmu_notifier directly for invalidate_range_start
  drm/radeon: use mmu_interval_notifier_insert
  RDMA/hfi1: Use mmu_interval_notifier_insert for user_exp_rcv
  RDMA/odp: Use mmu_interval_notifier_insert()
  mm/hmm: define the pre-processor related parts of hmm.h even if disabled
  mm/hmm: allow hmm_range to be used with a mmu_interval_notifier or hmm_mirror
  mm/mmu_notifier: add an interval tree notifier
  mm/mmu_notifier: define the header pre-processor parts even if disabled
  mm/hmm: allow snapshot of the special zero page
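As background for the diff below, here is a condensed sketch of the seq-count locking scheme the pull message describes. It is an illustration only, not code from this merge: driver_populate_range() and driver_lock are hypothetical driver names, and a real fault path would also bound the retry loop with a timeout as the converted drivers do.

#include <linux/hmm.h>
#include <linux/mmu_notifier.h>
#include <linux/mutex.h>
#include <linux/sched/mm.h>

/* Hypothetical driver state: one notifier covering the monitored VA range. */
static struct mmu_interval_notifier mni;
static DEFINE_MUTEX(driver_lock);	/* also taken by the ->invalidate() callback */

/* Caller has filled range->start/end/pfns/flags/values/pfn_shift. */
static int driver_populate_range(struct hmm_range *range)
{
	struct mm_struct *mm = mni.mm;
	long ret;

	if (!mmget_not_zero(mm))
		return -EFAULT;

	range->notifier = &mni;
again:
	/* Open the seq-count read side; pairs with mmu_interval_read_retry(). */
	range->notifier_seq = mmu_interval_read_begin(&mni);

	down_read(&mm->mmap_sem);
	ret = hmm_range_fault(range, 0);
	up_read(&mm->mmap_sem);
	if (ret < 0) {
		if (ret == -EBUSY)
			goto again;	/* real drivers bound this with a timeout */
		goto out;
	}

	/*
	 * The collected PFNs may only be used if no invalidation ran
	 * concurrently; the same lock must stay held while the device
	 * page table is programmed.
	 */
	mutex_lock(&driver_lock);
	if (mmu_interval_read_retry(&mni, range->notifier_seq)) {
		mutex_unlock(&driver_lock);
		goto again;
	}
	/* ... program the device page table from range->pfns here ... */
	mutex_unlock(&driver_lock);
	ret = 0;
out:
	mmput(mm);
	return ret;
}

The Documentation/vm/hmm.rst hunk at the top of the diff spells out the same pattern in its usage example.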
commit aa32f11691
@@ -147,49 +147,16 @@ Address space mirroring implementation and API
Address space mirroring's main objective is to allow duplication of a range of
CPU page table into a device page table; HMM helps keep both synchronized. A
device driver that wants to mirror a process address space must start with the
registration of an hmm_mirror struct::
registration of a mmu_interval_notifier::

 int hmm_mirror_register(struct hmm_mirror *mirror,
                         struct mm_struct *mm);
 mni->ops = &driver_ops;
 int mmu_interval_notifier_insert(struct mmu_interval_notifier *mni,
                                  unsigned long start, unsigned long length,
                                  struct mm_struct *mm);

The mirror struct has a set of callbacks that are used
to propagate CPU page tables::

 struct hmm_mirror_ops {
        /* release() - release hmm_mirror
         *
         * @mirror: pointer to struct hmm_mirror
         *
         * This is called when the mm_struct is being released. The callback
         * must ensure that all access to any pages obtained from this mirror
         * is halted before the callback returns. All future access should
         * fault.
         */
        void (*release)(struct hmm_mirror *mirror);

        /* sync_cpu_device_pagetables() - synchronize page tables
         *
         * @mirror: pointer to struct hmm_mirror
         * @update: update information (see struct mmu_notifier_range)
         * Return: -EAGAIN if update.blockable false and callback need to
         * block, 0 otherwise.
         *
         * This callback ultimately originates from mmu_notifiers when the CPU
         * page table is updated. The device driver must update its page table
         * in response to this callback. The update argument tells what action
         * to perform.
         *
         * The device driver must not return from this callback until the device
         * page tables are completely updated (TLBs flushed, etc); this is a
         * synchronous call.
         */
        int (*sync_cpu_device_pagetables)(struct hmm_mirror *mirror,
                                          const struct hmm_update *update);
 };

The device driver must perform the update action to the range (mark range
read only, or fully unmap, etc.). The device must complete the update before
the driver callback returns.
During the driver_ops->invalidate() callback the device driver must perform
the update action to the range (mark range read only, or fully unmap,
etc.). The device must complete the update before the driver callback returns.

When the device driver wants to populate a range of virtual addresses, it can
use::

@@ -216,70 +183,46 @@ The usage pattern is::
      struct hmm_range range;
      ...

      range.notifier = &mni;
      range.start = ...;
      range.end = ...;
      range.pfns = ...;
      range.flags = ...;
      range.values = ...;
      range.pfn_shift = ...;
      hmm_range_register(&range, mirror);

      /*
       * Just wait for range to be valid, safe to ignore return value as we
       * will use the return value of hmm_range_fault() below under the
       * mmap_sem to ascertain the validity of the range.
       */
      hmm_range_wait_until_valid(&range, TIMEOUT_IN_MSEC);
      if (!mmget_not_zero(mni->notifier.mm))
          return -EFAULT;

 again:
      range.notifier_seq = mmu_interval_read_begin(&mni);
      down_read(&mm->mmap_sem);
      ret = hmm_range_fault(&range, HMM_RANGE_SNAPSHOT);
      if (ret) {
          up_read(&mm->mmap_sem);
          if (ret == -EBUSY) {
            /*
             * No need to check hmm_range_wait_until_valid() return value
             * on retry we will get proper error with hmm_range_fault()
             */
            hmm_range_wait_until_valid(&range, TIMEOUT_IN_MSEC);
            goto again;
          }
          hmm_range_unregister(&range);
          if (ret == -EBUSY)
                 goto again;
          return ret;
      }
      up_read(&mm->mmap_sem);

      take_lock(driver->update);
      if (!hmm_range_valid(&range)) {
      if (mmu_interval_read_retry(&ni, range.notifier_seq) {
          release_lock(driver->update);
          up_read(&mm->mmap_sem);
          goto again;
      }

      // Use pfns array content to update device page table
      /* Use pfns array content to update device page table,
       * under the update lock */

      hmm_range_unregister(&range);
      release_lock(driver->update);
      up_read(&mm->mmap_sem);
      return 0;
 }

The driver->update lock is the same lock that the driver takes inside its
sync_cpu_device_pagetables() callback. That lock must be held before calling
hmm_range_valid() to avoid any race with a concurrent CPU page table update.

HMM implements all this on top of the mmu_notifier API because we wanted a
simpler API and also to be able to perform optimizations latter on like doing
concurrent device updates in multi-devices scenario.

HMM also serves as an impedance mismatch between how CPU page table updates
are done (by CPU write to the page table and TLB flushes) and how devices
update their own page table. Device updates are a multi-step process. First,
appropriate commands are written to a buffer, then this buffer is scheduled for
execution on the device. It is only once the device has executed commands in
the buffer that the update is done. Creating and scheduling the update command
buffer can happen concurrently for multiple devices. Waiting for each device to
report commands as executed is serialized (there is no point in doing this
concurrently).

invalidate() callback. That lock must be held before calling
mmu_interval_read_retry() to avoid any race with a concurrent CPU page table
update.

Leverage default_flags and pfn_flags_mask
=========================================
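The documentation hunk above only shows the fault-side retry loop. The other half of the scheme is the mmu_interval_notifier_ops->invalidate() callback, which the driver conversions later in this diff (amdgpu, nouveau, radeon) all follow. Below is a hypothetical minimal callback and registration helper, for illustration only; my_invalidate, my_register and driver_lock are made-up names, while the mmu_interval_* calls match the API added by this merge.

#include <linux/mmu_notifier.h>
#include <linux/mutex.h>
#include <linux/sched.h>

/* Hypothetical: the same lock the fault path holds while programming PTEs. */
static DEFINE_MUTEX(driver_lock);

static bool my_invalidate(struct mmu_interval_notifier *mni,
			  const struct mmu_notifier_range *range,
			  unsigned long cur_seq)
{
	/* A real driver uses container_of(mni, ...) to find its range state. */

	if (mmu_notifier_range_blockable(range))
		mutex_lock(&driver_lock);
	else if (!mutex_trylock(&driver_lock))
		return false;

	/*
	 * Publish the new sequence first so a concurrent
	 * mmu_interval_read_retry() in the fault path sees the collision,
	 * then tear down the device mappings for this VA range.
	 */
	mmu_interval_set_seq(mni, cur_seq);
	/* ... invalidate device page tables here ... */

	mutex_unlock(&driver_lock);
	return true;
}

static const struct mmu_interval_notifier_ops my_mni_ops = {
	.invalidate = my_invalidate,
};

/* Registration, e.g. from the driver's "start monitoring addr/size" path. */
static int my_register(struct mmu_interval_notifier *mni, unsigned long addr,
		       unsigned long size)
{
	return mmu_interval_notifier_insert(mni, current->mm, addr, size,
					    &my_mni_ops);
}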
@@ -967,6 +967,8 @@ struct amdgpu_device {
	struct mutex  lock_reset;
	struct amdgpu_doorbell_index doorbell_index;

	struct mutex			notifier_lock;

	int asic_reset_res;
	struct work_struct		xgmi_reset_work;

@@ -505,8 +505,7 @@ static void remove_kgd_mem_from_kfd_bo_list(struct kgd_mem *mem,
 *
 * Returns 0 for success, negative errno for errors.
 */
static int init_user_pages(struct kgd_mem *mem, struct mm_struct *mm,
			   uint64_t user_addr)
static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
{
	struct amdkfd_process_info *process_info = mem->process_info;
	struct amdgpu_bo *bo = mem->bo;

@@ -1199,7 +1198,7 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
	add_kgd_mem_to_kfd_bo_list(*mem, avm->process_info, user_addr);

	if (user_addr) {
		ret = init_user_pages(*mem, current->mm, user_addr);
		ret = init_user_pages(*mem, user_addr);
		if (ret)
			goto allocate_init_user_pages_failed;
	}

@@ -1744,6 +1743,10 @@ static int update_invalid_user_pages(struct amdkfd_process_info *process_info,
			return ret;
		}

		/*
		 * FIXME: Cannot ignore the return code, must hold
		 * notifier_lock
		 */
		amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);

		/* Mark the BO as valid unless it was invalidated

@@ -538,8 +538,6 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
		e->tv.num_shared = 2;

	amdgpu_bo_list_get_list(p->bo_list, &p->validated);
	if (p->bo_list->first_userptr != p->bo_list->num_entries)
		p->mn = amdgpu_mn_get(p->adev, AMDGPU_MN_TYPE_GFX);

	INIT_LIST_HEAD(&duplicates);
	amdgpu_vm_get_pd_bo(&fpriv->vm, &p->validated, &p->vm_pd);

@@ -1219,11 +1217,11 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
	if (r)
		goto error_unlock;

	/* No memory allocation is allowed while holding the mn lock.
	 * p->mn is hold until amdgpu_cs_submit is finished and fence is added
	 * to BOs.
	/* No memory allocation is allowed while holding the notifier lock.
	 * The lock is held until amdgpu_cs_submit is finished and fence is
	 * added to BOs.
	 */
	amdgpu_mn_lock(p->mn);
	mutex_lock(&p->adev->notifier_lock);

	/* If userptr are invalidated after amdgpu_cs_parser_bos(), return
	 * -EAGAIN, drmIoctl in libdrm will restart the amdgpu_cs_ioctl.

@@ -1266,13 +1264,13 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
	amdgpu_vm_move_to_lru_tail(p->adev, &fpriv->vm);

	ttm_eu_fence_buffer_objects(&p->ticket, &p->validated, p->fence);
	amdgpu_mn_unlock(p->mn);
	mutex_unlock(&p->adev->notifier_lock);

	return 0;

error_abort:
	drm_sched_job_cleanup(&job->base);
	amdgpu_mn_unlock(p->mn);
	mutex_unlock(&p->adev->notifier_lock);

error_unlock:
	amdgpu_job_free(job);

@@ -2794,6 +2794,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
	mutex_init(&adev->virt.vf_errors.lock);
	hash_init(adev->mn_hash);
	mutex_init(&adev->lock_reset);
	mutex_init(&adev->notifier_lock);
	mutex_init(&adev->virt.dpm_mutex);
	mutex_init(&adev->psp.mutex);
@ -51,439 +51,107 @@
|
|||
#include "amdgpu_amdkfd.h"
|
||||
|
||||
/**
|
||||
* struct amdgpu_mn_node
|
||||
* amdgpu_mn_invalidate_gfx - callback to notify about mm change
|
||||
*
|
||||
* @it: interval node defining start-last of the affected address range
|
||||
* @bos: list of all BOs in the affected address range
|
||||
*
|
||||
* Manages all BOs which are affected of a certain range of address space.
|
||||
*/
|
||||
struct amdgpu_mn_node {
|
||||
struct interval_tree_node it;
|
||||
struct list_head bos;
|
||||
};
|
||||
|
||||
/**
|
||||
* amdgpu_mn_destroy - destroy the HMM mirror
|
||||
*
|
||||
* @work: previously sheduled work item
|
||||
*
|
||||
* Lazy destroys the notifier from a work item
|
||||
*/
|
||||
static void amdgpu_mn_destroy(struct work_struct *work)
|
||||
{
|
||||
struct amdgpu_mn *amn = container_of(work, struct amdgpu_mn, work);
|
||||
struct amdgpu_device *adev = amn->adev;
|
||||
struct amdgpu_mn_node *node, *next_node;
|
||||
struct amdgpu_bo *bo, *next_bo;
|
||||
|
||||
mutex_lock(&adev->mn_lock);
|
||||
down_write(&amn->lock);
|
||||
hash_del(&amn->node);
|
||||
rbtree_postorder_for_each_entry_safe(node, next_node,
|
||||
&amn->objects.rb_root, it.rb) {
|
||||
list_for_each_entry_safe(bo, next_bo, &node->bos, mn_list) {
|
||||
bo->mn = NULL;
|
||||
list_del_init(&bo->mn_list);
|
||||
}
|
||||
kfree(node);
|
||||
}
|
||||
up_write(&amn->lock);
|
||||
mutex_unlock(&adev->mn_lock);
|
||||
|
||||
hmm_mirror_unregister(&amn->mirror);
|
||||
kfree(amn);
|
||||
}
|
||||
|
||||
/**
|
||||
* amdgpu_hmm_mirror_release - callback to notify about mm destruction
|
||||
*
|
||||
* @mirror: the HMM mirror (mm) this callback is about
|
||||
*
|
||||
* Shedule a work item to lazy destroy HMM mirror.
|
||||
*/
|
||||
static void amdgpu_hmm_mirror_release(struct hmm_mirror *mirror)
|
||||
{
|
||||
struct amdgpu_mn *amn = container_of(mirror, struct amdgpu_mn, mirror);
|
||||
|
||||
INIT_WORK(&amn->work, amdgpu_mn_destroy);
|
||||
schedule_work(&amn->work);
|
||||
}
|
||||
|
||||
/**
|
||||
* amdgpu_mn_lock - take the write side lock for this notifier
|
||||
*
|
||||
* @mn: our notifier
|
||||
*/
|
||||
void amdgpu_mn_lock(struct amdgpu_mn *mn)
|
||||
{
|
||||
if (mn)
|
||||
down_write(&mn->lock);
|
||||
}
|
||||
|
||||
/**
|
||||
* amdgpu_mn_unlock - drop the write side lock for this notifier
|
||||
*
|
||||
* @mn: our notifier
|
||||
*/
|
||||
void amdgpu_mn_unlock(struct amdgpu_mn *mn)
|
||||
{
|
||||
if (mn)
|
||||
up_write(&mn->lock);
|
||||
}
|
||||
|
||||
/**
|
||||
* amdgpu_mn_read_lock - take the read side lock for this notifier
|
||||
*
|
||||
* @amn: our notifier
|
||||
* @blockable: is the notifier blockable
|
||||
*/
|
||||
static int amdgpu_mn_read_lock(struct amdgpu_mn *amn, bool blockable)
|
||||
{
|
||||
if (blockable)
|
||||
down_read(&amn->lock);
|
||||
else if (!down_read_trylock(&amn->lock))
|
||||
return -EAGAIN;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* amdgpu_mn_read_unlock - drop the read side lock for this notifier
|
||||
*
|
||||
* @amn: our notifier
|
||||
*/
|
||||
static void amdgpu_mn_read_unlock(struct amdgpu_mn *amn)
|
||||
{
|
||||
up_read(&amn->lock);
|
||||
}
|
||||
|
||||
/**
|
||||
* amdgpu_mn_invalidate_node - unmap all BOs of a node
|
||||
*
|
||||
* @node: the node with the BOs to unmap
|
||||
* @start: start of address range affected
|
||||
* @end: end of address range affected
|
||||
* @mni: the range (mm) is about to update
|
||||
* @range: details on the invalidation
|
||||
* @cur_seq: Value to pass to mmu_interval_set_seq()
|
||||
*
|
||||
* Block for operations on BOs to finish and mark pages as accessed and
|
||||
* potentially dirty.
|
||||
*/
|
||||
static void amdgpu_mn_invalidate_node(struct amdgpu_mn_node *node,
|
||||
unsigned long start,
|
||||
unsigned long end)
|
||||
static bool amdgpu_mn_invalidate_gfx(struct mmu_interval_notifier *mni,
|
||||
const struct mmu_notifier_range *range,
|
||||
unsigned long cur_seq)
|
||||
{
|
||||
struct amdgpu_bo *bo;
|
||||
struct amdgpu_bo *bo = container_of(mni, struct amdgpu_bo, notifier);
|
||||
struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
|
||||
long r;
|
||||
|
||||
list_for_each_entry(bo, &node->bos, mn_list) {
|
||||
if (!mmu_notifier_range_blockable(range))
|
||||
return false;
|
||||
|
||||
if (!amdgpu_ttm_tt_affect_userptr(bo->tbo.ttm, start, end))
|
||||
continue;
|
||||
mutex_lock(&adev->notifier_lock);
|
||||
|
||||
r = dma_resv_wait_timeout_rcu(bo->tbo.base.resv,
|
||||
true, false, MAX_SCHEDULE_TIMEOUT);
|
||||
if (r <= 0)
|
||||
DRM_ERROR("(%ld) failed to wait for user bo\n", r);
|
||||
}
|
||||
mmu_interval_set_seq(mni, cur_seq);
|
||||
|
||||
r = dma_resv_wait_timeout_rcu(bo->tbo.base.resv, true, false,
|
||||
MAX_SCHEDULE_TIMEOUT);
|
||||
mutex_unlock(&adev->notifier_lock);
|
||||
if (r <= 0)
|
||||
DRM_ERROR("(%ld) failed to wait for user bo\n", r);
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* amdgpu_mn_sync_pagetables_gfx - callback to notify about mm change
|
||||
*
|
||||
* @mirror: the hmm_mirror (mm) is about to update
|
||||
* @update: the update start, end address
|
||||
*
|
||||
* Block for operations on BOs to finish and mark pages as accessed and
|
||||
* potentially dirty.
|
||||
*/
|
||||
static int
|
||||
amdgpu_mn_sync_pagetables_gfx(struct hmm_mirror *mirror,
|
||||
const struct mmu_notifier_range *update)
|
||||
{
|
||||
struct amdgpu_mn *amn = container_of(mirror, struct amdgpu_mn, mirror);
|
||||
unsigned long start = update->start;
|
||||
unsigned long end = update->end;
|
||||
bool blockable = mmu_notifier_range_blockable(update);
|
||||
struct interval_tree_node *it;
|
||||
|
||||
/* notification is exclusive, but interval is inclusive */
|
||||
end -= 1;
|
||||
|
||||
/* TODO we should be able to split locking for interval tree and
|
||||
* amdgpu_mn_invalidate_node
|
||||
*/
|
||||
if (amdgpu_mn_read_lock(amn, blockable))
|
||||
return -EAGAIN;
|
||||
|
||||
it = interval_tree_iter_first(&amn->objects, start, end);
|
||||
while (it) {
|
||||
struct amdgpu_mn_node *node;
|
||||
|
||||
if (!blockable) {
|
||||
amdgpu_mn_read_unlock(amn);
|
||||
return -EAGAIN;
|
||||
}
|
||||
|
||||
node = container_of(it, struct amdgpu_mn_node, it);
|
||||
it = interval_tree_iter_next(it, start, end);
|
||||
|
||||
amdgpu_mn_invalidate_node(node, start, end);
|
||||
}
|
||||
|
||||
amdgpu_mn_read_unlock(amn);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* amdgpu_mn_sync_pagetables_hsa - callback to notify about mm change
|
||||
*
|
||||
* @mirror: the hmm_mirror (mm) is about to update
|
||||
* @update: the update start, end address
|
||||
*
|
||||
* We temporarily evict all BOs between start and end. This
|
||||
* necessitates evicting all user-mode queues of the process. The BOs
|
||||
* are restorted in amdgpu_mn_invalidate_range_end_hsa.
|
||||
*/
|
||||
static int
|
||||
amdgpu_mn_sync_pagetables_hsa(struct hmm_mirror *mirror,
|
||||
const struct mmu_notifier_range *update)
|
||||
{
|
||||
struct amdgpu_mn *amn = container_of(mirror, struct amdgpu_mn, mirror);
|
||||
unsigned long start = update->start;
|
||||
unsigned long end = update->end;
|
||||
bool blockable = mmu_notifier_range_blockable(update);
|
||||
struct interval_tree_node *it;
|
||||
|
||||
/* notification is exclusive, but interval is inclusive */
|
||||
end -= 1;
|
||||
|
||||
if (amdgpu_mn_read_lock(amn, blockable))
|
||||
return -EAGAIN;
|
||||
|
||||
it = interval_tree_iter_first(&amn->objects, start, end);
|
||||
while (it) {
|
||||
struct amdgpu_mn_node *node;
|
||||
struct amdgpu_bo *bo;
|
||||
|
||||
if (!blockable) {
|
||||
amdgpu_mn_read_unlock(amn);
|
||||
return -EAGAIN;
|
||||
}
|
||||
|
||||
node = container_of(it, struct amdgpu_mn_node, it);
|
||||
it = interval_tree_iter_next(it, start, end);
|
||||
|
||||
list_for_each_entry(bo, &node->bos, mn_list) {
|
||||
struct kgd_mem *mem = bo->kfd_bo;
|
||||
|
||||
if (amdgpu_ttm_tt_affect_userptr(bo->tbo.ttm,
|
||||
start, end))
|
||||
amdgpu_amdkfd_evict_userptr(mem, amn->mm);
|
||||
}
|
||||
}
|
||||
|
||||
amdgpu_mn_read_unlock(amn);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Low bits of any reasonable mm pointer will be unused due to struct
|
||||
* alignment. Use these bits to make a unique key from the mm pointer
|
||||
* and notifier type.
|
||||
*/
|
||||
#define AMDGPU_MN_KEY(mm, type) ((unsigned long)(mm) + (type))
|
||||
|
||||
static struct hmm_mirror_ops amdgpu_hmm_mirror_ops[] = {
|
||||
[AMDGPU_MN_TYPE_GFX] = {
|
||||
.sync_cpu_device_pagetables = amdgpu_mn_sync_pagetables_gfx,
|
||||
.release = amdgpu_hmm_mirror_release
|
||||
},
|
||||
[AMDGPU_MN_TYPE_HSA] = {
|
||||
.sync_cpu_device_pagetables = amdgpu_mn_sync_pagetables_hsa,
|
||||
.release = amdgpu_hmm_mirror_release
|
||||
},
|
||||
static const struct mmu_interval_notifier_ops amdgpu_mn_gfx_ops = {
|
||||
.invalidate = amdgpu_mn_invalidate_gfx,
|
||||
};
|
||||
|
||||
/**
|
||||
* amdgpu_mn_get - create HMM mirror context
|
||||
* amdgpu_mn_invalidate_hsa - callback to notify about mm change
|
||||
*
|
||||
* @adev: amdgpu device pointer
|
||||
* @type: type of MMU notifier context
|
||||
* @mni: the range (mm) is about to update
|
||||
* @range: details on the invalidation
|
||||
* @cur_seq: Value to pass to mmu_interval_set_seq()
|
||||
*
|
||||
* Creates a HMM mirror context for current->mm.
|
||||
* We temporarily evict the BO attached to this range. This necessitates
|
||||
* evicting all user-mode queues of the process.
|
||||
*/
|
||||
struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev,
|
||||
enum amdgpu_mn_type type)
|
||||
static bool amdgpu_mn_invalidate_hsa(struct mmu_interval_notifier *mni,
|
||||
const struct mmu_notifier_range *range,
|
||||
unsigned long cur_seq)
|
||||
{
|
||||
struct mm_struct *mm = current->mm;
|
||||
struct amdgpu_mn *amn;
|
||||
unsigned long key = AMDGPU_MN_KEY(mm, type);
|
||||
int r;
|
||||
struct amdgpu_bo *bo = container_of(mni, struct amdgpu_bo, notifier);
|
||||
struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
|
||||
|
||||
mutex_lock(&adev->mn_lock);
|
||||
if (down_write_killable(&mm->mmap_sem)) {
|
||||
mutex_unlock(&adev->mn_lock);
|
||||
return ERR_PTR(-EINTR);
|
||||
}
|
||||
if (!mmu_notifier_range_blockable(range))
|
||||
return false;
|
||||
|
||||
hash_for_each_possible(adev->mn_hash, amn, node, key)
|
||||
if (AMDGPU_MN_KEY(amn->mm, amn->type) == key)
|
||||
goto release_locks;
|
||||
mutex_lock(&adev->notifier_lock);
|
||||
|
||||
amn = kzalloc(sizeof(*amn), GFP_KERNEL);
|
||||
if (!amn) {
|
||||
amn = ERR_PTR(-ENOMEM);
|
||||
goto release_locks;
|
||||
}
|
||||
mmu_interval_set_seq(mni, cur_seq);
|
||||
|
||||
amn->adev = adev;
|
||||
amn->mm = mm;
|
||||
init_rwsem(&amn->lock);
|
||||
amn->type = type;
|
||||
amn->objects = RB_ROOT_CACHED;
|
||||
amdgpu_amdkfd_evict_userptr(bo->kfd_bo, bo->notifier.mm);
|
||||
mutex_unlock(&adev->notifier_lock);
|
||||
|
||||
amn->mirror.ops = &amdgpu_hmm_mirror_ops[type];
|
||||
r = hmm_mirror_register(&amn->mirror, mm);
|
||||
if (r)
|
||||
goto free_amn;
|
||||
|
||||
hash_add(adev->mn_hash, &amn->node, AMDGPU_MN_KEY(mm, type));
|
||||
|
||||
release_locks:
|
||||
up_write(&mm->mmap_sem);
|
||||
mutex_unlock(&adev->mn_lock);
|
||||
|
||||
return amn;
|
||||
|
||||
free_amn:
|
||||
up_write(&mm->mmap_sem);
|
||||
mutex_unlock(&adev->mn_lock);
|
||||
kfree(amn);
|
||||
|
||||
return ERR_PTR(r);
|
||||
return true;
|
||||
}
|
||||
|
||||
static const struct mmu_interval_notifier_ops amdgpu_mn_hsa_ops = {
|
||||
.invalidate = amdgpu_mn_invalidate_hsa,
|
||||
};
|
||||
|
||||
/**
|
||||
* amdgpu_mn_register - register a BO for notifier updates
|
||||
*
|
||||
* @bo: amdgpu buffer object
|
||||
* @addr: userptr addr we should monitor
|
||||
*
|
||||
* Registers an HMM mirror for the given BO at the specified address.
|
||||
* Registers a mmu_notifier for the given BO at the specified address.
|
||||
* Returns 0 on success, -ERRNO if anything goes wrong.
|
||||
*/
|
||||
int amdgpu_mn_register(struct amdgpu_bo *bo, unsigned long addr)
|
||||
{
|
||||
unsigned long end = addr + amdgpu_bo_size(bo) - 1;
|
||||
struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
|
||||
enum amdgpu_mn_type type =
|
||||
bo->kfd_bo ? AMDGPU_MN_TYPE_HSA : AMDGPU_MN_TYPE_GFX;
|
||||
struct amdgpu_mn *amn;
|
||||
struct amdgpu_mn_node *node = NULL, *new_node;
|
||||
struct list_head bos;
|
||||
struct interval_tree_node *it;
|
||||
|
||||
amn = amdgpu_mn_get(adev, type);
|
||||
if (IS_ERR(amn))
|
||||
return PTR_ERR(amn);
|
||||
|
||||
new_node = kmalloc(sizeof(*new_node), GFP_KERNEL);
|
||||
if (!new_node)
|
||||
return -ENOMEM;
|
||||
|
||||
INIT_LIST_HEAD(&bos);
|
||||
|
||||
down_write(&amn->lock);
|
||||
|
||||
while ((it = interval_tree_iter_first(&amn->objects, addr, end))) {
|
||||
kfree(node);
|
||||
node = container_of(it, struct amdgpu_mn_node, it);
|
||||
interval_tree_remove(&node->it, &amn->objects);
|
||||
addr = min(it->start, addr);
|
||||
end = max(it->last, end);
|
||||
list_splice(&node->bos, &bos);
|
||||
}
|
||||
|
||||
if (!node)
|
||||
node = new_node;
|
||||
else
|
||||
kfree(new_node);
|
||||
|
||||
bo->mn = amn;
|
||||
|
||||
node->it.start = addr;
|
||||
node->it.last = end;
|
||||
INIT_LIST_HEAD(&node->bos);
|
||||
list_splice(&bos, &node->bos);
|
||||
list_add(&bo->mn_list, &node->bos);
|
||||
|
||||
interval_tree_insert(&node->it, &amn->objects);
|
||||
|
||||
up_write(&amn->lock);
|
||||
|
||||
return 0;
|
||||
if (bo->kfd_bo)
|
||||
return mmu_interval_notifier_insert(&bo->notifier, current->mm,
|
||||
addr, amdgpu_bo_size(bo),
|
||||
&amdgpu_mn_hsa_ops);
|
||||
return mmu_interval_notifier_insert(&bo->notifier, current->mm, addr,
|
||||
amdgpu_bo_size(bo),
|
||||
&amdgpu_mn_gfx_ops);
|
||||
}
|
||||
|
||||
/**
|
||||
* amdgpu_mn_unregister - unregister a BO for HMM mirror updates
|
||||
* amdgpu_mn_unregister - unregister a BO for notifier updates
|
||||
*
|
||||
* @bo: amdgpu buffer object
|
||||
*
|
||||
* Remove any registration of HMM mirror updates from the buffer object.
|
||||
* Remove any registration of mmu notifier updates from the buffer object.
|
||||
*/
|
||||
void amdgpu_mn_unregister(struct amdgpu_bo *bo)
|
||||
{
|
||||
struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
|
||||
struct amdgpu_mn *amn;
|
||||
struct list_head *head;
|
||||
|
||||
mutex_lock(&adev->mn_lock);
|
||||
|
||||
amn = bo->mn;
|
||||
if (amn == NULL) {
|
||||
mutex_unlock(&adev->mn_lock);
|
||||
if (!bo->notifier.mm)
|
||||
return;
|
||||
}
|
||||
|
||||
down_write(&amn->lock);
|
||||
|
||||
/* save the next list entry for later */
|
||||
head = bo->mn_list.next;
|
||||
|
||||
bo->mn = NULL;
|
||||
list_del_init(&bo->mn_list);
|
||||
|
||||
if (list_empty(head)) {
|
||||
struct amdgpu_mn_node *node;
|
||||
|
||||
node = container_of(head, struct amdgpu_mn_node, bos);
|
||||
interval_tree_remove(&node->it, &amn->objects);
|
||||
kfree(node);
|
||||
}
|
||||
|
||||
up_write(&amn->lock);
|
||||
mutex_unlock(&adev->mn_lock);
|
||||
}
|
||||
|
||||
/* flags used by HMM internal, not related to CPU/GPU PTE flags */
|
||||
static const uint64_t hmm_range_flags[HMM_PFN_FLAG_MAX] = {
|
||||
(1 << 0), /* HMM_PFN_VALID */
|
||||
(1 << 1), /* HMM_PFN_WRITE */
|
||||
0 /* HMM_PFN_DEVICE_PRIVATE */
|
||||
};
|
||||
|
||||
static const uint64_t hmm_range_values[HMM_PFN_VALUE_MAX] = {
|
||||
0xfffffffffffffffeUL, /* HMM_PFN_ERROR */
|
||||
0, /* HMM_PFN_NONE */
|
||||
0xfffffffffffffffcUL /* HMM_PFN_SPECIAL */
|
||||
};
|
||||
|
||||
void amdgpu_hmm_init_range(struct hmm_range *range)
|
||||
{
|
||||
if (range) {
|
||||
range->flags = hmm_range_flags;
|
||||
range->values = hmm_range_values;
|
||||
range->pfn_shift = PAGE_SHIFT;
|
||||
}
|
||||
mmu_interval_notifier_remove(&bo->notifier);
|
||||
bo->notifier.mm = NULL;
|
||||
}
|
||||
|
|
|
@ -30,63 +30,10 @@
|
|||
#include <linux/workqueue.h>
|
||||
#include <linux/interval_tree.h>
|
||||
|
||||
enum amdgpu_mn_type {
|
||||
AMDGPU_MN_TYPE_GFX,
|
||||
AMDGPU_MN_TYPE_HSA,
|
||||
};
|
||||
|
||||
/**
|
||||
* struct amdgpu_mn
|
||||
*
|
||||
* @adev: amdgpu device pointer
|
||||
* @mm: process address space
|
||||
* @type: type of MMU notifier
|
||||
* @work: destruction work item
|
||||
* @node: hash table node to find structure by adev and mn
|
||||
* @lock: rw semaphore protecting the notifier nodes
|
||||
* @objects: interval tree containing amdgpu_mn_nodes
|
||||
* @mirror: HMM mirror function support
|
||||
*
|
||||
* Data for each amdgpu device and process address space.
|
||||
*/
|
||||
struct amdgpu_mn {
|
||||
/* constant after initialisation */
|
||||
struct amdgpu_device *adev;
|
||||
struct mm_struct *mm;
|
||||
enum amdgpu_mn_type type;
|
||||
|
||||
/* only used on destruction */
|
||||
struct work_struct work;
|
||||
|
||||
/* protected by adev->mn_lock */
|
||||
struct hlist_node node;
|
||||
|
||||
/* objects protected by lock */
|
||||
struct rw_semaphore lock;
|
||||
struct rb_root_cached objects;
|
||||
|
||||
#ifdef CONFIG_HMM_MIRROR
|
||||
/* HMM mirror */
|
||||
struct hmm_mirror mirror;
|
||||
#endif
|
||||
};
|
||||
|
||||
#if defined(CONFIG_HMM_MIRROR)
|
||||
void amdgpu_mn_lock(struct amdgpu_mn *mn);
|
||||
void amdgpu_mn_unlock(struct amdgpu_mn *mn);
|
||||
struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev,
|
||||
enum amdgpu_mn_type type);
|
||||
int amdgpu_mn_register(struct amdgpu_bo *bo, unsigned long addr);
|
||||
void amdgpu_mn_unregister(struct amdgpu_bo *bo);
|
||||
void amdgpu_hmm_init_range(struct hmm_range *range);
|
||||
#else
|
||||
static inline void amdgpu_mn_lock(struct amdgpu_mn *mn) {}
|
||||
static inline void amdgpu_mn_unlock(struct amdgpu_mn *mn) {}
|
||||
static inline struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev,
|
||||
enum amdgpu_mn_type type)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
static inline int amdgpu_mn_register(struct amdgpu_bo *bo, unsigned long addr)
|
||||
{
|
||||
DRM_WARN_ONCE("HMM_MIRROR kernel config option is not enabled, "
|
||||
|
|
|
@@ -30,6 +30,9 @@

#include <drm/amdgpu_drm.h>
#include "amdgpu.h"
#ifdef CONFIG_MMU_NOTIFIER
#include <linux/mmu_notifier.h>
#endif

#define AMDGPU_BO_INVALID_OFFSET	LONG_MAX
#define AMDGPU_BO_MAX_PLACEMENTS	3

@@ -101,10 +104,12 @@ struct amdgpu_bo {
	struct ttm_bo_kmap_obj		dma_buf_vmap;
	struct amdgpu_mn		*mn;

	union {
		struct list_head	mn_list;
		struct list_head	shadow_list;
	};

#ifdef CONFIG_MMU_NOTIFIER
	struct mmu_interval_notifier	notifier;
#endif

	struct list_head		shadow_list;

	struct kgd_mem                  *kfd_bo;
};
@ -35,6 +35,7 @@
|
|||
#include <linux/hmm.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/sched/task.h>
|
||||
#include <linux/sched/mm.h>
|
||||
#include <linux/seq_file.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/swap.h>
|
||||
|
@ -769,6 +770,20 @@ struct amdgpu_ttm_tt {
|
|||
#endif
|
||||
};
|
||||
|
||||
#ifdef CONFIG_DRM_AMDGPU_USERPTR
|
||||
/* flags used by HMM internal, not related to CPU/GPU PTE flags */
|
||||
static const uint64_t hmm_range_flags[HMM_PFN_FLAG_MAX] = {
|
||||
(1 << 0), /* HMM_PFN_VALID */
|
||||
(1 << 1), /* HMM_PFN_WRITE */
|
||||
0 /* HMM_PFN_DEVICE_PRIVATE */
|
||||
};
|
||||
|
||||
static const uint64_t hmm_range_values[HMM_PFN_VALUE_MAX] = {
|
||||
0xfffffffffffffffeUL, /* HMM_PFN_ERROR */
|
||||
0, /* HMM_PFN_NONE */
|
||||
0xfffffffffffffffcUL /* HMM_PFN_SPECIAL */
|
||||
};
|
||||
|
||||
/**
|
||||
* amdgpu_ttm_tt_get_user_pages - get device accessible pages that back user
|
||||
* memory and start HMM tracking CPU page table update
|
||||
|
@ -776,85 +791,89 @@ struct amdgpu_ttm_tt {
|
|||
* Calling function must call amdgpu_ttm_tt_userptr_range_done() once and only
|
||||
* once afterwards to stop HMM tracking
|
||||
*/
|
||||
#if IS_ENABLED(CONFIG_DRM_AMDGPU_USERPTR)
|
||||
|
||||
#define MAX_RETRY_HMM_RANGE_FAULT 16
|
||||
|
||||
int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages)
|
||||
{
|
||||
struct hmm_mirror *mirror = bo->mn ? &bo->mn->mirror : NULL;
|
||||
struct ttm_tt *ttm = bo->tbo.ttm;
|
||||
struct amdgpu_ttm_tt *gtt = (void *)ttm;
|
||||
struct mm_struct *mm = gtt->usertask->mm;
|
||||
unsigned long start = gtt->userptr;
|
||||
struct vm_area_struct *vma;
|
||||
struct hmm_range *range;
|
||||
unsigned long timeout;
|
||||
struct mm_struct *mm;
|
||||
unsigned long i;
|
||||
uint64_t *pfns;
|
||||
int r = 0;
|
||||
|
||||
if (!mm) /* Happens during process shutdown */
|
||||
mm = bo->notifier.mm;
|
||||
if (unlikely(!mm)) {
|
||||
DRM_DEBUG_DRIVER("BO is not registered?\n");
|
||||
return -EFAULT;
|
||||
}
|
||||
|
||||
/* Another get_user_pages is running at the same time?? */
|
||||
if (WARN_ON(gtt->range))
|
||||
return -EFAULT;
|
||||
|
||||
if (!mmget_not_zero(mm)) /* Happens during process shutdown */
|
||||
return -ESRCH;
|
||||
|
||||
if (unlikely(!mirror)) {
|
||||
DRM_DEBUG_DRIVER("Failed to get hmm_mirror\n");
|
||||
r = -EFAULT;
|
||||
goto out;
|
||||
}
|
||||
|
||||
vma = find_vma(mm, start);
|
||||
if (unlikely(!vma || start < vma->vm_start)) {
|
||||
r = -EFAULT;
|
||||
goto out;
|
||||
}
|
||||
if (unlikely((gtt->userflags & AMDGPU_GEM_USERPTR_ANONONLY) &&
|
||||
vma->vm_file)) {
|
||||
r = -EPERM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
range = kzalloc(sizeof(*range), GFP_KERNEL);
|
||||
if (unlikely(!range)) {
|
||||
r = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
range->notifier = &bo->notifier;
|
||||
range->flags = hmm_range_flags;
|
||||
range->values = hmm_range_values;
|
||||
range->pfn_shift = PAGE_SHIFT;
|
||||
range->start = bo->notifier.interval_tree.start;
|
||||
range->end = bo->notifier.interval_tree.last + 1;
|
||||
range->default_flags = hmm_range_flags[HMM_PFN_VALID];
|
||||
if (!amdgpu_ttm_tt_is_readonly(ttm))
|
||||
range->default_flags |= range->flags[HMM_PFN_WRITE];
|
||||
|
||||
pfns = kvmalloc_array(ttm->num_pages, sizeof(*pfns), GFP_KERNEL);
|
||||
if (unlikely(!pfns)) {
|
||||
range->pfns = kvmalloc_array(ttm->num_pages, sizeof(*range->pfns),
|
||||
GFP_KERNEL);
|
||||
if (unlikely(!range->pfns)) {
|
||||
r = -ENOMEM;
|
||||
goto out_free_ranges;
|
||||
}
|
||||
|
||||
amdgpu_hmm_init_range(range);
|
||||
range->default_flags = range->flags[HMM_PFN_VALID];
|
||||
range->default_flags |= amdgpu_ttm_tt_is_readonly(ttm) ?
|
||||
0 : range->flags[HMM_PFN_WRITE];
|
||||
range->pfn_flags_mask = 0;
|
||||
range->pfns = pfns;
|
||||
range->start = start;
|
||||
range->end = start + ttm->num_pages * PAGE_SIZE;
|
||||
down_read(&mm->mmap_sem);
|
||||
vma = find_vma(mm, start);
|
||||
if (unlikely(!vma || start < vma->vm_start)) {
|
||||
r = -EFAULT;
|
||||
goto out_unlock;
|
||||
}
|
||||
if (unlikely((gtt->userflags & AMDGPU_GEM_USERPTR_ANONONLY) &&
|
||||
vma->vm_file)) {
|
||||
r = -EPERM;
|
||||
goto out_unlock;
|
||||
}
|
||||
up_read(&mm->mmap_sem);
|
||||
timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
|
||||
|
||||
hmm_range_register(range, mirror);
|
||||
|
||||
/*
|
||||
* Just wait for range to be valid, safe to ignore return value as we
|
||||
* will use the return value of hmm_range_fault() below under the
|
||||
* mmap_sem to ascertain the validity of the range.
|
||||
*/
|
||||
hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT);
|
||||
retry:
|
||||
range->notifier_seq = mmu_interval_read_begin(&bo->notifier);
|
||||
|
||||
down_read(&mm->mmap_sem);
|
||||
r = hmm_range_fault(range, 0);
|
||||
up_read(&mm->mmap_sem);
|
||||
|
||||
if (unlikely(r < 0))
|
||||
if (unlikely(r <= 0)) {
|
||||
/*
|
||||
* FIXME: This timeout should encompass the retry from
|
||||
* mmu_interval_read_retry() as well.
|
||||
*/
|
||||
if ((r == 0 || r == -EBUSY) && !time_after(jiffies, timeout))
|
||||
goto retry;
|
||||
goto out_free_pfns;
|
||||
}
|
||||
|
||||
for (i = 0; i < ttm->num_pages; i++) {
|
||||
pages[i] = hmm_device_entry_to_page(range, pfns[i]);
|
||||
/* FIXME: The pages cannot be touched outside the notifier_lock */
|
||||
pages[i] = hmm_device_entry_to_page(range, range->pfns[i]);
|
||||
if (unlikely(!pages[i])) {
|
||||
pr_err("Page fault failed for pfn[%lu] = 0x%llx\n",
|
||||
i, pfns[i]);
|
||||
i, range->pfns[i]);
|
||||
r = -ENOMEM;
|
||||
|
||||
goto out_free_pfns;
|
||||
|
@ -862,15 +881,18 @@ int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages)
|
|||
}
|
||||
|
||||
gtt->range = range;
|
||||
mmput(mm);
|
||||
|
||||
return 0;
|
||||
|
||||
out_unlock:
|
||||
up_read(&mm->mmap_sem);
|
||||
out_free_pfns:
|
||||
hmm_range_unregister(range);
|
||||
kvfree(pfns);
|
||||
kvfree(range->pfns);
|
||||
out_free_ranges:
|
||||
kfree(range);
|
||||
out:
|
||||
mmput(mm);
|
||||
return r;
|
||||
}
|
||||
|
||||
|
@ -895,15 +917,18 @@ bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm)
|
|||
"No user pages to check\n");
|
||||
|
||||
if (gtt->range) {
|
||||
r = hmm_range_valid(gtt->range);
|
||||
hmm_range_unregister(gtt->range);
|
||||
|
||||
/*
|
||||
* FIXME: Must always hold notifier_lock for this, and must
|
||||
* not ignore the return code.
|
||||
*/
|
||||
r = mmu_interval_read_retry(gtt->range->notifier,
|
||||
gtt->range->notifier_seq);
|
||||
kvfree(gtt->range->pfns);
|
||||
kfree(gtt->range);
|
||||
gtt->range = NULL;
|
||||
}
|
||||
|
||||
return r;
|
||||
return !r;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -984,10 +1009,18 @@ static void amdgpu_ttm_tt_unpin_userptr(struct ttm_tt *ttm)
|
|||
sg_free_table(ttm->sg);
|
||||
|
||||
#if IS_ENABLED(CONFIG_DRM_AMDGPU_USERPTR)
|
||||
if (gtt->range &&
|
||||
ttm->pages[0] == hmm_device_entry_to_page(gtt->range,
|
||||
gtt->range->pfns[0]))
|
||||
WARN_ONCE(1, "Missing get_user_page_done\n");
|
||||
if (gtt->range) {
|
||||
unsigned long i;
|
||||
|
||||
for (i = 0; i < ttm->num_pages; i++) {
|
||||
if (ttm->pages[i] !=
|
||||
hmm_device_entry_to_page(gtt->range,
|
||||
gtt->range->pfns[i]))
|
||||
break;
|
||||
}
|
||||
|
||||
WARN((i == ttm->num_pages), "Missing get_user_page_done\n");
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
|
@ -88,6 +88,7 @@ nouveau_ivmm_find(struct nouveau_svm *svm, u64 inst)
|
|||
}
|
||||
|
||||
struct nouveau_svmm {
|
||||
struct mmu_notifier notifier;
|
||||
struct nouveau_vmm *vmm;
|
||||
struct {
|
||||
unsigned long start;
|
||||
|
@ -95,9 +96,6 @@ struct nouveau_svmm {
|
|||
} unmanaged;
|
||||
|
||||
struct mutex mutex;
|
||||
|
||||
struct mm_struct *mm;
|
||||
struct hmm_mirror mirror;
|
||||
};
|
||||
|
||||
#define SVMM_DBG(s,f,a...) \
|
||||
|
@ -251,10 +249,11 @@ nouveau_svmm_invalidate(struct nouveau_svmm *svmm, u64 start, u64 limit)
|
|||
}
|
||||
|
||||
static int
|
||||
nouveau_svmm_sync_cpu_device_pagetables(struct hmm_mirror *mirror,
|
||||
const struct mmu_notifier_range *update)
|
||||
nouveau_svmm_invalidate_range_start(struct mmu_notifier *mn,
|
||||
const struct mmu_notifier_range *update)
|
||||
{
|
||||
struct nouveau_svmm *svmm = container_of(mirror, typeof(*svmm), mirror);
|
||||
struct nouveau_svmm *svmm =
|
||||
container_of(mn, struct nouveau_svmm, notifier);
|
||||
unsigned long start = update->start;
|
||||
unsigned long limit = update->end;
|
||||
|
||||
|
@ -264,6 +263,9 @@ nouveau_svmm_sync_cpu_device_pagetables(struct hmm_mirror *mirror,
|
|||
SVMM_DBG(svmm, "invalidate %016lx-%016lx", start, limit);
|
||||
|
||||
mutex_lock(&svmm->mutex);
|
||||
if (unlikely(!svmm->vmm))
|
||||
goto out;
|
||||
|
||||
if (limit > svmm->unmanaged.start && start < svmm->unmanaged.limit) {
|
||||
if (start < svmm->unmanaged.start) {
|
||||
nouveau_svmm_invalidate(svmm, start,
|
||||
|
@ -273,19 +275,20 @@ nouveau_svmm_sync_cpu_device_pagetables(struct hmm_mirror *mirror,
|
|||
}
|
||||
|
||||
nouveau_svmm_invalidate(svmm, start, limit);
|
||||
|
||||
out:
|
||||
mutex_unlock(&svmm->mutex);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void
|
||||
nouveau_svmm_release(struct hmm_mirror *mirror)
|
||||
static void nouveau_svmm_free_notifier(struct mmu_notifier *mn)
|
||||
{
|
||||
kfree(container_of(mn, struct nouveau_svmm, notifier));
|
||||
}
|
||||
|
||||
static const struct hmm_mirror_ops
|
||||
nouveau_svmm = {
|
||||
.sync_cpu_device_pagetables = nouveau_svmm_sync_cpu_device_pagetables,
|
||||
.release = nouveau_svmm_release,
|
||||
static const struct mmu_notifier_ops nouveau_mn_ops = {
|
||||
.invalidate_range_start = nouveau_svmm_invalidate_range_start,
|
||||
.free_notifier = nouveau_svmm_free_notifier,
|
||||
};
|
||||
|
||||
void
|
||||
|
@ -293,8 +296,10 @@ nouveau_svmm_fini(struct nouveau_svmm **psvmm)
|
|||
{
|
||||
struct nouveau_svmm *svmm = *psvmm;
|
||||
if (svmm) {
|
||||
hmm_mirror_unregister(&svmm->mirror);
|
||||
kfree(*psvmm);
|
||||
mutex_lock(&svmm->mutex);
|
||||
svmm->vmm = NULL;
|
||||
mutex_unlock(&svmm->mutex);
|
||||
mmu_notifier_put(&svmm->notifier);
|
||||
*psvmm = NULL;
|
||||
}
|
||||
}
|
||||
|
@ -320,7 +325,7 @@ nouveau_svmm_init(struct drm_device *dev, void *data,
|
|||
mutex_lock(&cli->mutex);
|
||||
if (cli->svm.cli) {
|
||||
ret = -EBUSY;
|
||||
goto done;
|
||||
goto out_free;
|
||||
}
|
||||
|
||||
/* Allocate a new GPU VMM that can support SVM (managed by the
|
||||
|
@ -335,24 +340,26 @@ nouveau_svmm_init(struct drm_device *dev, void *data,
|
|||
.fault_replay = true,
|
||||
}, sizeof(struct gp100_vmm_v0), &cli->svm.vmm);
|
||||
if (ret)
|
||||
goto done;
|
||||
goto out_free;
|
||||
|
||||
/* Enable HMM mirroring of CPU address-space to VMM. */
|
||||
svmm->mm = get_task_mm(current);
|
||||
down_write(&svmm->mm->mmap_sem);
|
||||
svmm->mirror.ops = &nouveau_svmm;
|
||||
ret = hmm_mirror_register(&svmm->mirror, svmm->mm);
|
||||
if (ret == 0) {
|
||||
cli->svm.svmm = svmm;
|
||||
cli->svm.cli = cli;
|
||||
}
|
||||
up_write(&svmm->mm->mmap_sem);
|
||||
mmput(svmm->mm);
|
||||
|
||||
done:
|
||||
down_write(¤t->mm->mmap_sem);
|
||||
svmm->notifier.ops = &nouveau_mn_ops;
|
||||
ret = __mmu_notifier_register(&svmm->notifier, current->mm);
|
||||
if (ret)
|
||||
nouveau_svmm_fini(&svmm);
|
||||
goto out_mm_unlock;
|
||||
/* Note, ownership of svmm transfers to mmu_notifier */
|
||||
|
||||
cli->svm.svmm = svmm;
|
||||
cli->svm.cli = cli;
|
||||
up_write(¤t->mm->mmap_sem);
|
||||
mutex_unlock(&cli->mutex);
|
||||
return 0;
|
||||
|
||||
out_mm_unlock:
|
||||
up_write(¤t->mm->mmap_sem);
|
||||
out_free:
|
||||
mutex_unlock(&cli->mutex);
|
||||
kfree(svmm);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -475,43 +482,90 @@ nouveau_svm_fault_cache(struct nouveau_svm *svm,
|
|||
fault->inst, fault->addr, fault->access);
|
||||
}
|
||||
|
||||
static inline bool
|
||||
nouveau_range_done(struct hmm_range *range)
|
||||
{
|
||||
bool ret = hmm_range_valid(range);
|
||||
struct svm_notifier {
|
||||
struct mmu_interval_notifier notifier;
|
||||
struct nouveau_svmm *svmm;
|
||||
};
|
||||
|
||||
hmm_range_unregister(range);
|
||||
return ret;
|
||||
static bool nouveau_svm_range_invalidate(struct mmu_interval_notifier *mni,
|
||||
const struct mmu_notifier_range *range,
|
||||
unsigned long cur_seq)
|
||||
{
|
||||
struct svm_notifier *sn =
|
||||
container_of(mni, struct svm_notifier, notifier);
|
||||
|
||||
/*
|
||||
* serializes the update to mni->invalidate_seq done by caller and
|
||||
* prevents invalidation of the PTE from progressing while HW is being
|
||||
* programmed. This is very hacky and only works because the normal
|
||||
* notifier that does invalidation is always called after the range
|
||||
* notifier.
|
||||
*/
|
||||
if (mmu_notifier_range_blockable(range))
|
||||
mutex_lock(&sn->svmm->mutex);
|
||||
else if (!mutex_trylock(&sn->svmm->mutex))
|
||||
return false;
|
||||
mmu_interval_set_seq(mni, cur_seq);
|
||||
mutex_unlock(&sn->svmm->mutex);
|
||||
return true;
|
||||
}
|
||||
|
||||
static int
|
||||
nouveau_range_fault(struct nouveau_svmm *svmm, struct hmm_range *range)
|
||||
static const struct mmu_interval_notifier_ops nouveau_svm_mni_ops = {
|
||||
.invalidate = nouveau_svm_range_invalidate,
|
||||
};
|
||||
|
||||
static int nouveau_range_fault(struct nouveau_svmm *svmm,
|
||||
struct nouveau_drm *drm, void *data, u32 size,
|
||||
u64 *pfns, struct svm_notifier *notifier)
|
||||
{
|
||||
unsigned long timeout =
|
||||
jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
|
||||
/* Have HMM fault pages within the fault window to the GPU. */
|
||||
struct hmm_range range = {
|
||||
.notifier = ¬ifier->notifier,
|
||||
.start = notifier->notifier.interval_tree.start,
|
||||
.end = notifier->notifier.interval_tree.last + 1,
|
||||
.pfns = pfns,
|
||||
.flags = nouveau_svm_pfn_flags,
|
||||
.values = nouveau_svm_pfn_values,
|
||||
.pfn_shift = NVIF_VMM_PFNMAP_V0_ADDR_SHIFT,
|
||||
};
|
||||
struct mm_struct *mm = notifier->notifier.mm;
|
||||
long ret;
|
||||
|
||||
range->default_flags = 0;
|
||||
range->pfn_flags_mask = -1UL;
|
||||
while (true) {
|
||||
if (time_after(jiffies, timeout))
|
||||
return -EBUSY;
|
||||
|
||||
ret = hmm_range_register(range, &svmm->mirror);
|
||||
if (ret) {
|
||||
up_read(&svmm->mm->mmap_sem);
|
||||
return (int)ret;
|
||||
range.notifier_seq = mmu_interval_read_begin(range.notifier);
|
||||
range.default_flags = 0;
|
||||
range.pfn_flags_mask = -1UL;
|
||||
down_read(&mm->mmap_sem);
|
||||
ret = hmm_range_fault(&range, 0);
|
||||
up_read(&mm->mmap_sem);
|
||||
if (ret <= 0) {
|
||||
if (ret == 0 || ret == -EBUSY)
|
||||
continue;
|
||||
return ret;
|
||||
}
|
||||
|
||||
mutex_lock(&svmm->mutex);
|
||||
if (mmu_interval_read_retry(range.notifier,
|
||||
range.notifier_seq)) {
|
||||
mutex_unlock(&svmm->mutex);
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
if (!hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT)) {
|
||||
up_read(&svmm->mm->mmap_sem);
|
||||
return -EBUSY;
|
||||
}
|
||||
nouveau_dmem_convert_pfn(drm, &range);
|
||||
|
||||
ret = hmm_range_fault(range, 0);
|
||||
if (ret <= 0) {
|
||||
if (ret == 0)
|
||||
ret = -EBUSY;
|
||||
up_read(&svmm->mm->mmap_sem);
|
||||
hmm_range_unregister(range);
|
||||
return ret;
|
||||
}
|
||||
return 0;
|
||||
svmm->vmm->vmm.object.client->super = true;
|
||||
ret = nvif_object_ioctl(&svmm->vmm->vmm.object, data, size, NULL);
|
||||
svmm->vmm->vmm.object.client->super = false;
|
||||
mutex_unlock(&svmm->mutex);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int
|
||||
|
@ -531,7 +585,6 @@ nouveau_svm_fault(struct nvif_notify *notify)
|
|||
} i;
|
||||
u64 phys[16];
|
||||
} args;
|
||||
struct hmm_range range;
|
||||
struct vm_area_struct *vma;
|
||||
u64 inst, start, limit;
|
||||
int fi, fn, pi, fill;
|
||||
|
@ -587,6 +640,9 @@ nouveau_svm_fault(struct nvif_notify *notify)
|
|||
args.i.p.version = 0;
|
||||
|
||||
for (fi = 0; fn = fi + 1, fi < buffer->fault_nr; fi = fn) {
|
||||
struct svm_notifier notifier;
|
||||
struct mm_struct *mm;
|
||||
|
||||
/* Cancel any faults from non-SVM channels. */
|
||||
if (!(svmm = buffer->fault[fi]->svmm)) {
|
||||
nouveau_svm_fault_cancel_fault(svm, buffer->fault[fi]);
|
||||
|
@ -606,24 +662,32 @@ nouveau_svm_fault(struct nvif_notify *notify)
|
|||
start = max_t(u64, start, svmm->unmanaged.limit);
|
||||
SVMM_DBG(svmm, "wndw %016llx-%016llx", start, limit);
|
||||
|
||||
mm = svmm->notifier.mm;
|
||||
if (!mmget_not_zero(mm)) {
|
||||
nouveau_svm_fault_cancel_fault(svm, buffer->fault[fi]);
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Intersect fault window with the CPU VMA, cancelling
|
||||
* the fault if the address is invalid.
|
||||
*/
|
||||
down_read(&svmm->mm->mmap_sem);
|
||||
vma = find_vma_intersection(svmm->mm, start, limit);
|
||||
down_read(&mm->mmap_sem);
|
||||
vma = find_vma_intersection(mm, start, limit);
|
||||
if (!vma) {
|
||||
SVMM_ERR(svmm, "wndw %016llx-%016llx", start, limit);
|
||||
up_read(&svmm->mm->mmap_sem);
|
||||
up_read(&mm->mmap_sem);
|
||||
mmput(mm);
|
||||
nouveau_svm_fault_cancel_fault(svm, buffer->fault[fi]);
|
||||
continue;
|
||||
}
|
||||
start = max_t(u64, start, vma->vm_start);
|
||||
limit = min_t(u64, limit, vma->vm_end);
|
||||
up_read(&mm->mmap_sem);
|
||||
SVMM_DBG(svmm, "wndw %016llx-%016llx", start, limit);
|
||||
|
||||
if (buffer->fault[fi]->addr != start) {
|
||||
SVMM_ERR(svmm, "addr %016llx", buffer->fault[fi]->addr);
|
||||
up_read(&svmm->mm->mmap_sem);
|
||||
mmput(mm);
|
||||
nouveau_svm_fault_cancel_fault(svm, buffer->fault[fi]);
|
||||
continue;
|
||||
}
|
||||
|
@ -679,33 +743,19 @@ nouveau_svm_fault(struct nvif_notify *notify)
|
|||
args.i.p.addr,
|
||||
args.i.p.addr + args.i.p.size, fn - fi);
|
||||
|
||||
/* Have HMM fault pages within the fault window to the GPU. */
|
||||
range.start = args.i.p.addr;
|
||||
range.end = args.i.p.addr + args.i.p.size;
|
||||
range.pfns = args.phys;
|
||||
range.flags = nouveau_svm_pfn_flags;
|
||||
range.values = nouveau_svm_pfn_values;
|
||||
range.pfn_shift = NVIF_VMM_PFNMAP_V0_ADDR_SHIFT;
|
||||
again:
|
||||
ret = nouveau_range_fault(svmm, &range);
|
||||
if (ret == 0) {
|
||||
mutex_lock(&svmm->mutex);
|
||||
if (!nouveau_range_done(&range)) {
|
||||
mutex_unlock(&svmm->mutex);
|
||||
goto again;
|
||||
}
|
||||
|
||||
nouveau_dmem_convert_pfn(svm->drm, &range);
|
||||
|
||||
svmm->vmm->vmm.object.client->super = true;
|
||||
ret = nvif_object_ioctl(&svmm->vmm->vmm.object,
|
||||
&args, sizeof(args.i) +
|
||||
pi * sizeof(args.phys[0]),
|
||||
NULL);
|
||||
svmm->vmm->vmm.object.client->super = false;
|
||||
mutex_unlock(&svmm->mutex);
|
||||
up_read(&svmm->mm->mmap_sem);
|
||||
notifier.svmm = svmm;
|
||||
ret = mmu_interval_notifier_insert(¬ifier.notifier,
|
||||
svmm->notifier.mm,
|
||||
args.i.p.addr, args.i.p.size,
|
||||
&nouveau_svm_mni_ops);
|
||||
if (!ret) {
|
||||
ret = nouveau_range_fault(
|
||||
svmm, svm->drm, &args,
|
||||
sizeof(args.i) + pi * sizeof(args.phys[0]),
|
||||
args.phys, ¬ifier);
|
||||
mmu_interval_notifier_remove(¬ifier.notifier);
|
||||
}
|
||||
mmput(mm);
|
||||
|
||||
/* Cancel any faults in the window whose pages didn't manage
|
||||
* to keep their valid bit, or stay writeable when required.
|
||||
|
@ -714,10 +764,10 @@ nouveau_svm_fault(struct nvif_notify *notify)
|
|||
*/
|
||||
while (fi < fn) {
|
||||
struct nouveau_svm_fault *fault = buffer->fault[fi++];
|
||||
pi = (fault->addr - range.start) >> PAGE_SHIFT;
|
||||
pi = (fault->addr - args.i.p.addr) >> PAGE_SHIFT;
|
||||
if (ret ||
|
||||
!(range.pfns[pi] & NVIF_VMM_PFNMAP_V0_V) ||
|
||||
(!(range.pfns[pi] & NVIF_VMM_PFNMAP_V0_W) &&
|
||||
!(args.phys[pi] & NVIF_VMM_PFNMAP_V0_V) ||
|
||||
(!(args.phys[pi] & NVIF_VMM_PFNMAP_V0_W) &&
|
||||
fault->access != 0 && fault->access != 3)) {
|
||||
nouveau_svm_fault_cancel_fault(svm, fault);
|
||||
continue;
|
||||
|
|
|
@@ -68,6 +68,10 @@
#include <linux/hashtable.h>
#include <linux/dma-fence.h>

#ifdef CONFIG_MMU_NOTIFIER
#include <linux/mmu_notifier.h>
#endif

#include <drm/ttm/ttm_bo_api.h>
#include <drm/ttm/ttm_bo_driver.h>
#include <drm/ttm/ttm_placement.h>

@@ -509,8 +513,9 @@ struct radeon_bo {
	struct ttm_bo_kmap_obj		dma_buf_vmap;
	pid_t				pid;

	struct radeon_mn		*mn;
	struct list_head		mn_list;
#ifdef CONFIG_MMU_NOTIFIER
	struct mmu_interval_notifier	notifier;
#endif
};
#define gem_to_radeon_bo(gobj) container_of((gobj), struct radeon_bo, tbo.base)
@ -36,131 +36,51 @@
|
|||
|
||||
#include "radeon.h"
|
||||
|
||||
struct radeon_mn {
|
||||
struct mmu_notifier mn;
|
||||
|
||||
/* objects protected by lock */
|
||||
struct mutex lock;
|
||||
struct rb_root_cached objects;
|
||||
};
|
||||
|
||||
struct radeon_mn_node {
|
||||
struct interval_tree_node it;
|
||||
struct list_head bos;
|
||||
};
|
||||
|
||||
/**
|
||||
* radeon_mn_invalidate_range_start - callback to notify about mm change
|
||||
* radeon_mn_invalidate - callback to notify about mm change
|
||||
*
|
||||
* @mn: our notifier
|
||||
* @mn: the mm this callback is about
|
||||
* @start: start of updated range
|
||||
* @end: end of updated range
|
||||
* @range: the VMA under invalidation
|
||||
*
|
||||
* We block for all BOs between start and end to be idle and
|
||||
* unmap them by move them into system domain again.
|
||||
*/
|
||||
static int radeon_mn_invalidate_range_start(struct mmu_notifier *mn,
|
||||
const struct mmu_notifier_range *range)
|
||||
static bool radeon_mn_invalidate(struct mmu_interval_notifier *mn,
|
||||
const struct mmu_notifier_range *range,
|
||||
unsigned long cur_seq)
|
||||
{
|
||||
struct radeon_mn *rmn = container_of(mn, struct radeon_mn, mn);
|
||||
struct radeon_bo *bo = container_of(mn, struct radeon_bo, notifier);
|
||||
struct ttm_operation_ctx ctx = { false, false };
|
||||
struct interval_tree_node *it;
|
||||
unsigned long end;
|
||||
int ret = 0;
|
||||
long r;
|
||||
|
||||
/* notification is exclusive, but interval is inclusive */
|
||||
end = range->end - 1;
|
||||
if (!bo->tbo.ttm || bo->tbo.ttm->state != tt_bound)
|
||||
return true;
|
||||
|
||||
/* TODO we should be able to split locking for interval tree and
|
||||
* the tear down.
|
||||
*/
|
||||
if (mmu_notifier_range_blockable(range))
|
||||
mutex_lock(&rmn->lock);
|
||||
else if (!mutex_trylock(&rmn->lock))
|
||||
return -EAGAIN;
|
||||
if (!mmu_notifier_range_blockable(range))
|
||||
return false;
|
||||
|
||||
it = interval_tree_iter_first(&rmn->objects, range->start, end);
|
||||
while (it) {
|
||||
struct radeon_mn_node *node;
|
||||
struct radeon_bo *bo;
|
||||
long r;
|
||||
|
||||
if (!mmu_notifier_range_blockable(range)) {
|
||||
ret = -EAGAIN;
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
node = container_of(it, struct radeon_mn_node, it);
|
||||
it = interval_tree_iter_next(it, range->start, end);
|
||||
|
||||
list_for_each_entry(bo, &node->bos, mn_list) {
|
||||
|
||||
if (!bo->tbo.ttm || bo->tbo.ttm->state != tt_bound)
|
||||
continue;
|
||||
|
||||
r = radeon_bo_reserve(bo, true);
|
||||
if (r) {
|
||||
DRM_ERROR("(%ld) failed to reserve user bo\n", r);
|
||||
continue;
|
||||
}
|
||||
|
||||
r = dma_resv_wait_timeout_rcu(bo->tbo.base.resv,
|
||||
true, false, MAX_SCHEDULE_TIMEOUT);
|
||||
if (r <= 0)
|
||||
DRM_ERROR("(%ld) failed to wait for user bo\n", r);
|
||||
|
||||
radeon_ttm_placement_from_domain(bo, RADEON_GEM_DOMAIN_CPU);
|
||||
r = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
|
||||
if (r)
|
||||
DRM_ERROR("(%ld) failed to validate user bo\n", r);
|
||||
|
||||
radeon_bo_unreserve(bo);
|
||||
}
|
||||
r = radeon_bo_reserve(bo, true);
|
||||
if (r) {
|
||||
DRM_ERROR("(%ld) failed to reserve user bo\n", r);
|
||||
return true;
|
||||
}
|
||||
|
||||
out_unlock:
|
||||
mutex_unlock(&rmn->lock);
|
||||
|
||||
return ret;
|
||||
r = dma_resv_wait_timeout_rcu(bo->tbo.base.resv, true, false,
|
||||
MAX_SCHEDULE_TIMEOUT);
|
||||
if (r <= 0)
|
||||
DRM_ERROR("(%ld) failed to wait for user bo\n", r);
|
||||
|
||||
radeon_ttm_placement_from_domain(bo, RADEON_GEM_DOMAIN_CPU);
|
||||
r = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
|
||||
if (r)
|
||||
DRM_ERROR("(%ld) failed to validate user bo\n", r);
|
||||
|
||||
radeon_bo_unreserve(bo);
|
||||
return true;
|
||||
}

static void radeon_mn_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
        struct mmu_notifier_range range = {
                .mm = mm,
                .start = 0,
                .end = ULONG_MAX,
                .flags = 0,
                .event = MMU_NOTIFY_UNMAP,
        };

        radeon_mn_invalidate_range_start(mn, &range);
}

static struct mmu_notifier *radeon_mn_alloc_notifier(struct mm_struct *mm)
{
        struct radeon_mn *rmn;

        rmn = kzalloc(sizeof(*rmn), GFP_KERNEL);
        if (!rmn)
                return ERR_PTR(-ENOMEM);

        mutex_init(&rmn->lock);
        rmn->objects = RB_ROOT_CACHED;
        return &rmn->mn;
}

static void radeon_mn_free_notifier(struct mmu_notifier *mn)
{
        kfree(container_of(mn, struct radeon_mn, mn));
}

static const struct mmu_notifier_ops radeon_mn_ops = {
        .release = radeon_mn_release,
        .invalidate_range_start = radeon_mn_invalidate_range_start,
        .alloc_notifier = radeon_mn_alloc_notifier,
        .free_notifier = radeon_mn_free_notifier,
static const struct mmu_interval_notifier_ops radeon_mn_ops = {
        .invalidate = radeon_mn_invalidate,
};
|
||||
/**
|
||||
|
@ -174,51 +94,20 @@ static const struct mmu_notifier_ops radeon_mn_ops = {
|
|||
*/
|
||||
int radeon_mn_register(struct radeon_bo *bo, unsigned long addr)
|
||||
{
|
||||
unsigned long end = addr + radeon_bo_size(bo) - 1;
|
||||
struct mmu_notifier *mn;
|
||||
struct radeon_mn *rmn;
|
||||
struct radeon_mn_node *node = NULL;
|
||||
struct list_head bos;
|
||||
struct interval_tree_node *it;
|
||||
int ret;
|
||||
|
||||
mn = mmu_notifier_get(&radeon_mn_ops, current->mm);
|
||||
if (IS_ERR(mn))
|
||||
return PTR_ERR(mn);
|
||||
rmn = container_of(mn, struct radeon_mn, mn);
|
||||
|
||||
INIT_LIST_HEAD(&bos);
|
||||
|
||||
mutex_lock(&rmn->lock);
|
||||
|
||||
while ((it = interval_tree_iter_first(&rmn->objects, addr, end))) {
|
||||
kfree(node);
|
||||
node = container_of(it, struct radeon_mn_node, it);
|
||||
interval_tree_remove(&node->it, &rmn->objects);
|
||||
addr = min(it->start, addr);
|
||||
end = max(it->last, end);
|
||||
list_splice(&node->bos, &bos);
|
||||
}
|
||||
|
||||
if (!node) {
|
||||
node = kmalloc(sizeof(struct radeon_mn_node), GFP_KERNEL);
|
||||
if (!node) {
|
||||
mutex_unlock(&rmn->lock);
|
||||
return -ENOMEM;
|
||||
}
|
||||
}
|
||||
|
||||
bo->mn = rmn;
|
||||
|
||||
node->it.start = addr;
|
||||
node->it.last = end;
|
||||
INIT_LIST_HEAD(&node->bos);
|
||||
list_splice(&bos, &node->bos);
|
||||
list_add(&bo->mn_list, &node->bos);
|
||||
|
||||
interval_tree_insert(&node->it, &rmn->objects);
|
||||
|
||||
mutex_unlock(&rmn->lock);
|
||||
ret = mmu_interval_notifier_insert(&bo->notifier, current->mm, addr,
|
||||
radeon_bo_size(bo), &radeon_mn_ops);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
/*
|
||||
* FIXME: radeon appears to allow get_user_pages to run during
|
||||
* invalidate_range_start/end, which is not a safe way to read the
|
||||
* PTEs. It should use the mmu_interval_read_begin() scheme around the
|
||||
* get_user_pages to ensure that the PTEs are read properly
|
||||
*/
|
||||
mmu_interval_read_begin(&bo->notifier);
|
||||
return 0;
|
||||
}
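The scheme that FIXME refers to is the common seq-count pattern the series introduces: sample the notifier sequence, pin the pages, then re-check the sequence under the driver lock before publishing anything to the hardware. A minimal sketch (not part of this patch; struct my_object and the my_*() helpers are hypothetical, only the mmu_interval_*() and get_user_pages() calls are real API):

#include <linux/mm.h>
#include <linux/mmu_notifier.h>

struct my_object {                              /* hypothetical driver state */
        struct mmu_interval_notifier notifier;  /* registered over [start, start + npages) */
        struct mutex lock;                      /* also taken by the invalidate() callback */
        unsigned long start;
        unsigned long npages;
        struct page **pages;
};

static int my_pin_range(struct my_object *obj)
{
        unsigned long seq;
        long pinned;

again:
        seq = mmu_interval_read_begin(&obj->notifier);

        down_read(&current->mm->mmap_sem);
        pinned = get_user_pages(obj->start, obj->npages, FOLL_WRITE,
                                obj->pages, NULL);
        up_read(&current->mm->mmap_sem);
        if (pinned < 0)
                return pinned;
        if (pinned != obj->npages) {
                my_put_pages(obj, pinned);      /* hypothetical: release the partial pin */
                return -EFAULT;
        }

        mutex_lock(&obj->lock);
        if (mmu_interval_read_retry(&obj->notifier, seq)) {
                /* an invalidation ran while we were pinning; start over */
                mutex_unlock(&obj->lock);
                my_put_pages(obj, pinned);
                goto again;
        }
        my_program_device(obj);                 /* hypothetical: hand the pages to the HW */
        mutex_unlock(&obj->lock);
        return 0;
}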
@ -231,27 +120,8 @@ int radeon_mn_register(struct radeon_bo *bo, unsigned long addr)
|
|||
*/
|
||||
void radeon_mn_unregister(struct radeon_bo *bo)
|
||||
{
|
||||
struct radeon_mn *rmn = bo->mn;
|
||||
struct list_head *head;
|
||||
|
||||
if (!rmn)
|
||||
if (!bo->notifier.mm)
|
||||
return;
|
||||
|
||||
mutex_lock(&rmn->lock);
|
||||
/* save the next list entry for later */
|
||||
head = bo->mn_list.next;
|
||||
|
||||
list_del(&bo->mn_list);
|
||||
|
||||
if (list_empty(head)) {
|
||||
struct radeon_mn_node *node;
|
||||
node = container_of(head, struct radeon_mn_node, bos);
|
||||
interval_tree_remove(&node->it, &rmn->objects);
|
||||
kfree(node);
|
||||
}
|
||||
|
||||
mutex_unlock(&rmn->lock);
|
||||
|
||||
mmu_notifier_put(&rmn->mn);
|
||||
bo->mn = NULL;
|
||||
mmu_interval_notifier_remove(&bo->notifier);
|
||||
bo->notifier.mm = NULL;
|
||||
}
|
||||
|
|
|
@ -2634,7 +2634,6 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
|
|||
SET_DEVICE_OP(dev_ops, get_vf_guid);
|
||||
SET_DEVICE_OP(dev_ops, get_vf_stats);
|
||||
SET_DEVICE_OP(dev_ops, init_port);
|
||||
SET_DEVICE_OP(dev_ops, invalidate_range);
|
||||
SET_DEVICE_OP(dev_ops, iw_accept);
|
||||
SET_DEVICE_OP(dev_ops, iw_add_ref);
|
||||
SET_DEVICE_OP(dev_ops, iw_connect);
|
||||
|
|
|
@ -48,197 +48,33 @@
|
|||
|
||||
#include "uverbs.h"
|
||||
|
||||
static void ib_umem_notifier_start_account(struct ib_umem_odp *umem_odp)
|
||||
static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp,
|
||||
const struct mmu_interval_notifier_ops *ops)
|
||||
{
|
||||
mutex_lock(&umem_odp->umem_mutex);
|
||||
if (umem_odp->notifiers_count++ == 0)
|
||||
/*
|
||||
* Initialize the completion object for waiting on
|
||||
* notifiers. Since notifier_count is zero, no one should be
|
||||
* waiting right now.
|
||||
*/
|
||||
reinit_completion(&umem_odp->notifier_completion);
|
||||
mutex_unlock(&umem_odp->umem_mutex);
|
||||
}
|
||||
|
||||
static void ib_umem_notifier_end_account(struct ib_umem_odp *umem_odp)
|
||||
{
|
||||
mutex_lock(&umem_odp->umem_mutex);
|
||||
/*
|
||||
* This sequence increase will notify the QP page fault that the page
|
||||
* that is going to be mapped in the spte could have been freed.
|
||||
*/
|
||||
++umem_odp->notifiers_seq;
|
||||
if (--umem_odp->notifiers_count == 0)
|
||||
complete_all(&umem_odp->notifier_completion);
|
||||
mutex_unlock(&umem_odp->umem_mutex);
|
||||
}
|
||||
|
||||
static void ib_umem_notifier_release(struct mmu_notifier *mn,
|
||||
struct mm_struct *mm)
|
||||
{
|
||||
struct ib_ucontext_per_mm *per_mm =
|
||||
container_of(mn, struct ib_ucontext_per_mm, mn);
|
||||
struct rb_node *node;
|
||||
|
||||
down_read(&per_mm->umem_rwsem);
|
||||
if (!per_mm->mn.users)
|
||||
goto out;
|
||||
|
||||
for (node = rb_first_cached(&per_mm->umem_tree); node;
|
||||
node = rb_next(node)) {
|
||||
struct ib_umem_odp *umem_odp =
|
||||
rb_entry(node, struct ib_umem_odp, interval_tree.rb);
|
||||
|
||||
/*
|
||||
* Increase the number of notifiers running, to prevent any
|
||||
* further fault handling on this MR.
|
||||
*/
|
||||
ib_umem_notifier_start_account(umem_odp);
|
||||
complete_all(&umem_odp->notifier_completion);
|
||||
umem_odp->umem.ibdev->ops.invalidate_range(
|
||||
umem_odp, ib_umem_start(umem_odp),
|
||||
ib_umem_end(umem_odp));
|
||||
}
|
||||
|
||||
out:
|
||||
up_read(&per_mm->umem_rwsem);
|
||||
}
|
||||
|
||||
static int invalidate_range_start_trampoline(struct ib_umem_odp *item,
|
||||
u64 start, u64 end, void *cookie)
|
||||
{
|
||||
ib_umem_notifier_start_account(item);
|
||||
item->umem.ibdev->ops.invalidate_range(item, start, end);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
|
||||
const struct mmu_notifier_range *range)
|
||||
{
|
||||
struct ib_ucontext_per_mm *per_mm =
|
||||
container_of(mn, struct ib_ucontext_per_mm, mn);
|
||||
int rc;
|
||||
|
||||
if (mmu_notifier_range_blockable(range))
|
||||
down_read(&per_mm->umem_rwsem);
|
||||
else if (!down_read_trylock(&per_mm->umem_rwsem))
|
||||
return -EAGAIN;
|
||||
|
||||
if (!per_mm->mn.users) {
|
||||
up_read(&per_mm->umem_rwsem);
|
||||
/*
|
||||
* At this point users is permanently zero and visible to this
|
||||
* CPU without a lock, that fact is relied on to skip the unlock
|
||||
* in range_end.
|
||||
*/
|
||||
return 0;
|
||||
}
|
||||
|
||||
rc = rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start,
|
||||
range->end,
|
||||
invalidate_range_start_trampoline,
|
||||
mmu_notifier_range_blockable(range),
|
||||
NULL);
|
||||
if (rc)
|
||||
up_read(&per_mm->umem_rwsem);
|
||||
return rc;
|
||||
}
|
||||
|
||||
static int invalidate_range_end_trampoline(struct ib_umem_odp *item, u64 start,
|
||||
u64 end, void *cookie)
|
||||
{
|
||||
ib_umem_notifier_end_account(item);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn,
|
||||
const struct mmu_notifier_range *range)
|
||||
{
|
||||
struct ib_ucontext_per_mm *per_mm =
|
||||
container_of(mn, struct ib_ucontext_per_mm, mn);
|
||||
|
||||
if (unlikely(!per_mm->mn.users))
|
||||
return;
|
||||
|
||||
rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start,
|
||||
range->end,
|
||||
invalidate_range_end_trampoline, true, NULL);
|
||||
up_read(&per_mm->umem_rwsem);
|
||||
}
|
||||
|
||||
static struct mmu_notifier *ib_umem_alloc_notifier(struct mm_struct *mm)
|
||||
{
|
||||
struct ib_ucontext_per_mm *per_mm;
|
||||
|
||||
per_mm = kzalloc(sizeof(*per_mm), GFP_KERNEL);
|
||||
if (!per_mm)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
per_mm->umem_tree = RB_ROOT_CACHED;
|
||||
init_rwsem(&per_mm->umem_rwsem);
|
||||
|
||||
WARN_ON(mm != current->mm);
|
||||
rcu_read_lock();
|
||||
per_mm->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
|
||||
rcu_read_unlock();
|
||||
return &per_mm->mn;
|
||||
}
|
||||
|
||||
static void ib_umem_free_notifier(struct mmu_notifier *mn)
|
||||
{
|
||||
struct ib_ucontext_per_mm *per_mm =
|
||||
container_of(mn, struct ib_ucontext_per_mm, mn);
|
||||
|
||||
WARN_ON(!RB_EMPTY_ROOT(&per_mm->umem_tree.rb_root));
|
||||
|
||||
put_pid(per_mm->tgid);
|
||||
kfree(per_mm);
|
||||
}
|
||||
|
||||
static const struct mmu_notifier_ops ib_umem_notifiers = {
|
||||
.release = ib_umem_notifier_release,
|
||||
.invalidate_range_start = ib_umem_notifier_invalidate_range_start,
|
||||
.invalidate_range_end = ib_umem_notifier_invalidate_range_end,
|
||||
.alloc_notifier = ib_umem_alloc_notifier,
|
||||
.free_notifier = ib_umem_free_notifier,
|
||||
};
|
||||
|
||||
static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp)
|
||||
{
|
||||
struct ib_ucontext_per_mm *per_mm;
|
||||
struct mmu_notifier *mn;
|
||||
int ret;
|
||||
|
||||
umem_odp->umem.is_odp = 1;
|
||||
mutex_init(&umem_odp->umem_mutex);
|
||||
|
||||
if (!umem_odp->is_implicit_odp) {
|
||||
size_t page_size = 1UL << umem_odp->page_shift;
|
||||
unsigned long start;
|
||||
unsigned long end;
|
||||
size_t pages;
|
||||
|
||||
umem_odp->interval_tree.start =
|
||||
ALIGN_DOWN(umem_odp->umem.address, page_size);
|
||||
start = ALIGN_DOWN(umem_odp->umem.address, page_size);
|
||||
if (check_add_overflow(umem_odp->umem.address,
|
||||
(unsigned long)umem_odp->umem.length,
|
||||
&umem_odp->interval_tree.last))
|
||||
&end))
|
||||
return -EOVERFLOW;
|
||||
umem_odp->interval_tree.last =
|
||||
ALIGN(umem_odp->interval_tree.last, page_size);
|
||||
if (unlikely(umem_odp->interval_tree.last < page_size))
|
||||
end = ALIGN(end, page_size);
|
||||
if (unlikely(end < page_size))
|
||||
return -EOVERFLOW;
|
||||
|
||||
pages = (umem_odp->interval_tree.last -
|
||||
umem_odp->interval_tree.start) >>
|
||||
umem_odp->page_shift;
|
||||
pages = (end - start) >> umem_odp->page_shift;
|
||||
if (!pages)
|
||||
return -EINVAL;
|
||||
|
||||
/*
|
||||
* Note that the representation of the intervals in the
|
||||
* interval tree considers the ending point as contained in
|
||||
* the interval.
|
||||
*/
|
||||
umem_odp->interval_tree.last--;
|
||||
|
||||
umem_odp->page_list = kvcalloc(
|
||||
pages, sizeof(*umem_odp->page_list), GFP_KERNEL);
|
||||
if (!umem_odp->page_list)
|
||||
|
@ -250,26 +86,13 @@ static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp)
|
|||
ret = -ENOMEM;
|
||||
goto out_page_list;
|
||||
}
|
||||
}
|
||||
|
||||
mn = mmu_notifier_get(&ib_umem_notifiers, umem_odp->umem.owning_mm);
|
||||
if (IS_ERR(mn)) {
|
||||
ret = PTR_ERR(mn);
|
||||
goto out_dma_list;
|
||||
ret = mmu_interval_notifier_insert(&umem_odp->notifier,
|
||||
umem_odp->umem.owning_mm,
|
||||
start, end - start, ops);
|
||||
if (ret)
|
||||
goto out_dma_list;
|
||||
}
|
||||
umem_odp->per_mm = per_mm =
|
||||
container_of(mn, struct ib_ucontext_per_mm, mn);
|
||||
|
||||
mutex_init(&umem_odp->umem_mutex);
|
||||
init_completion(&umem_odp->notifier_completion);
|
||||
|
||||
if (!umem_odp->is_implicit_odp) {
|
||||
down_write(&per_mm->umem_rwsem);
|
||||
interval_tree_insert(&umem_odp->interval_tree,
|
||||
&per_mm->umem_tree);
|
||||
up_write(&per_mm->umem_rwsem);
|
||||
}
|
||||
mmgrab(umem_odp->umem.owning_mm);
|
||||
|
||||
return 0;
|
||||
|
||||
|
@ -305,8 +128,6 @@ struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_udata *udata,
|
|||
|
||||
if (!context)
|
||||
return ERR_PTR(-EIO);
|
||||
if (WARN_ON_ONCE(!context->device->ops.invalidate_range))
|
||||
return ERR_PTR(-EINVAL);
|
||||
|
||||
umem_odp = kzalloc(sizeof(*umem_odp), GFP_KERNEL);
|
||||
if (!umem_odp)
|
||||
|
@ -318,8 +139,10 @@ struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_udata *udata,
|
|||
umem_odp->is_implicit_odp = 1;
|
||||
umem_odp->page_shift = PAGE_SHIFT;
|
||||
|
||||
ret = ib_init_umem_odp(umem_odp);
|
||||
umem_odp->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
|
||||
ret = ib_init_umem_odp(umem_odp, NULL);
|
||||
if (ret) {
|
||||
put_pid(umem_odp->tgid);
|
||||
kfree(umem_odp);
|
||||
return ERR_PTR(ret);
|
||||
}
|
||||
|
@ -336,8 +159,10 @@ EXPORT_SYMBOL(ib_umem_odp_alloc_implicit);
|
|||
* @addr: The starting userspace VA
|
||||
* @size: The length of the userspace VA
|
||||
*/
|
||||
struct ib_umem_odp *ib_umem_odp_alloc_child(struct ib_umem_odp *root,
|
||||
unsigned long addr, size_t size)
|
||||
struct ib_umem_odp *
|
||||
ib_umem_odp_alloc_child(struct ib_umem_odp *root, unsigned long addr,
|
||||
size_t size,
|
||||
const struct mmu_interval_notifier_ops *ops)
|
||||
{
|
||||
/*
|
||||
* Caller must ensure that root cannot be freed during the call to
|
||||
|
@ -360,9 +185,12 @@ struct ib_umem_odp *ib_umem_odp_alloc_child(struct ib_umem_odp *root,
|
|||
umem->writable = root->umem.writable;
|
||||
umem->owning_mm = root->umem.owning_mm;
|
||||
odp_data->page_shift = PAGE_SHIFT;
|
||||
odp_data->notifier.ops = ops;
|
||||
|
||||
ret = ib_init_umem_odp(odp_data);
|
||||
odp_data->tgid = get_pid(root->tgid);
|
||||
ret = ib_init_umem_odp(odp_data, ops);
|
||||
if (ret) {
|
||||
put_pid(odp_data->tgid);
|
||||
kfree(odp_data);
|
||||
return ERR_PTR(ret);
|
||||
}
|
||||
|
@ -383,7 +211,8 @@ EXPORT_SYMBOL(ib_umem_odp_alloc_child);
|
|||
* conjunction with MMU notifiers.
|
||||
*/
|
||||
struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr,
|
||||
size_t size, int access)
|
||||
size_t size, int access,
|
||||
const struct mmu_interval_notifier_ops *ops)
|
||||
{
|
||||
struct ib_umem_odp *umem_odp;
|
||||
struct ib_ucontext *context;
|
||||
|
@ -398,8 +227,7 @@ struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr,
|
|||
if (!context)
|
||||
return ERR_PTR(-EIO);
|
||||
|
||||
if (WARN_ON_ONCE(!(access & IB_ACCESS_ON_DEMAND)) ||
|
||||
WARN_ON_ONCE(!context->device->ops.invalidate_range))
|
||||
if (WARN_ON_ONCE(!(access & IB_ACCESS_ON_DEMAND)))
|
||||
return ERR_PTR(-EINVAL);
|
||||
|
||||
umem_odp = kzalloc(sizeof(struct ib_umem_odp), GFP_KERNEL);
|
||||
|
@ -411,6 +239,7 @@ struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr,
|
|||
umem_odp->umem.address = addr;
|
||||
umem_odp->umem.writable = ib_access_writable(access);
|
||||
umem_odp->umem.owning_mm = mm = current->mm;
|
||||
umem_odp->notifier.ops = ops;
|
||||
|
||||
umem_odp->page_shift = PAGE_SHIFT;
|
||||
if (access & IB_ACCESS_HUGETLB) {
|
||||
|
@ -429,11 +258,14 @@ struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr,
|
|||
up_read(&mm->mmap_sem);
|
||||
}
|
||||
|
||||
ret = ib_init_umem_odp(umem_odp);
|
||||
umem_odp->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
|
||||
ret = ib_init_umem_odp(umem_odp, ops);
|
||||
if (ret)
|
||||
goto err_free;
|
||||
goto err_put_pid;
|
||||
return umem_odp;
|
||||
|
||||
err_put_pid:
|
||||
put_pid(umem_odp->tgid);
|
||||
err_free:
|
||||
kfree(umem_odp);
|
||||
return ERR_PTR(ret);
|
||||
|
@ -442,8 +274,6 @@ EXPORT_SYMBOL(ib_umem_odp_get);
|
|||
|
||||
void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
|
||||
{
|
||||
struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm;
|
||||
|
||||
/*
|
||||
* Ensure that no more pages are mapped in the umem.
|
||||
*
|
||||
|
@ -455,28 +285,11 @@ void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
|
|||
ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
|
||||
ib_umem_end(umem_odp));
|
||||
mutex_unlock(&umem_odp->umem_mutex);
|
||||
mmu_interval_notifier_remove(&umem_odp->notifier);
|
||||
kvfree(umem_odp->dma_list);
|
||||
kvfree(umem_odp->page_list);
|
||||
put_pid(umem_odp->tgid);
|
||||
}
|
||||
|
||||
down_write(&per_mm->umem_rwsem);
|
||||
if (!umem_odp->is_implicit_odp) {
|
||||
interval_tree_remove(&umem_odp->interval_tree,
|
||||
&per_mm->umem_tree);
|
||||
complete_all(&umem_odp->notifier_completion);
|
||||
}
|
||||
/*
|
||||
* NOTE! mmu_notifier_unregister() can happen between a start/end
|
||||
* callback, resulting in a missing end, and thus an unbalanced
|
||||
* lock. This doesn't really matter to us since we are about to kfree
|
||||
* the memory that holds the lock, however LOCKDEP doesn't like this.
|
||||
* Thus we call the mmu_notifier_put under the rwsem and test the
|
||||
* internal users count to reliably see if we are past this point.
|
||||
*/
|
||||
mmu_notifier_put(&per_mm->mn);
|
||||
up_write(&per_mm->umem_rwsem);
|
||||
|
||||
mmdrop(umem_odp->umem.owning_mm);
|
||||
kfree(umem_odp);
|
||||
}
|
||||
EXPORT_SYMBOL(ib_umem_odp_release);
|
||||
|
@ -501,7 +314,7 @@ EXPORT_SYMBOL(ib_umem_odp_release);
|
|||
*/
|
||||
static int ib_umem_odp_map_dma_single_page(
|
||||
struct ib_umem_odp *umem_odp,
|
||||
int page_index,
|
||||
unsigned int page_index,
|
||||
struct page *page,
|
||||
u64 access_mask,
|
||||
unsigned long current_seq)
|
||||
|
@ -510,12 +323,7 @@ static int ib_umem_odp_map_dma_single_page(
|
|||
dma_addr_t dma_addr;
|
||||
int ret = 0;
|
||||
|
||||
/*
|
||||
* Note: we avoid writing if seq is different from the initial seq, to
|
||||
* handle case of a racing notifier. This check also allows us to bail
|
||||
* early if we have a notifier running in parallel with us.
|
||||
*/
|
||||
if (ib_umem_mmu_notifier_retry(umem_odp, current_seq)) {
|
||||
if (mmu_interval_check_retry(&umem_odp->notifier, current_seq)) {
|
||||
ret = -EAGAIN;
|
||||
goto out;
|
||||
}
|
||||
|
@ -618,7 +426,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
|
|||
* existing beyond the lifetime of the originating process.. Presumably
|
||||
* mmget_not_zero will fail in this case.
|
||||
*/
|
||||
owning_process = get_pid_task(umem_odp->per_mm->tgid, PIDTYPE_PID);
|
||||
owning_process = get_pid_task(umem_odp->tgid, PIDTYPE_PID);
|
||||
if (!owning_process || !mmget_not_zero(owning_mm)) {
|
||||
ret = -EINVAL;
|
||||
goto out_put_task;
|
||||
|
@ -762,32 +570,3 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
|
|||
}
|
||||
}
|
||||
EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);
|
||||
|
||||
/* @last is not a part of the interval. See comment for function
|
||||
* node_last.
|
||||
*/
|
||||
int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root,
|
||||
u64 start, u64 last,
|
||||
umem_call_back cb,
|
||||
bool blockable,
|
||||
void *cookie)
|
||||
{
|
||||
int ret_val = 0;
|
||||
struct interval_tree_node *node, *next;
|
||||
struct ib_umem_odp *umem;
|
||||
|
||||
if (unlikely(start == last))
|
||||
return ret_val;
|
||||
|
||||
for (node = interval_tree_iter_first(root, start, last - 1);
|
||||
node; node = next) {
|
||||
/* TODO move the blockable decision up to the callback */
|
||||
if (!blockable)
|
||||
return -EAGAIN;
|
||||
next = interval_tree_iter_next(node, start, last - 1);
|
||||
umem = container_of(node, struct ib_umem_odp, interval_tree);
|
||||
ret_val = cb(umem, start, last, cookie) || ret_val;
|
||||
}
|
||||
|
||||
return ret_val;
|
||||
}
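With ib_init_umem_odp() taking the ops directly, an ODP-capable driver now hands its mmu_interval_notifier_ops to ib_umem_odp_get() instead of filling in the (now removed) per-device ->invalidate_range method; mlx5 does exactly that further below. A hypothetical caller, for illustration only:

#include <rdma/ib_umem_odp.h>

/* assumed to be defined by the driver, see the mlx5_mn_ops hunk below */
extern const struct mmu_interval_notifier_ops my_mn_ops;

static struct ib_umem_odp *my_get_odp_umem(struct ib_udata *udata,
                                           unsigned long addr, size_t size,
                                           int access)
{
        /* ib_umem_odp_release() later tears the interval notifier down again */
        return ib_umem_odp_get(udata, addr, size,
                               access | IB_ACCESS_ON_DEMAND, &my_mn_ops);
}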
@ -1138,7 +1138,7 @@ static int get_ctxt_info(struct hfi1_filedata *fd, unsigned long arg, u32 len)
|
|||
HFI1_CAP_UGET_MASK(uctxt->flags, MASK) |
|
||||
HFI1_CAP_KGET_MASK(uctxt->flags, K2U);
|
||||
/* adjust flag if this fd is not able to cache */
|
||||
if (!fd->handler)
|
||||
if (!fd->use_mn)
|
||||
cinfo.runtime_flags |= HFI1_CAP_TID_UNMAP; /* no caching */
|
||||
|
||||
cinfo.num_active = hfi1_count_active_units();
|
||||
|
|
|
@ -1444,7 +1444,7 @@ struct hfi1_filedata {
|
|||
/* for cpu affinity; -1 if none */
|
||||
int rec_cpu_num;
|
||||
u32 tid_n_pinned;
|
||||
struct mmu_rb_handler *handler;
|
||||
bool use_mn;
|
||||
struct tid_rb_node **entry_to_rb;
|
||||
spinlock_t tid_lock; /* protect tid_[limit,used] counters */
|
||||
u32 tid_limit;
|
||||
|
|
|
@ -59,11 +59,11 @@ static int set_rcvarray_entry(struct hfi1_filedata *fd,
|
|||
struct tid_user_buf *tbuf,
|
||||
u32 rcventry, struct tid_group *grp,
|
||||
u16 pageidx, unsigned int npages);
|
||||
static int tid_rb_insert(void *arg, struct mmu_rb_node *node);
|
||||
static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
|
||||
struct tid_rb_node *tnode);
|
||||
static void tid_rb_remove(void *arg, struct mmu_rb_node *node);
|
||||
static int tid_rb_invalidate(void *arg, struct mmu_rb_node *mnode);
|
||||
static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
|
||||
const struct mmu_notifier_range *range,
|
||||
unsigned long cur_seq);
|
||||
static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *,
|
||||
struct tid_group *grp,
|
||||
unsigned int start, u16 count,
|
||||
|
@ -73,10 +73,8 @@ static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
|
|||
struct tid_group **grp);
|
||||
static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node);
|
||||
|
||||
static struct mmu_rb_ops tid_rb_ops = {
|
||||
.insert = tid_rb_insert,
|
||||
.remove = tid_rb_remove,
|
||||
.invalidate = tid_rb_invalidate
|
||||
static const struct mmu_interval_notifier_ops tid_mn_ops = {
|
||||
.invalidate = tid_rb_invalidate,
|
||||
};
|
||||
|
||||
/*
|
||||
|
@ -87,7 +85,6 @@ static struct mmu_rb_ops tid_rb_ops = {
|
|||
int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd,
|
||||
struct hfi1_ctxtdata *uctxt)
|
||||
{
|
||||
struct hfi1_devdata *dd = uctxt->dd;
|
||||
int ret = 0;
|
||||
|
||||
spin_lock_init(&fd->tid_lock);
|
||||
|
@ -109,20 +106,7 @@ int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd,
|
|||
fd->entry_to_rb = NULL;
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
/*
|
||||
* Register MMU notifier callbacks. If the registration
|
||||
* fails, continue without TID caching for this context.
|
||||
*/
|
||||
ret = hfi1_mmu_rb_register(fd, fd->mm, &tid_rb_ops,
|
||||
dd->pport->hfi1_wq,
|
||||
&fd->handler);
|
||||
if (ret) {
|
||||
dd_dev_info(dd,
|
||||
"Failed MMU notifier registration %d\n",
|
||||
ret);
|
||||
ret = 0;
|
||||
}
|
||||
fd->use_mn = true;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -139,7 +123,7 @@ int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd,
|
|||
* init.
|
||||
*/
|
||||
spin_lock(&fd->tid_lock);
|
||||
if (uctxt->subctxt_cnt && fd->handler) {
|
||||
if (uctxt->subctxt_cnt && fd->use_mn) {
|
||||
u16 remainder;
|
||||
|
||||
fd->tid_limit = uctxt->expected_count / uctxt->subctxt_cnt;
|
||||
|
@ -158,18 +142,10 @@ void hfi1_user_exp_rcv_free(struct hfi1_filedata *fd)
|
|||
{
|
||||
struct hfi1_ctxtdata *uctxt = fd->uctxt;
|
||||
|
||||
/*
|
||||
* The notifier would have been removed when the process'es mm
|
||||
* was freed.
|
||||
*/
|
||||
if (fd->handler) {
|
||||
hfi1_mmu_rb_unregister(fd->handler);
|
||||
} else {
|
||||
if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list))
|
||||
unlock_exp_tids(uctxt, &uctxt->tid_full_list, fd);
|
||||
if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list))
|
||||
unlock_exp_tids(uctxt, &uctxt->tid_used_list, fd);
|
||||
}
|
||||
if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list))
|
||||
unlock_exp_tids(uctxt, &uctxt->tid_full_list, fd);
|
||||
if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list))
|
||||
unlock_exp_tids(uctxt, &uctxt->tid_used_list, fd);
|
||||
|
||||
kfree(fd->invalid_tids);
|
||||
fd->invalid_tids = NULL;
|
||||
|
@ -201,7 +177,7 @@ static void unpin_rcv_pages(struct hfi1_filedata *fd,
|
|||
|
||||
if (mapped) {
|
||||
pci_unmap_single(dd->pcidev, node->dma_addr,
|
||||
node->mmu.len, PCI_DMA_FROMDEVICE);
|
||||
node->npages * PAGE_SIZE, PCI_DMA_FROMDEVICE);
|
||||
pages = &node->pages[idx];
|
||||
} else {
|
||||
pages = &tidbuf->pages[idx];
|
||||
|
@ -777,8 +753,7 @@ static int set_rcvarray_entry(struct hfi1_filedata *fd,
|
|||
return -EFAULT;
|
||||
}
|
||||
|
||||
node->mmu.addr = tbuf->vaddr + (pageidx * PAGE_SIZE);
|
||||
node->mmu.len = npages * PAGE_SIZE;
|
||||
node->fdata = fd;
|
||||
node->phys = page_to_phys(pages[0]);
|
||||
node->npages = npages;
|
||||
node->rcventry = rcventry;
|
||||
|
@ -787,23 +762,35 @@ static int set_rcvarray_entry(struct hfi1_filedata *fd,
|
|||
node->freed = false;
|
||||
memcpy(node->pages, pages, sizeof(struct page *) * npages);
|
||||
|
||||
if (!fd->handler)
|
||||
ret = tid_rb_insert(fd, &node->mmu);
|
||||
else
|
||||
ret = hfi1_mmu_rb_insert(fd->handler, &node->mmu);
|
||||
|
||||
if (ret) {
|
||||
hfi1_cdbg(TID, "Failed to insert RB node %u 0x%lx, 0x%lx %d",
|
||||
node->rcventry, node->mmu.addr, node->phys, ret);
|
||||
pci_unmap_single(dd->pcidev, phys, npages * PAGE_SIZE,
|
||||
PCI_DMA_FROMDEVICE);
|
||||
kfree(node);
|
||||
return -EFAULT;
|
||||
if (fd->use_mn) {
|
||||
ret = mmu_interval_notifier_insert(
|
||||
&node->notifier, fd->mm,
|
||||
tbuf->vaddr + (pageidx * PAGE_SIZE), npages * PAGE_SIZE,
|
||||
&tid_mn_ops);
|
||||
if (ret)
|
||||
goto out_unmap;
|
||||
/*
|
||||
* FIXME: This is in the wrong order, the notifier should be
|
||||
* established before the pages are pinned by pin_rcv_pages.
|
||||
*/
|
||||
mmu_interval_read_begin(&node->notifier);
|
||||
}
|
||||
fd->entry_to_rb[node->rcventry - uctxt->expected_base] = node;
|
||||
|
||||
hfi1_put_tid(dd, rcventry, PT_EXPECTED, phys, ilog2(npages) + 1);
|
||||
trace_hfi1_exp_tid_reg(uctxt->ctxt, fd->subctxt, rcventry, npages,
|
||||
node->mmu.addr, node->phys, phys);
|
||||
node->notifier.interval_tree.start, node->phys,
|
||||
phys);
|
||||
return 0;
|
||||
|
||||
out_unmap:
|
||||
hfi1_cdbg(TID, "Failed to insert RB node %u 0x%lx, 0x%lx %d",
|
||||
node->rcventry, node->notifier.interval_tree.start,
|
||||
node->phys, ret);
|
||||
pci_unmap_single(dd->pcidev, phys, npages * PAGE_SIZE,
|
||||
PCI_DMA_FROMDEVICE);
|
||||
kfree(node);
|
||||
return -EFAULT;
|
||||
}
|
||||
|
||||
static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
|
||||
|
@ -833,10 +820,9 @@ static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
|
|||
if (grp)
|
||||
*grp = node->grp;
|
||||
|
||||
if (!fd->handler)
|
||||
cacheless_tid_rb_remove(fd, node);
|
||||
else
|
||||
hfi1_mmu_rb_remove(fd->handler, &node->mmu);
|
||||
if (fd->use_mn)
|
||||
mmu_interval_notifier_remove(&node->notifier);
|
||||
cacheless_tid_rb_remove(fd, node);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -847,7 +833,8 @@ static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
|
|||
struct hfi1_devdata *dd = uctxt->dd;
|
||||
|
||||
trace_hfi1_exp_tid_unreg(uctxt->ctxt, fd->subctxt, node->rcventry,
|
||||
node->npages, node->mmu.addr, node->phys,
|
||||
node->npages,
|
||||
node->notifier.interval_tree.start, node->phys,
|
||||
node->dma_addr);
|
||||
|
||||
/*
|
||||
|
@ -894,30 +881,29 @@ static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
|
|||
if (!node || node->rcventry != rcventry)
|
||||
continue;
|
||||
|
||||
if (fd->use_mn)
|
||||
mmu_interval_notifier_remove(
|
||||
&node->notifier);
|
||||
cacheless_tid_rb_remove(fd, node);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Always return 0 from this function. A non-zero return indicates that the
|
||||
* remove operation will be called and that memory should be unpinned.
|
||||
* However, the driver cannot unpin out from under PSM. Instead, retain the
|
||||
* memory (by returning 0) and inform PSM that the memory is going away. PSM
|
||||
* will call back later when it has removed the memory from its list.
|
||||
*/
|
||||
static int tid_rb_invalidate(void *arg, struct mmu_rb_node *mnode)
|
||||
static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
|
||||
const struct mmu_notifier_range *range,
|
||||
unsigned long cur_seq)
|
||||
{
|
||||
struct hfi1_filedata *fdata = arg;
|
||||
struct hfi1_ctxtdata *uctxt = fdata->uctxt;
|
||||
struct tid_rb_node *node =
|
||||
container_of(mnode, struct tid_rb_node, mmu);
|
||||
container_of(mni, struct tid_rb_node, notifier);
|
||||
struct hfi1_filedata *fdata = node->fdata;
|
||||
struct hfi1_ctxtdata *uctxt = fdata->uctxt;
|
||||
|
||||
if (node->freed)
|
||||
return 0;
|
||||
return true;
|
||||
|
||||
trace_hfi1_exp_tid_inval(uctxt->ctxt, fdata->subctxt, node->mmu.addr,
|
||||
trace_hfi1_exp_tid_inval(uctxt->ctxt, fdata->subctxt,
|
||||
node->notifier.interval_tree.start,
|
||||
node->rcventry, node->npages, node->dma_addr);
|
||||
node->freed = true;
|
||||
|
||||
|
@ -946,18 +932,7 @@ static int tid_rb_invalidate(void *arg, struct mmu_rb_node *mnode)
|
|||
fdata->invalid_tid_idx++;
|
||||
}
|
||||
spin_unlock(&fdata->invalid_lock);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int tid_rb_insert(void *arg, struct mmu_rb_node *node)
|
||||
{
|
||||
struct hfi1_filedata *fdata = arg;
|
||||
struct tid_rb_node *tnode =
|
||||
container_of(node, struct tid_rb_node, mmu);
|
||||
u32 base = fdata->uctxt->expected_base;
|
||||
|
||||
fdata->entry_to_rb[tnode->rcventry - base] = tnode;
|
||||
return 0;
|
||||
return true;
|
||||
}
|
||||
|
||||
static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
|
||||
|
@ -968,12 +943,3 @@ static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
|
|||
fdata->entry_to_rb[tnode->rcventry - base] = NULL;
|
||||
clear_tid_node(fdata, tnode);
|
||||
}
|
||||
|
||||
static void tid_rb_remove(void *arg, struct mmu_rb_node *node)
|
||||
{
|
||||
struct hfi1_filedata *fdata = arg;
|
||||
struct tid_rb_node *tnode =
|
||||
container_of(node, struct tid_rb_node, mmu);
|
||||
|
||||
cacheless_tid_rb_remove(fdata, tnode);
|
||||
}
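What hfi1 ends up with is the per-object shape the new API is designed for: the notifier lives inside the object it guards, is inserted when the object is set up and removed (a sleeping call) before the object is freed, and the callback recovers the object with container_of(). Roughly, with hypothetical my_* names and locking omitted for brevity:

#include <linux/mmu_notifier.h>
#include <linux/slab.h>

struct my_node {                                /* hypothetical per-buffer state */
        struct mmu_interval_notifier notifier;
        struct page **pages;
        unsigned int npages;
        bool stale;
};

static bool my_node_invalidate(struct mmu_interval_notifier *mni,
                               const struct mmu_notifier_range *range,
                               unsigned long cur_seq)
{
        struct my_node *node = container_of(mni, struct my_node, notifier);

        /* mark the mapping unusable; actual unpinning happens at teardown */
        node->stale = true;
        return true;
}

static const struct mmu_interval_notifier_ops my_node_ops = {
        .invalidate = my_node_invalidate,
};

static int my_node_setup(struct my_node *node, struct mm_struct *mm,
                         unsigned long vaddr, unsigned long len)
{
        return mmu_interval_notifier_insert(&node->notifier, mm, vaddr, len,
                                            &my_node_ops);
}

static void my_node_teardown(struct my_node *node)
{
        mmu_interval_notifier_remove(&node->notifier);  /* may sleep */
        kfree(node);
}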
@ -65,7 +65,8 @@ struct tid_user_buf {
|
|||
};
|
||||
|
||||
struct tid_rb_node {
|
||||
struct mmu_rb_node mmu;
|
||||
struct mmu_interval_notifier notifier;
|
||||
struct hfi1_filedata *fdata;
|
||||
unsigned long phys;
|
||||
struct tid_group *grp;
|
||||
u32 rcventry;
|
||||
|
|
|
@ -1258,8 +1258,6 @@ int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev);
|
|||
void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev);
|
||||
int __init mlx5_ib_odp_init(void);
|
||||
void mlx5_ib_odp_cleanup(void);
|
||||
void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
|
||||
unsigned long end);
|
||||
void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent);
|
||||
void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
|
||||
size_t nentries, struct mlx5_ib_mr *mr, int flags);
|
||||
|
@ -1289,11 +1287,10 @@ mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
|
|||
{
|
||||
return -EOPNOTSUPP;
|
||||
}
|
||||
static inline void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp,
|
||||
unsigned long start,
|
||||
unsigned long end){};
|
||||
#endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
|
||||
|
||||
extern const struct mmu_interval_notifier_ops mlx5_mn_ops;
|
||||
|
||||
/* Needed for rep profile */
|
||||
void __mlx5_ib_remove(struct mlx5_ib_dev *dev,
|
||||
const struct mlx5_ib_profile *profile,
|
||||
|
|
|
@ -749,7 +749,8 @@ static int mr_umem_get(struct mlx5_ib_dev *dev, struct ib_udata *udata,
|
|||
if (access_flags & IB_ACCESS_ON_DEMAND) {
|
||||
struct ib_umem_odp *odp;
|
||||
|
||||
odp = ib_umem_odp_get(udata, start, length, access_flags);
|
||||
odp = ib_umem_odp_get(udata, start, length, access_flags,
|
||||
&mlx5_mn_ops);
|
||||
if (IS_ERR(odp)) {
|
||||
mlx5_ib_dbg(dev, "umem get failed (%ld)\n",
|
||||
PTR_ERR(odp));
|
||||
|
|
|
@ -241,18 +241,27 @@ static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr)
|
|||
xa_unlock(&imr->implicit_children);
|
||||
}
|
||||
|
||||
void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
|
||||
unsigned long end)
|
||||
static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni,
|
||||
const struct mmu_notifier_range *range,
|
||||
unsigned long cur_seq)
|
||||
{
|
||||
struct ib_umem_odp *umem_odp =
|
||||
container_of(mni, struct ib_umem_odp, notifier);
|
||||
struct mlx5_ib_mr *mr;
|
||||
const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT /
|
||||
sizeof(struct mlx5_mtt)) - 1;
|
||||
u64 idx = 0, blk_start_idx = 0;
|
||||
u64 invalidations = 0;
|
||||
unsigned long start;
|
||||
unsigned long end;
|
||||
int in_block = 0;
|
||||
u64 addr;
|
||||
|
||||
if (!mmu_notifier_range_blockable(range))
|
||||
return false;
|
||||
|
||||
mutex_lock(&umem_odp->umem_mutex);
|
||||
mmu_interval_set_seq(mni, cur_seq);
|
||||
/*
|
||||
* If npages is zero then umem_odp->private may not be setup yet. This
|
||||
* does not complete until after the first page is mapped for DMA.
|
||||
|
@ -261,8 +270,8 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
|
|||
goto out;
|
||||
mr = umem_odp->private;
|
||||
|
||||
start = max_t(u64, ib_umem_start(umem_odp), start);
|
||||
end = min_t(u64, ib_umem_end(umem_odp), end);
|
||||
start = max_t(u64, ib_umem_start(umem_odp), range->start);
|
||||
end = min_t(u64, ib_umem_end(umem_odp), range->end);
|
||||
|
||||
/*
|
||||
* Iteration one - zap the HW's MTTs. The notifiers_count ensures that
|
||||
|
@ -319,8 +328,13 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
|
|||
destroy_unused_implicit_child_mr(mr);
|
||||
out:
|
||||
mutex_unlock(&umem_odp->umem_mutex);
|
||||
return true;
|
||||
}
|
||||
|
||||
const struct mmu_interval_notifier_ops mlx5_mn_ops = {
|
||||
.invalidate = mlx5_ib_invalidate_range,
|
||||
};
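The callback above is a good template for the contract every invalidate() implementation has to meet: return false instead of sleeping when the range is not blockable, and bump the sequence with mmu_interval_set_seq() under the same lock the page-fault side uses for mmu_interval_read_retry(). Distilled into a sketch (my_* names are hypothetical):

#include <linux/mmu_notifier.h>

struct my_ctx {                         /* hypothetical driver state */
        struct mmu_interval_notifier notifier;
        struct mutex lock;              /* also held around read_retry() */
};

static bool my_ctx_invalidate(struct mmu_interval_notifier *mni,
                              const struct mmu_notifier_range *range,
                              unsigned long cur_seq)
{
        struct my_ctx *ctx = container_of(mni, struct my_ctx, notifier);

        if (!mmu_notifier_range_blockable(range))
                return false;   /* the core fails the invalidation with -EAGAIN */

        mutex_lock(&ctx->lock);
        /* must happen under the lock so a racing fault sees it in read_retry() */
        mmu_interval_set_seq(mni, cur_seq);
        my_zap_device_mappings(ctx, range->start, range->end);  /* hypothetical */
        mutex_unlock(&ctx->lock);
        return true;
}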
|
||||
void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
|
||||
{
|
||||
struct ib_odp_caps *caps = &dev->odp_caps;
|
||||
|
@ -419,7 +433,7 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
|
|||
|
||||
odp = ib_umem_odp_alloc_child(to_ib_umem_odp(imr->umem),
|
||||
idx * MLX5_IMR_MTT_SIZE,
|
||||
MLX5_IMR_MTT_SIZE);
|
||||
MLX5_IMR_MTT_SIZE, &mlx5_mn_ops);
|
||||
if (IS_ERR(odp))
|
||||
return ERR_CAST(odp);
|
||||
|
||||
|
@ -606,8 +620,9 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
|
|||
u64 user_va, size_t bcnt, u32 *bytes_mapped,
|
||||
u32 flags)
|
||||
{
|
||||
int current_seq, page_shift, ret, np;
|
||||
int page_shift, ret, np;
|
||||
bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE;
|
||||
unsigned long current_seq;
|
||||
u64 access_mask;
|
||||
u64 start_idx, page_mask;
|
||||
|
||||
|
@ -619,12 +634,7 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
|
|||
if (odp->umem.writable && !downgrade)
|
||||
access_mask |= ODP_WRITE_ALLOWED_BIT;
|
||||
|
||||
current_seq = READ_ONCE(odp->notifiers_seq);
|
||||
/*
|
||||
* Ensure the sequence number is valid for some time before we call
|
||||
* gup.
|
||||
*/
|
||||
smp_rmb();
|
||||
current_seq = mmu_interval_read_begin(&odp->notifier);
|
||||
|
||||
np = ib_umem_odp_map_dma_pages(odp, user_va, bcnt, access_mask,
|
||||
current_seq);
|
||||
|
@ -632,7 +642,7 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
|
|||
return np;
|
||||
|
||||
mutex_lock(&odp->umem_mutex);
|
||||
if (!ib_umem_mmu_notifier_retry(odp, current_seq)) {
|
||||
if (!mmu_interval_read_retry(&odp->notifier, current_seq)) {
|
||||
/*
|
||||
* No need to check whether the MTTs really belong to
|
||||
* this MR, since ib_umem_odp_map_dma_pages already
|
||||
|
@ -662,19 +672,6 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
|
|||
return np << (page_shift - PAGE_SHIFT);
|
||||
|
||||
out:
|
||||
if (ret == -EAGAIN) {
|
||||
unsigned long timeout = msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT);
|
||||
|
||||
if (!wait_for_completion_timeout(&odp->notifier_completion,
|
||||
timeout)) {
|
||||
mlx5_ib_warn(
|
||||
mr->dev,
|
||||
"timeout waiting for mmu notifier. seq %d against %d. notifiers_count=%d\n",
|
||||
current_seq, odp->notifiers_seq,
|
||||
odp->notifiers_count);
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -1622,7 +1619,6 @@ void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent)
|
|||
|
||||
static const struct ib_device_ops mlx5_ib_dev_odp_ops = {
|
||||
.advise_mr = mlx5_ib_advise_mr,
|
||||
.invalidate_range = mlx5_ib_invalidate_range,
|
||||
};
|
||||
|
||||
int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
|
||||
|
|
|
@ -21,15 +21,8 @@ struct gntdev_dmabuf_priv;
|
|||
struct gntdev_priv {
|
||||
/* Maps with visible offsets in the file descriptor. */
|
||||
struct list_head maps;
|
||||
/*
|
||||
* Maps that are not visible; will be freed on munmap.
|
||||
* Only populated if populate_freeable_maps == 1
|
||||
*/
|
||||
struct list_head freeable_maps;
|
||||
/* lock protects maps and freeable_maps. */
|
||||
struct mutex lock;
|
||||
struct mm_struct *mm;
|
||||
struct mmu_notifier mn;
|
||||
|
||||
#ifdef CONFIG_XEN_GRANT_DMA_ALLOC
|
||||
/* Device for which DMA memory is allocated. */
|
||||
|
@ -49,6 +42,7 @@ struct gntdev_unmap_notify {
|
|||
};
|
||||
|
||||
struct gntdev_grant_map {
|
||||
struct mmu_interval_notifier notifier;
|
||||
struct list_head next;
|
||||
struct vm_area_struct *vma;
|
||||
int index;
|
||||
|
|
|
@ -63,7 +63,6 @@ MODULE_PARM_DESC(limit, "Maximum number of grants that may be mapped by "
|
|||
static atomic_t pages_mapped = ATOMIC_INIT(0);
|
||||
|
||||
static int use_ptemod;
|
||||
#define populate_freeable_maps use_ptemod
|
||||
|
||||
static int unmap_grant_pages(struct gntdev_grant_map *map,
|
||||
int offset, int pages);
|
||||
|
@ -249,12 +248,6 @@ void gntdev_put_map(struct gntdev_priv *priv, struct gntdev_grant_map *map)
|
|||
evtchn_put(map->notify.event);
|
||||
}
|
||||
|
||||
if (populate_freeable_maps && priv) {
|
||||
mutex_lock(&priv->lock);
|
||||
list_del(&map->next);
|
||||
mutex_unlock(&priv->lock);
|
||||
}
|
||||
|
||||
if (map->pages && !use_ptemod)
|
||||
unmap_grant_pages(map, 0, map->count);
|
||||
gntdev_free_map(map);
|
||||
|
@ -444,16 +437,9 @@ static void gntdev_vma_close(struct vm_area_struct *vma)
|
|||
|
||||
pr_debug("gntdev_vma_close %p\n", vma);
|
||||
if (use_ptemod) {
|
||||
/* It is possible that an mmu notifier could be running
|
||||
* concurrently, so take priv->lock to ensure that the vma won't
|
||||
* vanishing during the unmap_grant_pages call, since we will
|
||||
* spin here until that completes. Such a concurrent call will
|
||||
* not do any unmapping, since that has been done prior to
|
||||
* closing the vma, but it may still iterate the unmap_ops list.
|
||||
*/
|
||||
mutex_lock(&priv->lock);
|
||||
WARN_ON(map->vma != vma);
|
||||
mmu_interval_notifier_remove(&map->notifier);
|
||||
map->vma = NULL;
|
||||
mutex_unlock(&priv->lock);
|
||||
}
|
||||
vma->vm_private_data = NULL;
|
||||
gntdev_put_map(priv, map);
|
||||
|
@ -475,109 +461,44 @@ static const struct vm_operations_struct gntdev_vmops = {
|
|||
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
static bool in_range(struct gntdev_grant_map *map,
|
||||
unsigned long start, unsigned long end)
|
||||
{
|
||||
if (!map->vma)
|
||||
return false;
|
||||
if (map->vma->vm_start >= end)
|
||||
return false;
|
||||
if (map->vma->vm_end <= start)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static int unmap_if_in_range(struct gntdev_grant_map *map,
|
||||
unsigned long start, unsigned long end,
|
||||
bool blockable)
|
||||
static bool gntdev_invalidate(struct mmu_interval_notifier *mn,
|
||||
const struct mmu_notifier_range *range,
|
||||
unsigned long cur_seq)
|
||||
{
|
||||
struct gntdev_grant_map *map =
|
||||
container_of(mn, struct gntdev_grant_map, notifier);
|
||||
unsigned long mstart, mend;
|
||||
int err;
|
||||
|
||||
if (!in_range(map, start, end))
|
||||
return 0;
|
||||
if (!mmu_notifier_range_blockable(range))
|
||||
return false;
|
||||
|
||||
if (!blockable)
|
||||
return -EAGAIN;
|
||||
/*
|
||||
* If the VMA is split or otherwise changed the notifier is not
|
||||
* updated, but we don't want to process VA's outside the modified
|
||||
* VMA. FIXME: It would be much more understandable to just prevent
|
||||
* modifying the VMA in the first place.
|
||||
*/
|
||||
if (map->vma->vm_start >= range->end ||
|
||||
map->vma->vm_end <= range->start)
|
||||
return true;
|
||||
|
||||
mstart = max(start, map->vma->vm_start);
|
||||
mend = min(end, map->vma->vm_end);
|
||||
mstart = max(range->start, map->vma->vm_start);
|
||||
mend = min(range->end, map->vma->vm_end);
|
||||
pr_debug("map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n",
|
||||
map->index, map->count,
|
||||
map->vma->vm_start, map->vma->vm_end,
|
||||
start, end, mstart, mend);
|
||||
range->start, range->end, mstart, mend);
|
||||
err = unmap_grant_pages(map,
|
||||
(mstart - map->vma->vm_start) >> PAGE_SHIFT,
|
||||
(mend - mstart) >> PAGE_SHIFT);
|
||||
WARN_ON(err);
|
||||
|
||||
return 0;
|
||||
return true;
|
||||
}
|
||||
|
||||
static int mn_invl_range_start(struct mmu_notifier *mn,
|
||||
const struct mmu_notifier_range *range)
|
||||
{
|
||||
struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn);
|
||||
struct gntdev_grant_map *map;
|
||||
int ret = 0;
|
||||
|
||||
if (mmu_notifier_range_blockable(range))
|
||||
mutex_lock(&priv->lock);
|
||||
else if (!mutex_trylock(&priv->lock))
|
||||
return -EAGAIN;
|
||||
|
||||
list_for_each_entry(map, &priv->maps, next) {
|
||||
ret = unmap_if_in_range(map, range->start, range->end,
|
||||
mmu_notifier_range_blockable(range));
|
||||
if (ret)
|
||||
goto out_unlock;
|
||||
}
|
||||
list_for_each_entry(map, &priv->freeable_maps, next) {
|
||||
ret = unmap_if_in_range(map, range->start, range->end,
|
||||
mmu_notifier_range_blockable(range));
|
||||
if (ret)
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
out_unlock:
|
||||
mutex_unlock(&priv->lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void mn_release(struct mmu_notifier *mn,
|
||||
struct mm_struct *mm)
|
||||
{
|
||||
struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn);
|
||||
struct gntdev_grant_map *map;
|
||||
int err;
|
||||
|
||||
mutex_lock(&priv->lock);
|
||||
list_for_each_entry(map, &priv->maps, next) {
|
||||
if (!map->vma)
|
||||
continue;
|
||||
pr_debug("map %d+%d (%lx %lx)\n",
|
||||
map->index, map->count,
|
||||
map->vma->vm_start, map->vma->vm_end);
|
||||
err = unmap_grant_pages(map, /* offset */ 0, map->count);
|
||||
WARN_ON(err);
|
||||
}
|
||||
list_for_each_entry(map, &priv->freeable_maps, next) {
|
||||
if (!map->vma)
|
||||
continue;
|
||||
pr_debug("map %d+%d (%lx %lx)\n",
|
||||
map->index, map->count,
|
||||
map->vma->vm_start, map->vma->vm_end);
|
||||
err = unmap_grant_pages(map, /* offset */ 0, map->count);
|
||||
WARN_ON(err);
|
||||
}
|
||||
mutex_unlock(&priv->lock);
|
||||
}
|
||||
|
||||
static const struct mmu_notifier_ops gntdev_mmu_ops = {
|
||||
.release = mn_release,
|
||||
.invalidate_range_start = mn_invl_range_start,
|
||||
static const struct mmu_interval_notifier_ops gntdev_mmu_ops = {
|
||||
.invalidate = gntdev_invalidate,
|
||||
};
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
@ -592,7 +513,6 @@ static int gntdev_open(struct inode *inode, struct file *flip)
|
|||
return -ENOMEM;
|
||||
|
||||
INIT_LIST_HEAD(&priv->maps);
|
||||
INIT_LIST_HEAD(&priv->freeable_maps);
|
||||
mutex_init(&priv->lock);
|
||||
|
||||
#ifdef CONFIG_XEN_GNTDEV_DMABUF
|
||||
|
@ -604,17 +524,6 @@ static int gntdev_open(struct inode *inode, struct file *flip)
|
|||
}
|
||||
#endif
|
||||
|
||||
if (use_ptemod) {
|
||||
priv->mm = get_task_mm(current);
|
||||
if (!priv->mm) {
|
||||
kfree(priv);
|
||||
return -ENOMEM;
|
||||
}
|
||||
priv->mn.ops = &gntdev_mmu_ops;
|
||||
ret = mmu_notifier_register(&priv->mn, priv->mm);
|
||||
mmput(priv->mm);
|
||||
}
|
||||
|
||||
if (ret) {
|
||||
kfree(priv);
|
||||
return ret;
|
||||
|
@ -644,16 +553,12 @@ static int gntdev_release(struct inode *inode, struct file *flip)
|
|||
list_del(&map->next);
|
||||
gntdev_put_map(NULL /* already removed */, map);
|
||||
}
|
||||
WARN_ON(!list_empty(&priv->freeable_maps));
|
||||
mutex_unlock(&priv->lock);
|
||||
|
||||
#ifdef CONFIG_XEN_GNTDEV_DMABUF
|
||||
gntdev_dmabuf_fini(priv->dmabuf_priv);
|
||||
#endif
|
||||
|
||||
if (use_ptemod)
|
||||
mmu_notifier_unregister(&priv->mn, priv->mm);
|
||||
|
||||
kfree(priv);
|
||||
return 0;
|
||||
}
|
||||
|
@ -714,8 +619,6 @@ static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv *priv,
|
|||
map = gntdev_find_map_index(priv, op.index >> PAGE_SHIFT, op.count);
|
||||
if (map) {
|
||||
list_del(&map->next);
|
||||
if (populate_freeable_maps)
|
||||
list_add_tail(&map->next, &priv->freeable_maps);
|
||||
err = 0;
|
||||
}
|
||||
mutex_unlock(&priv->lock);
|
||||
|
@ -1087,11 +990,6 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
|
|||
goto unlock_out;
|
||||
if (use_ptemod && map->vma)
|
||||
goto unlock_out;
|
||||
if (use_ptemod && priv->mm != vma->vm_mm) {
|
||||
pr_warn("Huh? Other mm?\n");
|
||||
goto unlock_out;
|
||||
}
|
||||
|
||||
refcount_inc(&map->users);
|
||||
|
||||
vma->vm_ops = &gntdev_vmops;
|
||||
|
@ -1102,10 +1000,6 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
|
|||
vma->vm_flags |= VM_DONTCOPY;
|
||||
|
||||
vma->vm_private_data = map;
|
||||
|
||||
if (use_ptemod)
|
||||
map->vma = vma;
|
||||
|
||||
if (map->flags) {
|
||||
if ((vma->vm_flags & VM_WRITE) &&
|
||||
(map->flags & GNTMAP_readonly))
|
||||
|
@ -1116,8 +1010,28 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
|
|||
map->flags |= GNTMAP_readonly;
|
||||
}
|
||||
|
||||
if (use_ptemod) {
|
||||
map->vma = vma;
|
||||
err = mmu_interval_notifier_insert_locked(
|
||||
&map->notifier, vma->vm_mm, vma->vm_start,
|
||||
vma->vm_end - vma->vm_start, &gntdev_mmu_ops);
|
||||
if (err)
|
||||
goto out_unlock_put;
|
||||
}
|
||||
mutex_unlock(&priv->lock);
|
||||
|
||||
/*
|
||||
* gntdev takes the address of the PTE in find_grant_ptes() and passes
|
||||
* it to the hypervisor in gntdev_map_grant_pages(). The purpose of
|
||||
* the notifier is to prevent the hypervisor pointer to the PTE from
|
||||
* going stale.
|
||||
*
|
||||
* Since this vma's mappings can't be touched without the mmap_sem,
|
||||
* and we are holding it now, there is no need for the notifier_range
|
||||
* locking pattern.
|
||||
*/
|
||||
mmu_interval_read_begin(&map->notifier);
|
||||
|
||||
if (use_ptemod) {
|
||||
map->pages_vm_start = vma->vm_start;
|
||||
err = apply_to_page_range(vma->vm_mm, vma->vm_start,
|
||||
|
@ -1166,8 +1080,11 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
|
|||
mutex_unlock(&priv->lock);
|
||||
out_put_map:
|
||||
if (use_ptemod) {
|
||||
map->vma = NULL;
|
||||
unmap_grant_pages(map, 0, map->count);
|
||||
if (map->vma) {
|
||||
mmu_interval_notifier_remove(&map->notifier);
|
||||
map->vma = NULL;
|
||||
}
|
||||
}
|
||||
gntdev_put_map(priv, map);
|
||||
return err;
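Note the registration shape gntdev uses here: because an ->mmap handler already runs with mmap_sem held, the _locked insert variant is used, and the matching mmu_interval_notifier_remove() sits in the vma close path. Stripped down to a sketch (my_* names are hypothetical, the ops table is assumed to exist elsewhere):

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/mmu_notifier.h>

extern const struct mmu_interval_notifier_ops my_mmu_ops;       /* assumed */

struct my_map {                                 /* hypothetical */
        struct mmu_interval_notifier notifier;
};

static int my_mmap(struct file *file, struct vm_area_struct *vma)
{
        struct my_map *map = file->private_data;        /* hypothetical */

        /* mmap_sem is held by the mmap path, hence the _locked variant */
        return mmu_interval_notifier_insert_locked(&map->notifier, vma->vm_mm,
                                                   vma->vm_start,
                                                   vma->vm_end - vma->vm_start,
                                                   &my_mmu_ops);
}

static void my_vma_close(struct vm_area_struct *vma)
{
        struct my_map *map = vma->vm_private_data;      /* hypothetical */

        mmu_interval_notifier_remove(&map->notifier);   /* may sleep */
}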
@ -62,37 +62,12 @@
|
|||
#include <linux/kconfig.h>
|
||||
#include <asm/pgtable.h>
|
||||
|
||||
#ifdef CONFIG_HMM_MIRROR
|
||||
|
||||
#include <linux/device.h>
|
||||
#include <linux/migrate.h>
|
||||
#include <linux/memremap.h>
|
||||
#include <linux/completion.h>
|
||||
#include <linux/mmu_notifier.h>
|
||||
|
||||
|
||||
/*
|
||||
* struct hmm - HMM per mm struct
|
||||
*
|
||||
* @mm: mm struct this HMM struct is bound to
|
||||
* @lock: lock protecting ranges list
|
||||
* @ranges: list of range being snapshotted
|
||||
* @mirrors: list of mirrors for this mm
|
||||
* @mmu_notifier: mmu notifier to track updates to CPU page table
|
||||
* @mirrors_sem: read/write semaphore protecting the mirrors list
|
||||
* @wq: wait queue for user waiting on a range invalidation
|
||||
* @notifiers: count of active mmu notifiers
|
||||
*/
|
||||
struct hmm {
|
||||
struct mmu_notifier mmu_notifier;
|
||||
spinlock_t ranges_lock;
|
||||
struct list_head ranges;
|
||||
struct list_head mirrors;
|
||||
struct rw_semaphore mirrors_sem;
|
||||
wait_queue_head_t wq;
|
||||
long notifiers;
|
||||
};
|
||||
|
||||
/*
|
||||
* hmm_pfn_flag_e - HMM flag enums
|
||||
*
|
||||
|
@ -145,6 +120,8 @@ enum hmm_pfn_value_e {
|
|||
/*
|
||||
* struct hmm_range - track invalidation lock on virtual address range
|
||||
*
|
||||
* @notifier: a mmu_interval_notifier that includes the start/end
|
||||
* @notifier_seq: result of mmu_interval_read_begin()
|
||||
* @hmm: the core HMM structure this range is active against
|
||||
* @vma: the vm area struct for the range
|
||||
* @list: all range lock are on a list
|
||||
|
@ -159,8 +136,8 @@ enum hmm_pfn_value_e {
|
|||
* @valid: pfns array did not change since it has been fill by an HMM function
|
||||
*/
|
||||
struct hmm_range {
|
||||
struct hmm *hmm;
|
||||
struct list_head list;
|
||||
struct mmu_interval_notifier *notifier;
|
||||
unsigned long notifier_seq;
|
||||
unsigned long start;
|
||||
unsigned long end;
|
||||
uint64_t *pfns;
|
||||
|
@ -169,32 +146,8 @@ struct hmm_range {
|
|||
uint64_t default_flags;
|
||||
uint64_t pfn_flags_mask;
|
||||
uint8_t pfn_shift;
|
||||
bool valid;
|
||||
};
|
||||
|
||||
/*
|
||||
* hmm_range_wait_until_valid() - wait for range to be valid
|
||||
* @range: range affected by invalidation to wait on
|
||||
* @timeout: time out for wait in ms (ie abort wait after that period of time)
|
||||
* Return: true if the range is valid, false otherwise.
|
||||
*/
|
||||
static inline bool hmm_range_wait_until_valid(struct hmm_range *range,
|
||||
unsigned long timeout)
|
||||
{
|
||||
return wait_event_timeout(range->hmm->wq, range->valid,
|
||||
msecs_to_jiffies(timeout)) != 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* hmm_range_valid() - test if a range is valid or not
|
||||
* @range: range
|
||||
* Return: true if the range is valid, false otherwise.
|
||||
*/
|
||||
static inline bool hmm_range_valid(struct hmm_range *range)
|
||||
{
|
||||
return range->valid;
|
||||
}
|
||||
|
||||
/*
|
||||
* hmm_device_entry_to_page() - return struct page pointed to by a device entry
|
||||
* @range: range use to decode device entry value
|
||||
|
@ -264,120 +217,6 @@ static inline uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range,
|
|||
range->flags[HMM_PFN_VALID];
|
||||
}
|
||||
|
||||
/*
|
||||
* Mirroring: how to synchronize device page table with CPU page table.
|
||||
*
|
||||
* A device driver that is participating in HMM mirroring must always
|
||||
* synchronize with CPU page table updates. For this, device drivers can either
|
||||
* directly use mmu_notifier APIs or they can use the hmm_mirror API. Device
|
||||
* drivers can decide to register one mirror per device per process, or just
|
||||
* one mirror per process for a group of devices. The pattern is:
|
||||
*
|
||||
* int device_bind_address_space(..., struct mm_struct *mm, ...)
|
||||
* {
|
||||
* struct device_address_space *das;
|
||||
*
|
||||
* // Device driver specific initialization, and allocation of das
|
||||
* // which contains an hmm_mirror struct as one of its fields.
|
||||
* ...
|
||||
*
|
||||
* ret = hmm_mirror_register(&das->mirror, mm, &device_mirror_ops);
|
||||
* if (ret) {
|
||||
* // Cleanup on error
|
||||
* return ret;
|
||||
* }
|
||||
*
|
||||
* // Other device driver specific initialization
|
||||
* ...
|
||||
* }
|
||||
*
|
||||
* Once an hmm_mirror is registered for an address space, the device driver
|
||||
* will get callbacks through sync_cpu_device_pagetables() operation (see
|
||||
* hmm_mirror_ops struct).
|
||||
*
|
||||
* Device driver must not free the struct containing the hmm_mirror struct
|
||||
* before calling hmm_mirror_unregister(). The expected usage is to do that when
|
||||
* the device driver is unbinding from an address space.
|
||||
*
|
||||
*
|
||||
* void device_unbind_address_space(struct device_address_space *das)
|
||||
* {
|
||||
* // Device driver specific cleanup
|
||||
* ...
|
||||
*
|
||||
* hmm_mirror_unregister(&das->mirror);
|
||||
*
|
||||
* // Other device driver specific cleanup, and now das can be freed
|
||||
* ...
|
||||
* }
|
||||
*/
|
||||
|
||||
struct hmm_mirror;
|
||||
|
||||
/*
|
||||
* struct hmm_mirror_ops - HMM mirror device operations callback
|
||||
*
|
||||
* @update: callback to update range on a device
|
||||
*/
|
||||
struct hmm_mirror_ops {
|
||||
/* release() - release hmm_mirror
|
||||
*
|
||||
* @mirror: pointer to struct hmm_mirror
|
||||
*
|
||||
* This is called when the mm_struct is being released. The callback
|
||||
* must ensure that all access to any pages obtained from this mirror
|
||||
* is halted before the callback returns. All future access should
|
||||
* fault.
|
||||
*/
|
||||
void (*release)(struct hmm_mirror *mirror);
|
||||
|
||||
/* sync_cpu_device_pagetables() - synchronize page tables
|
||||
*
|
||||
* @mirror: pointer to struct hmm_mirror
|
||||
* @update: update information (see struct mmu_notifier_range)
|
||||
* Return: -EAGAIN if mmu_notifier_range_blockable(update) is false
|
||||
* and callback needs to block, 0 otherwise.
|
||||
*
|
||||
* This callback ultimately originates from mmu_notifiers when the CPU
|
||||
* page table is updated. The device driver must update its page table
|
||||
* in response to this callback. The update argument tells what action
|
||||
* to perform.
|
||||
*
|
||||
* The device driver must not return from this callback until the device
|
||||
* page tables are completely updated (TLBs flushed, etc); this is a
|
||||
* synchronous call.
|
||||
*/
|
||||
int (*sync_cpu_device_pagetables)(
|
||||
struct hmm_mirror *mirror,
|
||||
const struct mmu_notifier_range *update);
|
||||
};
|
||||
|
||||
/*
|
||||
* struct hmm_mirror - mirror struct for a device driver
|
||||
*
|
||||
* @hmm: pointer to struct hmm (which is unique per mm_struct)
|
||||
* @ops: device driver callback for HMM mirror operations
|
||||
* @list: for list of mirrors of a given mm
|
||||
*
|
||||
* Each address space (mm_struct) being mirrored by a device must register one
|
||||
* instance of an hmm_mirror struct with HMM. HMM will track the list of all
|
||||
* mirrors for each mm_struct.
|
||||
*/
|
||||
struct hmm_mirror {
|
||||
struct hmm *hmm;
|
||||
const struct hmm_mirror_ops *ops;
|
||||
struct list_head list;
|
||||
};
|
||||
|
||||
int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm);
|
||||
void hmm_mirror_unregister(struct hmm_mirror *mirror);
|
||||
|
||||
/*
|
||||
* Please see Documentation/vm/hmm.rst for how to use the range API.
|
||||
*/
|
||||
int hmm_range_register(struct hmm_range *range, struct hmm_mirror *mirror);
|
||||
void hmm_range_unregister(struct hmm_range *range);
|
||||
|
||||
/*
|
||||
* Retry fault if non-blocking, drop mmap_sem and return -EAGAIN in that case.
|
||||
*/
|
||||
|
@ -386,16 +225,17 @@ void hmm_range_unregister(struct hmm_range *range);
|
|||
/* Don't fault in missing PTEs, just snapshot the current state. */
|
||||
#define HMM_FAULT_SNAPSHOT (1 << 1)
|
||||
|
||||
#ifdef CONFIG_HMM_MIRROR
|
||||
/*
|
||||
* Please see Documentation/vm/hmm.rst for how to use the range API.
|
||||
*/
|
||||
long hmm_range_fault(struct hmm_range *range, unsigned int flags);
|
||||
|
||||
long hmm_range_dma_map(struct hmm_range *range,
|
||||
struct device *device,
|
||||
dma_addr_t *daddrs,
|
||||
unsigned int flags);
|
||||
long hmm_range_dma_unmap(struct hmm_range *range,
|
||||
struct device *device,
|
||||
dma_addr_t *daddrs,
|
||||
bool dirty);
|
||||
#else
|
||||
static inline long hmm_range_fault(struct hmm_range *range, unsigned int flags)
|
||||
{
|
||||
return -EOPNOTSUPP;
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* HMM_RANGE_DEFAULT_TIMEOUT - default timeout (ms) when waiting for a range
|
||||
|
@ -406,6 +246,4 @@ long hmm_range_dma_unmap(struct hmm_range *range,
|
|||
*/
|
||||
#define HMM_RANGE_DEFAULT_TIMEOUT 1000
|
||||
|
||||
#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
|
||||
|
||||
#endif /* LINUX_HMM_H */
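With the hmm and list members gone from hmm_range, a driver drives hmm_range_fault() through its own mmu_interval_notifier plus the shared seq-count retry instead of hmm_range_register()/hmm_range_wait_until_valid(). A rough sketch, assuming a hypothetical my_obj that embeds a registered notifier and the mutex its invalidate() callback takes, and hypothetical flag/value tables:

#include <linux/hmm.h>
#include <linux/mmu_notifier.h>

static int my_snapshot(struct my_obj *obj, struct mm_struct *mm,
                       unsigned long addr, unsigned long npages,
                       uint64_t *pfns)
{
        struct hmm_range range = {
                .notifier       = &obj->notifier,
                .start          = addr,
                .end            = addr + (npages << PAGE_SHIFT),
                .pfns           = pfns,
                .flags          = my_hmm_flags,         /* hypothetical tables */
                .values         = my_hmm_values,
                .pfn_shift      = PAGE_SHIFT,
        };
        long ret;

again:
        range.notifier_seq = mmu_interval_read_begin(&obj->notifier);

        down_read(&mm->mmap_sem);
        ret = hmm_range_fault(&range, 0);
        up_read(&mm->mmap_sem);
        if (ret == -EBUSY)
                goto again;             /* collided with an invalidation */
        if (ret < 0)
                return ret;

        mutex_lock(&obj->lock);         /* same lock as obj's invalidate() */
        if (mmu_interval_read_retry(&obj->notifier, range.notifier_seq)) {
                mutex_unlock(&obj->lock);
                goto again;
        }
        /* pfns[] is now stable until the next invalidation of this range */
        mutex_unlock(&obj->lock);
        return 0;
}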
@ -6,9 +6,12 @@
|
|||
#include <linux/spinlock.h>
|
||||
#include <linux/mm_types.h>
|
||||
#include <linux/srcu.h>
|
||||
#include <linux/interval_tree.h>
|
||||
|
||||
struct mmu_notifier_mm;
|
||||
struct mmu_notifier;
|
||||
struct mmu_notifier_ops;
|
||||
struct mmu_notifier_range;
|
||||
struct mmu_interval_notifier;
|
||||
|
||||
/**
|
||||
* enum mmu_notifier_event - reason for the mmu notifier callback
|
||||
|
@ -31,6 +34,9 @@ struct mmu_notifier_ops;
|
|||
* access flags). User should soft dirty the page in the end callback to make
|
||||
* sure that anyone relying on soft dirtiness catches pages that might be written
|
||||
* through non CPU mappings.
|
||||
*
|
||||
* @MMU_NOTIFY_RELEASE: used during mmu_interval_notifier invalidate to signal
|
||||
* that the mm refcount is zero and the range is no longer accessible.
|
||||
*/
|
||||
enum mmu_notifier_event {
|
||||
MMU_NOTIFY_UNMAP = 0,
|
||||
|
@ -38,38 +44,11 @@ enum mmu_notifier_event {
|
|||
MMU_NOTIFY_PROTECTION_VMA,
|
||||
MMU_NOTIFY_PROTECTION_PAGE,
|
||||
MMU_NOTIFY_SOFT_DIRTY,
|
||||
};
|
||||
|
||||
#ifdef CONFIG_MMU_NOTIFIER
|
||||
|
||||
#ifdef CONFIG_LOCKDEP
|
||||
extern struct lockdep_map __mmu_notifier_invalidate_range_start_map;
|
||||
#endif
|
||||
|
||||
/*
|
||||
* The mmu notifier_mm structure is allocated and installed in
|
||||
* mm->mmu_notifier_mm inside the mm_take_all_locks() protected
|
||||
* critical section and it's released only when mm_count reaches zero
|
||||
* in mmdrop().
|
||||
*/
|
||||
struct mmu_notifier_mm {
|
||||
/* all mmu notifiers registerd in this mm are queued in this list */
|
||||
struct hlist_head list;
|
||||
/* to serialize the list modifications and hlist_unhashed */
|
||||
spinlock_t lock;
|
||||
MMU_NOTIFY_RELEASE,
|
||||
};
|
||||
|
||||
#define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0)
|
||||
|
||||
struct mmu_notifier_range {
|
||||
struct vm_area_struct *vma;
|
||||
struct mm_struct *mm;
|
||||
unsigned long start;
|
||||
unsigned long end;
|
||||
unsigned flags;
|
||||
enum mmu_notifier_event event;
|
||||
};
|
||||
|
||||
struct mmu_notifier_ops {
|
||||
/*
|
||||
* Called either by mmu_notifier_unregister or when the mm is
|
||||
|
@ -249,6 +228,41 @@ struct mmu_notifier {
|
|||
unsigned int users;
|
||||
};
|
||||
|
||||
/**
|
||||
* struct mmu_interval_notifier_ops
|
||||
* @invalidate: Upon return the caller must stop using any SPTEs within this
|
||||
* range. This function can sleep. Return false only if sleeping
|
||||
* was required but mmu_notifier_range_blockable(range) is false.
|
||||
*/
|
||||
struct mmu_interval_notifier_ops {
|
||||
bool (*invalidate)(struct mmu_interval_notifier *mni,
|
||||
const struct mmu_notifier_range *range,
|
||||
unsigned long cur_seq);
|
||||
};
|
||||
|
||||
struct mmu_interval_notifier {
|
||||
struct interval_tree_node interval_tree;
|
||||
const struct mmu_interval_notifier_ops *ops;
|
||||
struct mm_struct *mm;
|
||||
struct hlist_node deferred_item;
|
||||
unsigned long invalidate_seq;
|
||||
};
|
||||
|
||||
#ifdef CONFIG_MMU_NOTIFIER
|
||||
|
||||
#ifdef CONFIG_LOCKDEP
|
||||
extern struct lockdep_map __mmu_notifier_invalidate_range_start_map;
|
||||
#endif
|
||||
|
||||
struct mmu_notifier_range {
|
||||
struct vm_area_struct *vma;
|
||||
struct mm_struct *mm;
|
||||
unsigned long start;
|
||||
unsigned long end;
|
||||
unsigned flags;
|
||||
enum mmu_notifier_event event;
|
||||
};
|
||||
|
||||
static inline int mm_has_notifiers(struct mm_struct *mm)
|
||||
{
|
||||
return unlikely(mm->mmu_notifier_mm);
|
||||
|
@ -275,6 +289,81 @@ extern int __mmu_notifier_register(struct mmu_notifier *mn,
|
|||
struct mm_struct *mm);
|
||||
extern void mmu_notifier_unregister(struct mmu_notifier *mn,
|
||||
struct mm_struct *mm);
|
||||
|
||||
unsigned long mmu_interval_read_begin(struct mmu_interval_notifier *mni);
|
||||
int mmu_interval_notifier_insert(struct mmu_interval_notifier *mni,
|
||||
struct mm_struct *mm, unsigned long start,
|
||||
unsigned long length,
|
||||
const struct mmu_interval_notifier_ops *ops);
|
||||
int mmu_interval_notifier_insert_locked(
|
||||
struct mmu_interval_notifier *mni, struct mm_struct *mm,
|
||||
unsigned long start, unsigned long length,
|
||||
const struct mmu_interval_notifier_ops *ops);
|
||||
void mmu_interval_notifier_remove(struct mmu_interval_notifier *mni);
|
||||
|
||||
/**
|
||||
* mmu_interval_set_seq - Save the invalidation sequence
|
||||
* @mni: The mni passed to invalidate
|
||||
* @cur_seq: The cur_seq passed to the invalidate() callback
|
||||
*
|
||||
* This must be called unconditionally from the invalidate callback of a
|
||||
* struct mmu_interval_notifier_ops under the same lock that is used to call
|
||||
* mmu_interval_read_retry(). It updates the sequence number for later use by
|
||||
* mmu_interval_read_retry(). The provided cur_seq will always be odd.
|
||||
*
|
||||
* If the caller does not call mmu_interval_read_begin() or
|
||||
* mmu_interval_read_retry() then this call is not required.
|
||||
*/
|
||||
static inline void mmu_interval_set_seq(struct mmu_interval_notifier *mni,
|
||||
unsigned long cur_seq)
|
||||
{
|
||||
WRITE_ONCE(mni->invalidate_seq, cur_seq);
|
||||
}
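To make the expected callback pattern concrete, here is a minimal, illustrative sketch (not part of this series) of a driver-side invalidate() implementation. The struct my_interval, its item_lock mutex, and my_invalidate()/my_interval_ops are hypothetical names; the sketch assumes the same mutex is later held around mmu_interval_read_retry(), as the comment above requires.

/* Illustrative only -- hypothetical driver glue, not from this series. */
#include <linux/mmu_notifier.h>
#include <linux/mutex.h>

struct my_interval {
	struct mmu_interval_notifier notifier;
	struct mutex item_lock;	/* also held around mmu_interval_read_retry() */
	/* ... device page table state for this VA range ... */
};

static bool my_invalidate(struct mmu_interval_notifier *mni,
			  const struct mmu_notifier_range *range,
			  unsigned long cur_seq)
{
	struct my_interval *mi =
		container_of(mni, struct my_interval, notifier);

	if (mmu_notifier_range_blockable(range))
		mutex_lock(&mi->item_lock);
	else if (!mutex_trylock(&mi->item_lock))
		return false;	/* only legal when blocking is not allowed */

	/* Record the sequence under the driver lock, as required above */
	mmu_interval_set_seq(mni, cur_seq);
	/* ... tear down device mappings covering [range->start, range->end) ... */
	mutex_unlock(&mi->item_lock);
	return true;
}

static const struct mmu_interval_notifier_ops my_interval_ops = {
	.invalidate = my_invalidate,
};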
|
||||
|
||||
/**
|
||||
* mmu_interval_read_retry - End a read side critical section against a VA range
|
||||
* @mni: The range
|
||||
* @seq: The return of the paired mmu_interval_read_begin()
|
||||
*
|
||||
* This MUST be called under a user provided lock that is also held
|
||||
* unconditionally by op->invalidate() when it calls mmu_interval_set_seq().
|
||||
*
|
||||
* Each call should be paired with a single mmu_interval_read_begin() and
|
||||
* should be used to conclude the read side.
|
||||
*
|
||||
* Returns true if an invalidation collided with this critical section, and
|
||||
* the caller should retry.
|
||||
*/
|
||||
static inline bool mmu_interval_read_retry(struct mmu_interval_notifier *mni,
|
||||
unsigned long seq)
|
||||
{
|
||||
return mni->invalidate_seq != seq;
|
||||
}
|
||||
|
||||
/**
|
||||
* mmu_interval_check_retry - Test if a collision has occurred
|
||||
* @mni: The range
|
||||
* @seq: The return of the matching mmu_interval_read_begin()
|
||||
*
|
||||
* This can be used in the critical section between mmu_interval_read_begin()
|
||||
* and mmu_interval_read_retry(). A return of true indicates an invalidation
|
||||
* has collided with this critical region and a future
|
||||
* mmu_interval_read_retry() will return true.
|
||||
*
|
||||
* False is not reliable and only suggests a collision may not have
|
||||
* occurred. It can be called many times and does not have to hold the user
|
||||
* provided lock.
|
||||
*
|
||||
* This call can be used as part of loops and other expensive operations to
|
||||
* expedite a retry.
|
||||
*/
|
||||
static inline bool mmu_interval_check_retry(struct mmu_interval_notifier *mni,
|
||||
unsigned long seq)
|
||||
{
|
||||
/* Pairs with the WRITE_ONCE in mmu_interval_set_seq() */
|
||||
return READ_ONCE(mni->invalidate_seq) != seq;
|
||||
}
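For illustration, a hedged sketch of the matching read side, reusing the hypothetical my_interval state from the sketch above: take the sequence with mmu_interval_read_begin(), pin the page outside the driver lock, then establish the device mapping only if mmu_interval_read_retry() reports no collision. get_user_pages_fast() here merely stands in for whatever faulting path a real driver uses.

/* Illustrative read side for one page; error handling trimmed. */
static int my_map_one_page(struct my_interval *mi, unsigned long addr)
{
	unsigned long seq;
	struct page *page;

	do {
		seq = mmu_interval_read_begin(&mi->notifier);

		/* Faulting/pinning may sleep; do it outside the driver lock */
		if (get_user_pages_fast(addr, 1, FOLL_WRITE, &page) != 1)
			return -EFAULT;

		mutex_lock(&mi->item_lock);
		if (mmu_interval_read_retry(&mi->notifier, seq)) {
			/* An invalidation collided; undo and try again */
			mutex_unlock(&mi->item_lock);
			put_page(page);
			continue;
		}
		/* ... program the device page table with 'page' ... */
		mutex_unlock(&mi->item_lock);
		return 0;
	} while (true);
}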
|
||||
|
||||
extern void __mmu_notifier_mm_destroy(struct mm_struct *mm);
|
||||
extern void __mmu_notifier_release(struct mm_struct *mm);
|
||||
extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
|
||||
|
|
|
@ -35,11 +35,11 @@
|
|||
|
||||
#include <rdma/ib_umem.h>
|
||||
#include <rdma/ib_verbs.h>
|
||||
#include <linux/interval_tree.h>
|
||||
|
||||
struct ib_umem_odp {
|
||||
struct ib_umem umem;
|
||||
struct ib_ucontext_per_mm *per_mm;
|
||||
struct mmu_interval_notifier notifier;
|
||||
struct pid *tgid;
|
||||
|
||||
/*
|
||||
* An array of the pages included in the on-demand paging umem.
|
||||
|
@ -62,13 +62,8 @@ struct ib_umem_odp {
|
|||
struct mutex umem_mutex;
|
||||
void *private; /* for the HW driver to use. */
|
||||
|
||||
int notifiers_seq;
|
||||
int notifiers_count;
|
||||
int npages;
|
||||
|
||||
/* Tree tracking */
|
||||
struct interval_tree_node interval_tree;
|
||||
|
||||
/*
|
||||
* An implicit odp umem cannot be DMA mapped, has 0 length, and serves
|
||||
* only as an anchor for the driver to hold onto the per_mm. FIXME:
|
||||
|
@ -77,7 +72,6 @@ struct ib_umem_odp {
|
|||
*/
|
||||
bool is_implicit_odp;
|
||||
|
||||
struct completion notifier_completion;
|
||||
unsigned int page_shift;
|
||||
};
|
||||
|
||||
|
@ -89,13 +83,13 @@ static inline struct ib_umem_odp *to_ib_umem_odp(struct ib_umem *umem)
|
|||
/* Returns the first page of an ODP umem. */
|
||||
static inline unsigned long ib_umem_start(struct ib_umem_odp *umem_odp)
|
||||
{
|
||||
return umem_odp->interval_tree.start;
|
||||
return umem_odp->notifier.interval_tree.start;
|
||||
}
|
||||
|
||||
/* Returns the address of the page after the last one of an ODP umem. */
|
||||
static inline unsigned long ib_umem_end(struct ib_umem_odp *umem_odp)
|
||||
{
|
||||
return umem_odp->interval_tree.last + 1;
|
||||
return umem_odp->notifier.interval_tree.last + 1;
|
||||
}
|
||||
|
||||
static inline size_t ib_umem_odp_num_pages(struct ib_umem_odp *umem_odp)
|
||||
|
@ -119,21 +113,15 @@ static inline size_t ib_umem_odp_num_pages(struct ib_umem_odp *umem_odp)
|
|||
|
||||
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
|
||||
|
||||
struct ib_ucontext_per_mm {
|
||||
struct mmu_notifier mn;
|
||||
struct pid *tgid;
|
||||
|
||||
struct rb_root_cached umem_tree;
|
||||
/* Protects umem_tree */
|
||||
struct rw_semaphore umem_rwsem;
|
||||
};
|
||||
|
||||
struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr,
|
||||
size_t size, int access);
|
||||
struct ib_umem_odp *
|
||||
ib_umem_odp_get(struct ib_udata *udata, unsigned long addr, size_t size,
|
||||
int access, const struct mmu_interval_notifier_ops *ops);
|
||||
struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_udata *udata,
|
||||
int access);
|
||||
struct ib_umem_odp *ib_umem_odp_alloc_child(struct ib_umem_odp *root_umem,
|
||||
unsigned long addr, size_t size);
|
||||
struct ib_umem_odp *
|
||||
ib_umem_odp_alloc_child(struct ib_umem_odp *root_umem, unsigned long addr,
|
||||
size_t size,
|
||||
const struct mmu_interval_notifier_ops *ops);
|
||||
void ib_umem_odp_release(struct ib_umem_odp *umem_odp);
|
||||
|
||||
int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 start_offset,
|
||||
|
@ -143,39 +131,11 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 start_offset,
|
|||
void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 start_offset,
|
||||
u64 bound);
|
||||
|
||||
typedef int (*umem_call_back)(struct ib_umem_odp *item, u64 start, u64 end,
|
||||
void *cookie);
|
||||
/*
|
||||
* Call the callback on each ib_umem in the range. Returns the logical or of
|
||||
* the return values of the functions called.
|
||||
*/
|
||||
int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root,
|
||||
u64 start, u64 end,
|
||||
umem_call_back cb,
|
||||
bool blockable, void *cookie);
|
||||
|
||||
static inline int ib_umem_mmu_notifier_retry(struct ib_umem_odp *umem_odp,
|
||||
unsigned long mmu_seq)
|
||||
{
|
||||
/*
|
||||
* This code is strongly based on the KVM code from
|
||||
* mmu_notifier_retry. Should be called with
|
||||
* the relevant locks taken (umem_odp->umem_mutex
|
||||
* and the ucontext umem_mutex semaphore locked for read).
|
||||
*/
|
||||
|
||||
if (unlikely(umem_odp->notifiers_count))
|
||||
return 1;
|
||||
if (umem_odp->notifiers_seq != mmu_seq)
|
||||
return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
|
||||
|
||||
static inline struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata,
|
||||
unsigned long addr,
|
||||
size_t size, int access)
|
||||
static inline struct ib_umem_odp *
|
||||
ib_umem_odp_get(struct ib_udata *udata, unsigned long addr, size_t size,
|
||||
int access, const struct mmu_interval_notifier_ops *ops)
|
||||
{
|
||||
return ERR_PTR(-EINVAL);
|
||||
}
|
||||
|
|
|
@ -2451,8 +2451,6 @@ struct ib_device_ops {
|
|||
u64 iova);
|
||||
int (*unmap_fmr)(struct list_head *fmr_list);
|
||||
int (*dealloc_fmr)(struct ib_fmr *fmr);
|
||||
void (*invalidate_range)(struct ib_umem_odp *umem_odp,
|
||||
unsigned long start, unsigned long end);
|
||||
int (*attach_mcast)(struct ib_qp *qp, union ib_gid *gid, u16 lid);
|
||||
int (*detach_mcast)(struct ib_qp *qp, union ib_gid *gid, u16 lid);
|
||||
struct ib_xrcd *(*alloc_xrcd)(struct ib_device *device,
|
||||
|
|
|
@ -40,7 +40,6 @@
|
|||
#include <linux/binfmts.h>
|
||||
#include <linux/mman.h>
|
||||
#include <linux/mmu_notifier.h>
|
||||
#include <linux/hmm.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/vmacache.h>
|
||||
|
|
|
@ -284,6 +284,7 @@ config VIRT_TO_BUS
|
|||
config MMU_NOTIFIER
|
||||
bool
|
||||
select SRCU
|
||||
select INTERVAL_TREE
|
||||
|
||||
config KSM
|
||||
bool "Enable KSM for page merging"
|
||||
|
@ -674,7 +675,6 @@ config DEV_PAGEMAP_OPS
|
|||
config HMM_MIRROR
|
||||
bool
|
||||
depends on MMU
|
||||
depends on MMU_NOTIFIER
|
||||
|
||||
config DEVICE_PRIVATE
|
||||
bool "Unaddressable device memory (GPU memory, ...)"
|
||||
|
|
531
mm/hmm.c
|
@ -26,193 +26,6 @@
|
|||
#include <linux/mmu_notifier.h>
|
||||
#include <linux/memory_hotplug.h>
|
||||
|
||||
static struct mmu_notifier *hmm_alloc_notifier(struct mm_struct *mm)
|
||||
{
|
||||
struct hmm *hmm;
|
||||
|
||||
hmm = kzalloc(sizeof(*hmm), GFP_KERNEL);
|
||||
if (!hmm)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
init_waitqueue_head(&hmm->wq);
|
||||
INIT_LIST_HEAD(&hmm->mirrors);
|
||||
init_rwsem(&hmm->mirrors_sem);
|
||||
INIT_LIST_HEAD(&hmm->ranges);
|
||||
spin_lock_init(&hmm->ranges_lock);
|
||||
hmm->notifiers = 0;
|
||||
return &hmm->mmu_notifier;
|
||||
}
|
||||
|
||||
static void hmm_free_notifier(struct mmu_notifier *mn)
|
||||
{
|
||||
struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
|
||||
|
||||
WARN_ON(!list_empty(&hmm->ranges));
|
||||
WARN_ON(!list_empty(&hmm->mirrors));
|
||||
kfree(hmm);
|
||||
}
|
||||
|
||||
static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
|
||||
{
|
||||
struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
|
||||
struct hmm_mirror *mirror;
|
||||
|
||||
/*
|
||||
* Since hmm_range_register() holds the mmget() lock hmm_release() is
|
||||
* prevented as long as a range exists.
|
||||
*/
|
||||
WARN_ON(!list_empty_careful(&hmm->ranges));
|
||||
|
||||
down_read(&hmm->mirrors_sem);
|
||||
list_for_each_entry(mirror, &hmm->mirrors, list) {
|
||||
/*
|
||||
* Note: The driver is not allowed to trigger
|
||||
* hmm_mirror_unregister() from this thread.
|
||||
*/
|
||||
if (mirror->ops->release)
|
||||
mirror->ops->release(mirror);
|
||||
}
|
||||
up_read(&hmm->mirrors_sem);
|
||||
}
|
||||
|
||||
static void notifiers_decrement(struct hmm *hmm)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&hmm->ranges_lock, flags);
|
||||
hmm->notifiers--;
|
||||
if (!hmm->notifiers) {
|
||||
struct hmm_range *range;
|
||||
|
||||
list_for_each_entry(range, &hmm->ranges, list) {
|
||||
if (range->valid)
|
||||
continue;
|
||||
range->valid = true;
|
||||
}
|
||||
wake_up_all(&hmm->wq);
|
||||
}
|
||||
spin_unlock_irqrestore(&hmm->ranges_lock, flags);
|
||||
}
|
||||
|
||||
static int hmm_invalidate_range_start(struct mmu_notifier *mn,
|
||||
const struct mmu_notifier_range *nrange)
|
||||
{
|
||||
struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
|
||||
struct hmm_mirror *mirror;
|
||||
struct hmm_range *range;
|
||||
unsigned long flags;
|
||||
int ret = 0;
|
||||
|
||||
spin_lock_irqsave(&hmm->ranges_lock, flags);
|
||||
hmm->notifiers++;
|
||||
list_for_each_entry(range, &hmm->ranges, list) {
|
||||
if (nrange->end < range->start || nrange->start >= range->end)
|
||||
continue;
|
||||
|
||||
range->valid = false;
|
||||
}
|
||||
spin_unlock_irqrestore(&hmm->ranges_lock, flags);
|
||||
|
||||
if (mmu_notifier_range_blockable(nrange))
|
||||
down_read(&hmm->mirrors_sem);
|
||||
else if (!down_read_trylock(&hmm->mirrors_sem)) {
|
||||
ret = -EAGAIN;
|
||||
goto out;
|
||||
}
|
||||
|
||||
list_for_each_entry(mirror, &hmm->mirrors, list) {
|
||||
int rc;
|
||||
|
||||
rc = mirror->ops->sync_cpu_device_pagetables(mirror, nrange);
|
||||
if (rc) {
|
||||
if (WARN_ON(mmu_notifier_range_blockable(nrange) ||
|
||||
rc != -EAGAIN))
|
||||
continue;
|
||||
ret = -EAGAIN;
|
||||
break;
|
||||
}
|
||||
}
|
||||
up_read(&hmm->mirrors_sem);
|
||||
|
||||
out:
|
||||
if (ret)
|
||||
notifiers_decrement(hmm);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void hmm_invalidate_range_end(struct mmu_notifier *mn,
|
||||
const struct mmu_notifier_range *nrange)
|
||||
{
|
||||
struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
|
||||
|
||||
notifiers_decrement(hmm);
|
||||
}
|
||||
|
||||
static const struct mmu_notifier_ops hmm_mmu_notifier_ops = {
|
||||
.release = hmm_release,
|
||||
.invalidate_range_start = hmm_invalidate_range_start,
|
||||
.invalidate_range_end = hmm_invalidate_range_end,
|
||||
.alloc_notifier = hmm_alloc_notifier,
|
||||
.free_notifier = hmm_free_notifier,
|
||||
};
|
||||
|
||||
/*
|
||||
* hmm_mirror_register() - register a mirror against an mm
|
||||
*
|
||||
* @mirror: new mirror struct to register
|
||||
* @mm: mm to register against
|
||||
* Return: 0 on success, -ENOMEM if no memory, -EINVAL if invalid arguments
|
||||
*
|
||||
* To start mirroring a process address space, the device driver must register
|
||||
* an HMM mirror struct.
|
||||
*
|
||||
* The caller cannot unregister the hmm_mirror while any ranges are
|
||||
* registered.
|
||||
*
|
||||
* Callers using this function must put a call to mmu_notifier_synchronize()
|
||||
* in their module exit functions.
|
||||
*/
|
||||
int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm)
|
||||
{
|
||||
struct mmu_notifier *mn;
|
||||
|
||||
lockdep_assert_held_write(&mm->mmap_sem);
|
||||
|
||||
/* Sanity check */
|
||||
if (!mm || !mirror || !mirror->ops)
|
||||
return -EINVAL;
|
||||
|
||||
mn = mmu_notifier_get_locked(&hmm_mmu_notifier_ops, mm);
|
||||
if (IS_ERR(mn))
|
||||
return PTR_ERR(mn);
|
||||
mirror->hmm = container_of(mn, struct hmm, mmu_notifier);
|
||||
|
||||
down_write(&mirror->hmm->mirrors_sem);
|
||||
list_add(&mirror->list, &mirror->hmm->mirrors);
|
||||
up_write(&mirror->hmm->mirrors_sem);
|
||||
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(hmm_mirror_register);
|
||||
|
||||
/*
|
||||
* hmm_mirror_unregister() - unregister a mirror
|
||||
*
|
||||
* @mirror: mirror struct to unregister
|
||||
*
|
||||
* Stop mirroring a process address space, and cleanup.
|
||||
*/
|
||||
void hmm_mirror_unregister(struct hmm_mirror *mirror)
|
||||
{
|
||||
struct hmm *hmm = mirror->hmm;
|
||||
|
||||
down_write(&hmm->mirrors_sem);
|
||||
list_del(&mirror->list);
|
||||
up_write(&hmm->mirrors_sem);
|
||||
mmu_notifier_put(&hmm->mmu_notifier);
|
||||
}
|
||||
EXPORT_SYMBOL(hmm_mirror_unregister);
|
||||
|
||||
struct hmm_vma_walk {
|
||||
struct hmm_range *range;
|
||||
struct dev_pagemap *pgmap;
|
||||
|
@ -252,18 +65,15 @@ static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr,
|
|||
return -EFAULT;
|
||||
}
|
||||
|
||||
static int hmm_pfns_bad(unsigned long addr,
|
||||
unsigned long end,
|
||||
struct mm_walk *walk)
|
||||
static int hmm_pfns_fill(unsigned long addr, unsigned long end,
|
||||
struct hmm_range *range, enum hmm_pfn_value_e value)
|
||||
{
|
||||
struct hmm_vma_walk *hmm_vma_walk = walk->private;
|
||||
struct hmm_range *range = hmm_vma_walk->range;
|
||||
uint64_t *pfns = range->pfns;
|
||||
unsigned long i;
|
||||
|
||||
i = (addr - range->start) >> PAGE_SHIFT;
|
||||
for (; addr < end; addr += PAGE_SIZE, i++)
|
||||
pfns[i] = range->values[HMM_PFN_ERROR];
|
||||
pfns[i] = range->values[value];
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -532,8 +342,14 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
|
|||
if (unlikely(!hmm_vma_walk->pgmap))
|
||||
return -EBUSY;
|
||||
} else if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pte_special(pte)) {
|
||||
*pfn = range->values[HMM_PFN_SPECIAL];
|
||||
return -EFAULT;
|
||||
if (!is_zero_pfn(pte_pfn(pte))) {
|
||||
*pfn = range->values[HMM_PFN_SPECIAL];
|
||||
return -EFAULT;
|
||||
}
|
||||
/*
|
||||
* Since each architecture defines a struct page for the zero
|
||||
* page, just fall through and treat it like a normal page.
|
||||
*/
|
||||
}
|
||||
|
||||
*pfn = hmm_device_entry_from_pfn(range, pte_pfn(pte)) | cpu_flags;
|
||||
|
@ -584,7 +400,7 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
|
|||
}
|
||||
return 0;
|
||||
} else if (!pmd_present(pmd))
|
||||
return hmm_pfns_bad(start, end, walk);
|
||||
return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
|
||||
|
||||
if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) {
|
||||
/*
|
||||
|
@ -612,7 +428,7 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
|
|||
* recover.
|
||||
*/
|
||||
if (pmd_bad(pmd))
|
||||
return hmm_pfns_bad(start, end, walk);
|
||||
return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
|
||||
|
||||
ptep = pte_offset_map(pmdp, addr);
|
||||
i = (addr - range->start) >> PAGE_SHIFT;
|
||||
|
@ -770,93 +586,55 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
|
|||
#define hmm_vma_walk_hugetlb_entry NULL
|
||||
#endif /* CONFIG_HUGETLB_PAGE */
|
||||
|
||||
static void hmm_pfns_clear(struct hmm_range *range,
|
||||
uint64_t *pfns,
|
||||
unsigned long addr,
|
||||
unsigned long end)
|
||||
static int hmm_vma_walk_test(unsigned long start, unsigned long end,
|
||||
struct mm_walk *walk)
|
||||
{
|
||||
for (; addr < end; addr += PAGE_SIZE, pfns++)
|
||||
*pfns = range->values[HMM_PFN_NONE];
|
||||
}
|
||||
|
||||
/*
|
||||
* hmm_range_register() - start tracking change to CPU page table over a range
|
||||
* @range: range
|
||||
* @mm: the mm struct for the range of virtual address
|
||||
*
|
||||
* Return: 0 on success, -EFAULT if the address space is no longer valid
|
||||
*
|
||||
* Track updates to the CPU page table see include/linux/hmm.h
|
||||
*/
|
||||
int hmm_range_register(struct hmm_range *range, struct hmm_mirror *mirror)
|
||||
{
|
||||
struct hmm *hmm = mirror->hmm;
|
||||
unsigned long flags;
|
||||
|
||||
range->valid = false;
|
||||
range->hmm = NULL;
|
||||
|
||||
if ((range->start & (PAGE_SIZE - 1)) || (range->end & (PAGE_SIZE - 1)))
|
||||
return -EINVAL;
|
||||
if (range->start >= range->end)
|
||||
return -EINVAL;
|
||||
|
||||
/* Prevent hmm_release() from running while the range is valid */
|
||||
if (!mmget_not_zero(hmm->mmu_notifier.mm))
|
||||
return -EFAULT;
|
||||
|
||||
/* Initialize range to track CPU page table updates. */
|
||||
spin_lock_irqsave(&hmm->ranges_lock, flags);
|
||||
|
||||
range->hmm = hmm;
|
||||
list_add(&range->list, &hmm->ranges);
|
||||
struct hmm_vma_walk *hmm_vma_walk = walk->private;
|
||||
struct hmm_range *range = hmm_vma_walk->range;
|
||||
struct vm_area_struct *vma = walk->vma;
|
||||
|
||||
/*
|
||||
* If there are any concurrent notifiers we have to wait for them for
|
||||
* the range to be valid (see hmm_range_wait_until_valid()).
|
||||
* Skip vma ranges that don't have struct page backing them or
|
||||
* map I/O devices directly.
|
||||
*/
|
||||
if (!hmm->notifiers)
|
||||
range->valid = true;
|
||||
spin_unlock_irqrestore(&hmm->ranges_lock, flags);
|
||||
if (vma->vm_flags & (VM_IO | VM_PFNMAP | VM_MIXEDMAP))
|
||||
return -EFAULT;
|
||||
|
||||
/*
|
||||
* If the vma does not allow read access, then assume that it does not
|
||||
* allow write access either. HMM does not support architectures
|
||||
* that allow write without read.
|
||||
*/
|
||||
if (!(vma->vm_flags & VM_READ)) {
|
||||
bool fault, write_fault;
|
||||
|
||||
/*
|
||||
* Check to see if a fault is requested for any page in the
|
||||
* range.
|
||||
*/
|
||||
hmm_range_need_fault(hmm_vma_walk, range->pfns +
|
||||
((start - range->start) >> PAGE_SHIFT),
|
||||
(end - start) >> PAGE_SHIFT,
|
||||
0, &fault, &write_fault);
|
||||
if (fault || write_fault)
|
||||
return -EFAULT;
|
||||
|
||||
hmm_pfns_fill(start, end, range, HMM_PFN_NONE);
|
||||
hmm_vma_walk->last = end;
|
||||
|
||||
/* Skip this vma and continue processing the next vma. */
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(hmm_range_register);
|
||||
|
||||
/*
|
||||
* hmm_range_unregister() - stop tracking change to CPU page table over a range
|
||||
* @range: range
|
||||
*
|
||||
* Range struct is used to track updates to the CPU page table after a call to
|
||||
* hmm_range_register(). See include/linux/hmm.h for how to use it.
|
||||
*/
|
||||
void hmm_range_unregister(struct hmm_range *range)
|
||||
{
|
||||
struct hmm *hmm = range->hmm;
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&hmm->ranges_lock, flags);
|
||||
list_del_init(&range->list);
|
||||
spin_unlock_irqrestore(&hmm->ranges_lock, flags);
|
||||
|
||||
/* Drop reference taken by hmm_range_register() */
|
||||
mmput(hmm->mmu_notifier.mm);
|
||||
|
||||
/*
|
||||
* The range is now invalid and the ref on the hmm is dropped, so
|
||||
* poison the pointer. Leave other fields in place, for the caller's
|
||||
* use.
|
||||
*/
|
||||
range->valid = false;
|
||||
memset(&range->hmm, POISON_INUSE, sizeof(range->hmm));
|
||||
}
|
||||
EXPORT_SYMBOL(hmm_range_unregister);
|
||||
|
||||
static const struct mm_walk_ops hmm_walk_ops = {
|
||||
.pud_entry = hmm_vma_walk_pud,
|
||||
.pmd_entry = hmm_vma_walk_pmd,
|
||||
.pte_hole = hmm_vma_walk_hole,
|
||||
.hugetlb_entry = hmm_vma_walk_hugetlb_entry,
|
||||
.test_walk = hmm_vma_walk_test,
|
||||
};
|
||||
|
||||
/**
|
||||
|
@ -889,210 +667,27 @@ static const struct mm_walk_ops hmm_walk_ops = {
|
|||
*/
|
||||
long hmm_range_fault(struct hmm_range *range, unsigned int flags)
|
||||
{
|
||||
const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP;
|
||||
unsigned long start = range->start, end;
|
||||
struct hmm_vma_walk hmm_vma_walk;
|
||||
struct hmm *hmm = range->hmm;
|
||||
struct vm_area_struct *vma;
|
||||
struct hmm_vma_walk hmm_vma_walk = {
|
||||
.range = range,
|
||||
.last = range->start,
|
||||
.flags = flags,
|
||||
};
|
||||
struct mm_struct *mm = range->notifier->mm;
|
||||
int ret;
|
||||
|
||||
lockdep_assert_held(&hmm->mmu_notifier.mm->mmap_sem);
|
||||
lockdep_assert_held(&mm->mmap_sem);
|
||||
|
||||
do {
|
||||
/* If range is no longer valid force retry. */
|
||||
if (!range->valid)
|
||||
if (mmu_interval_check_retry(range->notifier,
|
||||
range->notifier_seq))
|
||||
return -EBUSY;
|
||||
ret = walk_page_range(mm, hmm_vma_walk.last, range->end,
|
||||
&hmm_walk_ops, &hmm_vma_walk);
|
||||
} while (ret == -EBUSY);
|
||||
|
||||
vma = find_vma(hmm->mmu_notifier.mm, start);
|
||||
if (vma == NULL || (vma->vm_flags & device_vma))
|
||||
return -EFAULT;
|
||||
|
||||
if (!(vma->vm_flags & VM_READ)) {
|
||||
/*
|
||||
* If the vma does not allow read access, then assume that it
|
||||
* does not allow write access, either. HMM does not
|
||||
* support architectures that allow write without read.
|
||||
*/
|
||||
hmm_pfns_clear(range, range->pfns,
|
||||
range->start, range->end);
|
||||
return -EPERM;
|
||||
}
|
||||
|
||||
hmm_vma_walk.pgmap = NULL;
|
||||
hmm_vma_walk.last = start;
|
||||
hmm_vma_walk.flags = flags;
|
||||
hmm_vma_walk.range = range;
|
||||
end = min(range->end, vma->vm_end);
|
||||
|
||||
walk_page_range(vma->vm_mm, start, end, &hmm_walk_ops,
|
||||
&hmm_vma_walk);
|
||||
|
||||
do {
|
||||
ret = walk_page_range(vma->vm_mm, start, end,
|
||||
&hmm_walk_ops, &hmm_vma_walk);
|
||||
start = hmm_vma_walk.last;
|
||||
|
||||
/* Keep trying while the range is valid. */
|
||||
} while (ret == -EBUSY && range->valid);
|
||||
|
||||
if (ret) {
|
||||
unsigned long i;
|
||||
|
||||
i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
|
||||
hmm_pfns_clear(range, &range->pfns[i],
|
||||
hmm_vma_walk.last, range->end);
|
||||
return ret;
|
||||
}
|
||||
start = end;
|
||||
|
||||
} while (start < range->end);
|
||||
|
||||
if (ret)
|
||||
return ret;
|
||||
return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
|
||||
}
|
||||
EXPORT_SYMBOL(hmm_range_fault);
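As an illustrative sketch only (again reusing the hypothetical my_interval/item_lock names from earlier), a caller might drive hmm_range_fault() under this scheme roughly as follows. The hmm_range setup (pfns array and flags/values tables) is assumed to have been done elsewhere, the caller is assumed to hold a reference on the mm (e.g. via mmget()), 0 could be replaced by HMM_FAULT_SNAPSHOT to only snapshot, and -EBUSY from a collided walk simply restarts the loop.

/* Illustrative only: fault/snapshot a range and consume it under the lock. */
static int my_snapshot_range(struct my_interval *mi, struct hmm_range *range)
{
	struct mm_struct *mm = mi->notifier.mm;
	long ret;

	range->notifier = &mi->notifier;

	do {
		range->notifier_seq = mmu_interval_read_begin(&mi->notifier);

		down_read(&mm->mmap_sem);
		ret = hmm_range_fault(range, 0);
		up_read(&mm->mmap_sem);
		if (ret == -EBUSY)
			continue;	/* walk collided, start over */
		if (ret < 0)
			return ret;

		mutex_lock(&mi->item_lock);
		if (mmu_interval_read_retry(&mi->notifier,
					    range->notifier_seq)) {
			mutex_unlock(&mi->item_lock);
			continue;
		}
		/* ... use range->pfns[] while item_lock is held ... */
		mutex_unlock(&mi->item_lock);
		return 0;
	} while (true);
}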
|
||||
|
||||
/**
|
||||
* hmm_range_dma_map - hmm_range_fault() and dma map page all in one.
|
||||
* @range: range being faulted
|
||||
* @device: device to map page to
|
||||
* @daddrs: array of dma addresses for the mapped pages
|
||||
* @flags: HMM_FAULT_*
|
||||
*
|
||||
* Return: the number of pages mapped on success (including zero), or any
|
||||
* status return from hmm_range_fault() otherwise.
|
||||
*/
|
||||
long hmm_range_dma_map(struct hmm_range *range, struct device *device,
|
||||
dma_addr_t *daddrs, unsigned int flags)
|
||||
{
|
||||
unsigned long i, npages, mapped;
|
||||
long ret;
|
||||
|
||||
ret = hmm_range_fault(range, flags);
|
||||
if (ret <= 0)
|
||||
return ret ? ret : -EBUSY;
|
||||
|
||||
npages = (range->end - range->start) >> PAGE_SHIFT;
|
||||
for (i = 0, mapped = 0; i < npages; ++i) {
|
||||
enum dma_data_direction dir = DMA_TO_DEVICE;
|
||||
struct page *page;
|
||||
|
||||
/*
|
||||
* FIXME need to update DMA API to provide invalid DMA address
|
||||
* value instead of a function to test dma address value. This
|
||||
* would remove a lot of dumb code duplicated across many architectures.
|
||||
*
|
||||
* For now setting it to 0 here is good enough as the pfns[]
|
||||
* value is what is used to check what is valid and what isn't.
|
||||
*/
|
||||
daddrs[i] = 0;
|
||||
|
||||
page = hmm_device_entry_to_page(range, range->pfns[i]);
|
||||
if (page == NULL)
|
||||
continue;
|
||||
|
||||
/* Check if range is being invalidated */
|
||||
if (!range->valid) {
|
||||
ret = -EBUSY;
|
||||
goto unmap;
|
||||
}
|
||||
|
||||
/* If it is read and write then map bi-directional. */
|
||||
if (range->pfns[i] & range->flags[HMM_PFN_WRITE])
|
||||
dir = DMA_BIDIRECTIONAL;
|
||||
|
||||
daddrs[i] = dma_map_page(device, page, 0, PAGE_SIZE, dir);
|
||||
if (dma_mapping_error(device, daddrs[i])) {
|
||||
ret = -EFAULT;
|
||||
goto unmap;
|
||||
}
|
||||
|
||||
mapped++;
|
||||
}
|
||||
|
||||
return mapped;
|
||||
|
||||
unmap:
|
||||
for (npages = i, i = 0; (i < npages) && mapped; ++i) {
|
||||
enum dma_data_direction dir = DMA_TO_DEVICE;
|
||||
struct page *page;
|
||||
|
||||
page = hmm_device_entry_to_page(range, range->pfns[i]);
|
||||
if (page == NULL)
|
||||
continue;
|
||||
|
||||
if (dma_mapping_error(device, daddrs[i]))
|
||||
continue;
|
||||
|
||||
/* If it is read and write then map bi-directional. */
|
||||
if (range->pfns[i] & range->flags[HMM_PFN_WRITE])
|
||||
dir = DMA_BIDIRECTIONAL;
|
||||
|
||||
dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir);
|
||||
mapped--;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL(hmm_range_dma_map);
|
||||
|
||||
/**
|
||||
* hmm_range_dma_unmap() - unmap a range that was mapped with hmm_range_dma_map()
|
||||
* @range: range being unmapped
|
||||
* @device: device against which dma map was done
|
||||
* @daddrs: dma address of mapped pages
|
||||
* @dirty: dirty page if it had the write flag set
|
||||
* Return: number of page unmapped on success, -EINVAL otherwise
|
||||
*
|
||||
* Note that the caller MUST abide by the mmu notifier rules or use an HMM mirror and adhere
|
||||
* to the sync_cpu_device_pagetables() callback so that it is safe here to
|
||||
* call set_page_dirty(). The caller must also take appropriate locks to prevent
|
||||
* concurrent mmu notifier or sync_cpu_device_pagetables() callbacks from making progress.
|
||||
*/
|
||||
long hmm_range_dma_unmap(struct hmm_range *range,
|
||||
struct device *device,
|
||||
dma_addr_t *daddrs,
|
||||
bool dirty)
|
||||
{
|
||||
unsigned long i, npages;
|
||||
long cpages = 0;
|
||||
|
||||
/* Sanity check. */
|
||||
if (range->end <= range->start)
|
||||
return -EINVAL;
|
||||
if (!daddrs)
|
||||
return -EINVAL;
|
||||
if (!range->pfns)
|
||||
return -EINVAL;
|
||||
|
||||
npages = (range->end - range->start) >> PAGE_SHIFT;
|
||||
for (i = 0; i < npages; ++i) {
|
||||
enum dma_data_direction dir = DMA_TO_DEVICE;
|
||||
struct page *page;
|
||||
|
||||
page = hmm_device_entry_to_page(range, range->pfns[i]);
|
||||
if (page == NULL)
|
||||
continue;
|
||||
|
||||
/* If it is read and write then map bi-directional. */
|
||||
if (range->pfns[i] & range->flags[HMM_PFN_WRITE]) {
|
||||
dir = DMA_BIDIRECTIONAL;
|
||||
|
||||
/*
|
||||
* See comments in function description on why it is
|
||||
* safe here to call set_page_dirty()
|
||||
*/
|
||||
if (dirty)
|
||||
set_page_dirty(page);
|
||||
}
|
||||
|
||||
/* Unmap and clear pfns/dma address */
|
||||
dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir);
|
||||
range->pfns[i] = range->values[HMM_PFN_NONE];
|
||||
/* FIXME see comments in hmm_vma_dma_map() */
|
||||
daddrs[i] = 0;
|
||||
cpages++;
|
||||
}
|
||||
|
||||
return cpages;
|
||||
}
|
||||
EXPORT_SYMBOL(hmm_range_dma_unmap);
|
||||
|
|
|
@ -12,6 +12,7 @@
|
|||
#include <linux/export.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/err.h>
|
||||
#include <linux/interval_tree.h>
|
||||
#include <linux/srcu.h>
|
||||
#include <linux/rcupdate.h>
|
||||
#include <linux/sched.h>
|
||||
|
@ -27,6 +28,254 @@ struct lockdep_map __mmu_notifier_invalidate_range_start_map = {
|
|||
};
|
||||
#endif
|
||||
|
||||
/*
|
||||
* The mmu notifier_mm structure is allocated and installed in
|
||||
* mm->mmu_notifier_mm inside the mm_take_all_locks() protected
|
||||
* critical section and it's released only when mm_count reaches zero
|
||||
* in mmdrop().
|
||||
*/
|
||||
struct mmu_notifier_mm {
|
||||
/* all mmu notifiers registered in this mm are queued in this list */
|
||||
struct hlist_head list;
|
||||
bool has_itree;
|
||||
/* to serialize the list modifications and hlist_unhashed */
|
||||
spinlock_t lock;
|
||||
unsigned long invalidate_seq;
|
||||
unsigned long active_invalidate_ranges;
|
||||
struct rb_root_cached itree;
|
||||
wait_queue_head_t wq;
|
||||
struct hlist_head deferred_list;
|
||||
};
|
||||
|
||||
/*
|
||||
* This is a collision-retry read-side/write-side 'lock', a lot like a
|
||||
* seqcount; however, this allows multiple write-sides to hold it at
|
||||
* once. Conceptually the write side is protecting the values of the PTEs in
|
||||
* this mm, such that PTES cannot be read into SPTEs (shadow PTEs) while any
|
||||
* writer exists.
|
||||
*
|
||||
* Note that the core mm creates nested invalidate_range_start()/end() regions
|
||||
* within the same thread, and runs invalidate_range_start()/end() in parallel
|
||||
* on multiple CPUs. This is designed to not reduce concurrency or block
|
||||
* progress on the mm side.
|
||||
*
|
||||
* As a secondary function, holding the full write side also serves to prevent
|
||||
* writers for the itree; this is an optimization to avoid extra locking
|
||||
* during invalidate_range_start/end notifiers.
|
||||
*
|
||||
* The write side has two states, fully excluded:
|
||||
* - mm->active_invalidate_ranges != 0
|
||||
* - mnn->invalidate_seq & 1 == True (odd)
|
||||
* - some range on the mm_struct is being invalidated
|
||||
* - the itree is not allowed to change
|
||||
*
|
||||
* And partially excluded:
|
||||
* - mm->active_invalidate_ranges != 0
|
||||
* - mnn->invalidate_seq & 1 == False (even)
|
||||
* - some range on the mm_struct is being invalidated
|
||||
* - the itree is allowed to change
|
||||
*
|
||||
* Operations on mmu_notifier_mm->invalidate_seq (under spinlock):
|
||||
* seq |= 1 # Begin writing
|
||||
* seq++ # Release the writing state
|
||||
* seq & 1 # True if a writer exists
|
||||
*
|
||||
* The latter state avoids some expensive work on inv_end in the common case of
|
||||
* no mni monitoring the VA.
|
||||
*/
|
||||
static bool mn_itree_is_invalidating(struct mmu_notifier_mm *mmn_mm)
|
||||
{
|
||||
lockdep_assert_held(&mmn_mm->lock);
|
||||
return mmn_mm->invalidate_seq & 1;
|
||||
}
|
||||
|
||||
static struct mmu_interval_notifier *
|
||||
mn_itree_inv_start_range(struct mmu_notifier_mm *mmn_mm,
|
||||
const struct mmu_notifier_range *range,
|
||||
unsigned long *seq)
|
||||
{
|
||||
struct interval_tree_node *node;
|
||||
struct mmu_interval_notifier *res = NULL;
|
||||
|
||||
spin_lock(&mmn_mm->lock);
|
||||
mmn_mm->active_invalidate_ranges++;
|
||||
node = interval_tree_iter_first(&mmn_mm->itree, range->start,
|
||||
range->end - 1);
|
||||
if (node) {
|
||||
mmn_mm->invalidate_seq |= 1;
|
||||
res = container_of(node, struct mmu_interval_notifier,
|
||||
interval_tree);
|
||||
}
|
||||
|
||||
*seq = mmn_mm->invalidate_seq;
|
||||
spin_unlock(&mmn_mm->lock);
|
||||
return res;
|
||||
}
|
||||
|
||||
static struct mmu_interval_notifier *
|
||||
mn_itree_inv_next(struct mmu_interval_notifier *mni,
|
||||
const struct mmu_notifier_range *range)
|
||||
{
|
||||
struct interval_tree_node *node;
|
||||
|
||||
node = interval_tree_iter_next(&mni->interval_tree, range->start,
|
||||
range->end - 1);
|
||||
if (!node)
|
||||
return NULL;
|
||||
return container_of(node, struct mmu_interval_notifier, interval_tree);
|
||||
}
|
||||
|
||||
static void mn_itree_inv_end(struct mmu_notifier_mm *mmn_mm)
|
||||
{
|
||||
struct mmu_interval_notifier *mni;
|
||||
struct hlist_node *next;
|
||||
|
||||
spin_lock(&mmn_mm->lock);
|
||||
if (--mmn_mm->active_invalidate_ranges ||
|
||||
!mn_itree_is_invalidating(mmn_mm)) {
|
||||
spin_unlock(&mmn_mm->lock);
|
||||
return;
|
||||
}
|
||||
|
||||
/* Make invalidate_seq even */
|
||||
mmn_mm->invalidate_seq++;
|
||||
|
||||
/*
|
||||
* The inv_end incorporates a deferred mechanism like rtnl_unlock().
|
||||
* Adds and removes are queued until the final inv_end happens then
|
||||
* they are progressed. This arrangement for tree updates is used to
|
||||
* avoid using a blocking lock during invalidate_range_start.
|
||||
*/
|
||||
hlist_for_each_entry_safe(mni, next, &mmn_mm->deferred_list,
|
||||
deferred_item) {
|
||||
if (RB_EMPTY_NODE(&mni->interval_tree.rb))
|
||||
interval_tree_insert(&mni->interval_tree,
|
||||
&mmn_mm->itree);
|
||||
else
|
||||
interval_tree_remove(&mni->interval_tree,
|
||||
&mmn_mm->itree);
|
||||
hlist_del(&mni->deferred_item);
|
||||
}
|
||||
spin_unlock(&mmn_mm->lock);
|
||||
|
||||
wake_up_all(&mmn_mm->wq);
|
||||
}
|
||||
|
||||
/**
|
||||
* mmu_interval_read_begin - Begin a read side critical section against a VA
|
||||
* range
|
||||
* @mni: The range to use
|
||||
*
|
||||
* mmu_interval_read_begin()/mmu_interval_read_retry() implement a
|
||||
* collision-retry scheme similar to seqcount for the VA range under mni. If
|
||||
* the mm invokes invalidation during the critical section then
|
||||
* mmu_interval_read_retry() will return true.
|
||||
*
|
||||
* This is useful to obtain shadow PTEs where teardown or setup of the SPTEs
|
||||
* require a blocking context. The critical region formed by this can sleep,
|
||||
* and the required 'user_lock' can also be a sleeping lock.
|
||||
*
|
||||
* The caller is required to provide a 'user_lock' to serialize both teardown
|
||||
* and setup.
|
||||
*
|
||||
* The return value should be passed to mmu_interval_read_retry().
|
||||
*/
|
||||
unsigned long mmu_interval_read_begin(struct mmu_interval_notifier *mni)
|
||||
{
|
||||
struct mmu_notifier_mm *mmn_mm = mni->mm->mmu_notifier_mm;
|
||||
unsigned long seq;
|
||||
bool is_invalidating;
|
||||
|
||||
/*
|
||||
* If the mni has a different seq value under the user_lock than we
|
||||
* started with then it has collided.
|
||||
*
|
||||
* If the mni currently has the same seq value as the mmn_mm seq, then
|
||||
* it is currently between invalidate_start/end and is colliding.
|
||||
*
|
||||
* The locking looks broadly like this:
|
||||
* mn_tree_invalidate_start(): mmu_interval_read_begin():
|
||||
* spin_lock
|
||||
* seq = READ_ONCE(mni->invalidate_seq);
|
||||
* seq == mmn_mm->invalidate_seq
|
||||
* spin_unlock
|
||||
* spin_lock
|
||||
* seq = ++mmn_mm->invalidate_seq
|
||||
* spin_unlock
|
||||
* op->invalidate_range():
|
||||
* user_lock
|
||||
* mmu_interval_set_seq()
|
||||
* mni->invalidate_seq = seq
|
||||
* user_unlock
|
||||
*
|
||||
* [Required: mmu_interval_read_retry() == true]
|
||||
*
|
||||
* mn_itree_inv_end():
|
||||
* spin_lock
|
||||
* seq = ++mmn_mm->invalidate_seq
|
||||
* spin_unlock
|
||||
*
|
||||
* user_lock
|
||||
* mmu_interval_read_retry():
|
||||
* mni->invalidate_seq != seq
|
||||
* user_unlock
|
||||
*
|
||||
* Barriers are not needed here as any races here are closed by an
|
||||
* eventual mmu_interval_read_retry(), which provides a barrier via the
|
||||
* user_lock.
|
||||
*/
|
||||
spin_lock(&mmn_mm->lock);
|
||||
/* Pairs with the WRITE_ONCE in mmu_interval_set_seq() */
|
||||
seq = READ_ONCE(mni->invalidate_seq);
|
||||
is_invalidating = seq == mmn_mm->invalidate_seq;
|
||||
spin_unlock(&mmn_mm->lock);
|
||||
|
||||
/*
|
||||
* mni->invalidate_seq must always be set to an odd value via
|
||||
* mmu_interval_set_seq() using the provided cur_seq from
|
||||
* mn_itree_inv_start_range(). This ensures that if seq does wrap we
|
||||
* will always clear the below sleep in some reasonable time as
|
||||
* mmn_mm->invalidate_seq is even in the idle state.
|
||||
*/
|
||||
lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
|
||||
lock_map_release(&__mmu_notifier_invalidate_range_start_map);
|
||||
if (is_invalidating)
|
||||
wait_event(mmn_mm->wq,
|
||||
READ_ONCE(mmn_mm->invalidate_seq) != seq);
|
||||
|
||||
/*
|
||||
* Notice that mmu_interval_read_retry() can already be true at this
|
||||
* point, avoiding loops here allows the caller to provide a global
|
||||
* time bound.
|
||||
*/
|
||||
|
||||
return seq;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(mmu_interval_read_begin);
|
||||
|
||||
static void mn_itree_release(struct mmu_notifier_mm *mmn_mm,
|
||||
struct mm_struct *mm)
|
||||
{
|
||||
struct mmu_notifier_range range = {
|
||||
.flags = MMU_NOTIFIER_RANGE_BLOCKABLE,
|
||||
.event = MMU_NOTIFY_RELEASE,
|
||||
.mm = mm,
|
||||
.start = 0,
|
||||
.end = ULONG_MAX,
|
||||
};
|
||||
struct mmu_interval_notifier *mni;
|
||||
unsigned long cur_seq;
|
||||
bool ret;
|
||||
|
||||
for (mni = mn_itree_inv_start_range(mmn_mm, &range, &cur_seq); mni;
|
||||
mni = mn_itree_inv_next(mni, &range)) {
|
||||
ret = mni->ops->invalidate(mni, &range, cur_seq);
|
||||
WARN_ON(!ret);
|
||||
}
|
||||
|
||||
mn_itree_inv_end(mmn_mm);
|
||||
}
|
||||
|
||||
/*
|
||||
* This function can't run concurrently against mmu_notifier_register
|
||||
* because mm->mm_users > 0 during mmu_notifier_register and exit_mmap
|
||||
|
@ -39,7 +288,8 @@ struct lockdep_map __mmu_notifier_invalidate_range_start_map = {
|
|||
* can't go away from under us as exit_mmap holds an mm_count pin
|
||||
* itself.
|
||||
*/
|
||||
void __mmu_notifier_release(struct mm_struct *mm)
|
||||
static void mn_hlist_release(struct mmu_notifier_mm *mmn_mm,
|
||||
struct mm_struct *mm)
|
||||
{
|
||||
struct mmu_notifier *mn;
|
||||
int id;
|
||||
|
@ -49,7 +299,7 @@ void __mmu_notifier_release(struct mm_struct *mm)
|
|||
* ->release returns.
|
||||
*/
|
||||
id = srcu_read_lock(&srcu);
|
||||
hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist)
|
||||
hlist_for_each_entry_rcu(mn, &mmn_mm->list, hlist)
|
||||
/*
|
||||
* If ->release runs before mmu_notifier_unregister it must be
|
||||
* handled, as it's the only way for the driver to flush all
|
||||
|
@ -59,10 +309,9 @@ void __mmu_notifier_release(struct mm_struct *mm)
|
|||
if (mn->ops->release)
|
||||
mn->ops->release(mn, mm);
|
||||
|
||||
spin_lock(&mm->mmu_notifier_mm->lock);
|
||||
while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
|
||||
mn = hlist_entry(mm->mmu_notifier_mm->list.first,
|
||||
struct mmu_notifier,
|
||||
spin_lock(&mmn_mm->lock);
|
||||
while (unlikely(!hlist_empty(&mmn_mm->list))) {
|
||||
mn = hlist_entry(mmn_mm->list.first, struct mmu_notifier,
|
||||
hlist);
|
||||
/*
|
||||
* We arrived before mmu_notifier_unregister so
|
||||
|
@ -72,7 +321,7 @@ void __mmu_notifier_release(struct mm_struct *mm)
|
|||
*/
|
||||
hlist_del_init_rcu(&mn->hlist);
|
||||
}
|
||||
spin_unlock(&mm->mmu_notifier_mm->lock);
|
||||
spin_unlock(&mmn_mm->lock);
|
||||
srcu_read_unlock(&srcu, id);
|
||||
|
||||
/*
|
||||
|
@ -87,6 +336,17 @@ void __mmu_notifier_release(struct mm_struct *mm)
|
|||
synchronize_srcu(&srcu);
|
||||
}
|
||||
|
||||
void __mmu_notifier_release(struct mm_struct *mm)
|
||||
{
|
||||
struct mmu_notifier_mm *mmn_mm = mm->mmu_notifier_mm;
|
||||
|
||||
if (mmn_mm->has_itree)
|
||||
mn_itree_release(mmn_mm, mm);
|
||||
|
||||
if (!hlist_empty(&mmn_mm->list))
|
||||
mn_hlist_release(mmn_mm, mm);
|
||||
}
|
||||
|
||||
/*
|
||||
* If no young bitflag is supported by the hardware, ->clear_flush_young can
|
||||
* unmap the address and return 1 or 0 depending if the mapping previously
|
||||
|
@ -159,14 +419,43 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
|
|||
srcu_read_unlock(&srcu, id);
|
||||
}
|
||||
|
||||
int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
|
||||
static int mn_itree_invalidate(struct mmu_notifier_mm *mmn_mm,
|
||||
const struct mmu_notifier_range *range)
|
||||
{
|
||||
struct mmu_interval_notifier *mni;
|
||||
unsigned long cur_seq;
|
||||
|
||||
for (mni = mn_itree_inv_start_range(mmn_mm, range, &cur_seq); mni;
|
||||
mni = mn_itree_inv_next(mni, range)) {
|
||||
bool ret;
|
||||
|
||||
ret = mni->ops->invalidate(mni, range, cur_seq);
|
||||
if (!ret) {
|
||||
if (WARN_ON(mmu_notifier_range_blockable(range)))
|
||||
continue;
|
||||
goto out_would_block;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
|
||||
out_would_block:
|
||||
/*
|
||||
* On -EAGAIN the non-blocking caller is not allowed to call
|
||||
* invalidate_range_end()
|
||||
*/
|
||||
mn_itree_inv_end(mmn_mm);
|
||||
return -EAGAIN;
|
||||
}
|
||||
|
||||
static int mn_hlist_invalidate_range_start(struct mmu_notifier_mm *mmn_mm,
|
||||
struct mmu_notifier_range *range)
|
||||
{
|
||||
struct mmu_notifier *mn;
|
||||
int ret = 0;
|
||||
int id;
|
||||
|
||||
id = srcu_read_lock(&srcu);
|
||||
hlist_for_each_entry_rcu(mn, &range->mm->mmu_notifier_mm->list, hlist) {
|
||||
hlist_for_each_entry_rcu(mn, &mmn_mm->list, hlist) {
|
||||
if (mn->ops->invalidate_range_start) {
|
||||
int _ret;
|
||||
|
||||
|
@ -190,15 +479,30 @@ int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
|
|||
return ret;
|
||||
}
|
||||
|
||||
void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range,
|
||||
bool only_end)
|
||||
int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
|
||||
{
|
||||
struct mmu_notifier_mm *mmn_mm = range->mm->mmu_notifier_mm;
|
||||
int ret;
|
||||
|
||||
if (mmn_mm->has_itree) {
|
||||
ret = mn_itree_invalidate(mmn_mm, range);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
if (!hlist_empty(&mmn_mm->list))
|
||||
return mn_hlist_invalidate_range_start(mmn_mm, range);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void mn_hlist_invalidate_end(struct mmu_notifier_mm *mmn_mm,
|
||||
struct mmu_notifier_range *range,
|
||||
bool only_end)
|
||||
{
|
||||
struct mmu_notifier *mn;
|
||||
int id;
|
||||
|
||||
lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
|
||||
id = srcu_read_lock(&srcu);
|
||||
hlist_for_each_entry_rcu(mn, &range->mm->mmu_notifier_mm->list, hlist) {
|
||||
hlist_for_each_entry_rcu(mn, &mmn_mm->list, hlist) {
|
||||
/*
|
||||
* Call invalidate_range here too to avoid the need for the
|
||||
* subsystem of having to register an invalidate_range_end
|
||||
|
@ -225,6 +529,19 @@ void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range,
|
|||
}
|
||||
}
|
||||
srcu_read_unlock(&srcu, id);
|
||||
}
|
||||
|
||||
void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range,
|
||||
bool only_end)
|
||||
{
|
||||
struct mmu_notifier_mm *mmn_mm = range->mm->mmu_notifier_mm;
|
||||
|
||||
lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
|
||||
if (mmn_mm->has_itree)
|
||||
mn_itree_inv_end(mmn_mm);
|
||||
|
||||
if (!hlist_empty(&mmn_mm->list))
|
||||
mn_hlist_invalidate_end(mmn_mm, range, only_end);
|
||||
lock_map_release(&__mmu_notifier_invalidate_range_start_map);
|
||||
}
|
||||
|
||||
|
@ -243,8 +560,9 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm,
|
|||
}
|
||||
|
||||
/*
|
||||
* Same as mmu_notifier_register but here the caller must hold the
|
||||
* mmap_sem in write mode.
|
||||
* Same as mmu_notifier_register but here the caller must hold the mmap_sem in
|
||||
* write mode. A NULL mn signals the notifier is being registered for itree
|
||||
* mode.
|
||||
*/
|
||||
int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
|
||||
{
|
||||
|
@ -261,9 +579,6 @@ int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
|
|||
fs_reclaim_release(GFP_KERNEL);
|
||||
}
|
||||
|
||||
mn->mm = mm;
|
||||
mn->users = 1;
|
||||
|
||||
if (!mm->mmu_notifier_mm) {
|
||||
/*
|
||||
* kmalloc cannot be called under mm_take_all_locks(), but we
|
||||
|
@ -271,21 +586,22 @@ int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
|
|||
* the write side of the mmap_sem.
|
||||
*/
|
||||
mmu_notifier_mm =
|
||||
kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL);
|
||||
kzalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL);
|
||||
if (!mmu_notifier_mm)
|
||||
return -ENOMEM;
|
||||
|
||||
INIT_HLIST_HEAD(&mmu_notifier_mm->list);
|
||||
spin_lock_init(&mmu_notifier_mm->lock);
|
||||
mmu_notifier_mm->invalidate_seq = 2;
|
||||
mmu_notifier_mm->itree = RB_ROOT_CACHED;
|
||||
init_waitqueue_head(&mmu_notifier_mm->wq);
|
||||
INIT_HLIST_HEAD(&mmu_notifier_mm->deferred_list);
|
||||
}
|
||||
|
||||
ret = mm_take_all_locks(mm);
|
||||
if (unlikely(ret))
|
||||
goto out_clean;
|
||||
|
||||
/* Pairs with the mmdrop in mmu_notifier_unregister_* */
|
||||
mmgrab(mm);
|
||||
|
||||
/*
|
||||
* Serialize the update against mmu_notifier_unregister. A
|
||||
* side note: mmu_notifier_release can't run concurrently with
|
||||
|
@ -293,13 +609,28 @@ int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
|
|||
* current->mm or explicitly with get_task_mm() or similar).
|
||||
* We can't race against any other mmu notifier method either
|
||||
* thanks to mm_take_all_locks().
|
||||
*
|
||||
* release semantics on the initialization of the mmu_notifier_mm's
|
||||
* contents are provided for unlocked readers. acquire can only be
|
||||
* used while holding the mmgrab or mmget, and is safe because once
|
||||
* created the mmu_notifier_mm is not freed until the mm is
|
||||
* destroyed. As above, users holding the mmap_sem or one of the
|
||||
* mm_take_all_locks() do not need to use acquire semantics.
|
||||
*/
|
||||
if (mmu_notifier_mm)
|
||||
mm->mmu_notifier_mm = mmu_notifier_mm;
|
||||
smp_store_release(&mm->mmu_notifier_mm, mmu_notifier_mm);
|
||||
|
||||
spin_lock(&mm->mmu_notifier_mm->lock);
|
||||
hlist_add_head_rcu(&mn->hlist, &mm->mmu_notifier_mm->list);
|
||||
spin_unlock(&mm->mmu_notifier_mm->lock);
|
||||
if (mn) {
|
||||
/* Pairs with the mmdrop in mmu_notifier_unregister_* */
|
||||
mmgrab(mm);
|
||||
mn->mm = mm;
|
||||
mn->users = 1;
|
||||
|
||||
spin_lock(&mm->mmu_notifier_mm->lock);
|
||||
hlist_add_head_rcu(&mn->hlist, &mm->mmu_notifier_mm->list);
|
||||
spin_unlock(&mm->mmu_notifier_mm->lock);
|
||||
} else
|
||||
mm->mmu_notifier_mm->has_itree = true;
|
||||
|
||||
mm_drop_all_locks(mm);
|
||||
BUG_ON(atomic_read(&mm->mm_users) <= 0);
|
||||
|
@ -516,6 +847,180 @@ void mmu_notifier_put(struct mmu_notifier *mn)
|
|||
}
|
||||
EXPORT_SYMBOL_GPL(mmu_notifier_put);
|
||||
|
||||
static int __mmu_interval_notifier_insert(
|
||||
struct mmu_interval_notifier *mni, struct mm_struct *mm,
|
||||
struct mmu_notifier_mm *mmn_mm, unsigned long start,
|
||||
unsigned long length, const struct mmu_interval_notifier_ops *ops)
|
||||
{
|
||||
mni->mm = mm;
|
||||
mni->ops = ops;
|
||||
RB_CLEAR_NODE(&mni->interval_tree.rb);
|
||||
mni->interval_tree.start = start;
|
||||
/*
|
||||
* Note that the representation of the intervals in the interval tree
|
||||
* considers the ending point as contained in the interval.
|
||||
*/
|
||||
if (length == 0 ||
|
||||
check_add_overflow(start, length - 1, &mni->interval_tree.last))
|
||||
return -EOVERFLOW;
|
||||
|
||||
/* Must call with a mmget() held */
|
||||
if (WARN_ON(atomic_read(&mm->mm_count) <= 0))
|
||||
return -EINVAL;
|
||||
|
||||
/* pairs with mmdrop in mmu_interval_notifier_remove() */
|
||||
mmgrab(mm);
|
||||
|
||||
/*
|
||||
* If some invalidate_range_start/end region is going on in parallel
|
||||
* we don't know what VA ranges are affected, so we must assume this
|
||||
* new range is included.
|
||||
*
|
||||
* If the itree is invalidating then we are not allowed to change
|
||||
* it. Retrying until invalidation is done is tricky due to the
|
||||
* possibility for live lock, instead defer the add to
|
||||
* mn_itree_inv_end() so this algorithm is deterministic.
|
||||
*
|
||||
* In all cases the value for the mni->invalidate_seq should be
|
||||
* odd, see mmu_interval_read_begin()
|
||||
*/
|
||||
spin_lock(&mmn_mm->lock);
|
||||
if (mmn_mm->active_invalidate_ranges) {
|
||||
if (mn_itree_is_invalidating(mmn_mm))
|
||||
hlist_add_head(&mni->deferred_item,
|
||||
&mmn_mm->deferred_list);
|
||||
else {
|
||||
mmn_mm->invalidate_seq |= 1;
|
||||
interval_tree_insert(&mni->interval_tree,
|
||||
&mmn_mm->itree);
|
||||
}
|
||||
mni->invalidate_seq = mmn_mm->invalidate_seq;
|
||||
} else {
|
||||
WARN_ON(mn_itree_is_invalidating(mmn_mm));
|
||||
/*
|
||||
* The starting seq for a mni not under invalidation should be
|
||||
* odd, not equal to the current invalidate_seq and
|
||||
* invalidate_seq should not 'wrap' to the new seq any time
|
||||
* soon.
|
||||
*/
|
||||
mni->invalidate_seq = mmn_mm->invalidate_seq - 1;
|
||||
interval_tree_insert(&mni->interval_tree, &mmn_mm->itree);
|
||||
}
|
||||
spin_unlock(&mmn_mm->lock);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* mmu_interval_notifier_insert - Insert an interval notifier
|
||||
* @mni: Interval notifier to register
|
||||
* @start: Starting virtual address to monitor
|
||||
* @length: Length of the range to monitor
|
||||
* @mm: mm_struct to attach to
|
||||
*
|
||||
* This function subscribes the interval notifier for notifications from the
|
||||
* mm. Upon return the ops related to mmu_interval_notifier will be called
|
||||
* whenever an event that intersects with the given range occurs.
|
||||
*
|
||||
* Upon return the range_notifier may not be present in the interval tree yet.
|
||||
* The caller must use the normal interval notifier read flow via
|
||||
* mmu_interval_read_begin() to establish SPTEs for this range.
|
||||
*/
|
||||
int mmu_interval_notifier_insert(struct mmu_interval_notifier *mni,
|
||||
struct mm_struct *mm, unsigned long start,
|
||||
unsigned long length,
|
||||
const struct mmu_interval_notifier_ops *ops)
|
||||
{
|
||||
struct mmu_notifier_mm *mmn_mm;
|
||||
int ret;
|
||||
|
||||
might_lock(&mm->mmap_sem);
|
||||
|
||||
mmn_mm = smp_load_acquire(&mm->mmu_notifier_mm);
|
||||
if (!mmn_mm || !mmn_mm->has_itree) {
|
||||
ret = mmu_notifier_register(NULL, mm);
|
||||
if (ret)
|
||||
return ret;
|
||||
mmn_mm = mm->mmu_notifier_mm;
|
||||
}
|
||||
return __mmu_interval_notifier_insert(mni, mm, mmn_mm, start, length,
|
||||
ops);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert);
|
||||
|
||||
int mmu_interval_notifier_insert_locked(
|
||||
struct mmu_interval_notifier *mni, struct mm_struct *mm,
|
||||
unsigned long start, unsigned long length,
|
||||
const struct mmu_interval_notifier_ops *ops)
|
||||
{
|
||||
struct mmu_notifier_mm *mmn_mm;
|
||||
int ret;
|
||||
|
||||
lockdep_assert_held_write(&mm->mmap_sem);
|
||||
|
||||
mmn_mm = mm->mmu_notifier_mm;
|
||||
if (!mmn_mm || !mmn_mm->has_itree) {
|
||||
ret = __mmu_notifier_register(NULL, mm);
|
||||
if (ret)
|
||||
return ret;
|
||||
mmn_mm = mm->mmu_notifier_mm;
|
||||
}
|
||||
return __mmu_interval_notifier_insert(mni, mm, mmn_mm, start, length,
|
||||
ops);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert_locked);
|
||||
|
||||
/**
|
||||
* mmu_interval_notifier_remove - Remove an interval notifier
|
||||
* @mni: Interval notifier to unregister
|
||||
*
|
||||
* This function must be paired with mmu_interval_notifier_insert(). It cannot
|
||||
* be called from any ops callback.
|
||||
*
|
||||
* Once this returns ops callbacks are no longer running on other CPUs and
|
||||
* will not be called in future.
|
||||
*/
|
||||
void mmu_interval_notifier_remove(struct mmu_interval_notifier *mni)
|
||||
{
|
||||
struct mm_struct *mm = mni->mm;
|
||||
struct mmu_notifier_mm *mmn_mm = mm->mmu_notifier_mm;
|
||||
unsigned long seq = 0;
|
||||
|
||||
might_sleep();
|
||||
|
||||
spin_lock(&mmn_mm->lock);
|
||||
if (mn_itree_is_invalidating(mmn_mm)) {
|
||||
/*
|
||||
* remove is being called after insert put this on the
|
||||
* deferred list, but before the deferred list was processed.
|
||||
*/
|
||||
if (RB_EMPTY_NODE(&mni->interval_tree.rb)) {
|
||||
hlist_del(&mni->deferred_item);
|
||||
} else {
|
||||
hlist_add_head(&mni->deferred_item,
|
||||
&mmn_mm->deferred_list);
|
||||
seq = mmn_mm->invalidate_seq;
|
||||
}
|
||||
} else {
|
||||
WARN_ON(RB_EMPTY_NODE(&mni->interval_tree.rb));
|
||||
interval_tree_remove(&mni->interval_tree, &mmn_mm->itree);
|
||||
}
|
||||
spin_unlock(&mmn_mm->lock);
|
||||
|
||||
/*
|
||||
* The possible sleep on progress in the invalidation requires the
|
||||
* caller not hold any locks held by invalidation callbacks.
|
||||
*/
|
||||
lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
|
||||
lock_map_release(&__mmu_notifier_invalidate_range_start_map);
|
||||
if (seq)
|
||||
wait_event(mmn_mm->wq,
|
||||
READ_ONCE(mmn_mm->invalidate_seq) != seq);
|
||||
|
||||
/* pairs with mmgrab in mmu_interval_notifier_insert() */
|
||||
mmdrop(mm);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(mmu_interval_notifier_remove);
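Finally, a hedged lifecycle sketch showing where insert/remove might sit in a hypothetical driver: the notifier is inserted once over the user VA range (here from a process-context ioctl, so current->mm is valid and already referenced) and removed at teardown, never from the invalidate() callback itself. my_interval_create()/my_interval_destroy() are made-up names, and my_interval_ops is the hypothetical ops table from the header sketch above.

/* Illustrative lifecycle only; usual driver includes assumed. */
static struct my_interval *my_interval_create(unsigned long start,
					      unsigned long length)
{
	struct my_interval *mi;
	int ret;

	mi = kzalloc(sizeof(*mi), GFP_KERNEL);
	if (!mi)
		return ERR_PTR(-ENOMEM);
	mutex_init(&mi->item_lock);

	ret = mmu_interval_notifier_insert(&mi->notifier, current->mm,
					   start, length, &my_interval_ops);
	if (ret) {
		kfree(mi);
		return ERR_PTR(ret);
	}
	return mi;
}

static void my_interval_destroy(struct my_interval *mi)
{
	/* Must not be called from the invalidate() callback itself */
	mmu_interval_notifier_remove(&mi->notifier);
	kfree(mi);
}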
|
||||
|
||||
/**
|
||||
* mmu_notifier_synchronize - Ensure all mmu_notifiers are freed
|
||||
*
|
||||
|
|