drm/amdkfd: Handle VM faults in KFD

1. Pre-GFX9 the amdgpu ISR saves the vm-fault status and address
   per-vmid. amdkfd needs to get the information from amdgpu through the
   new get_vm_fault_info interface. On GFX9 and later, all the required
   information is in the IH ring.
2. amdkfd unmaps all queues from the faulting process and creates a new
   run-list without the guilty process
3. amdkfd notifies the runtime of the vm fault trap via EVENT_TYPE_MEMORY

Signed-off-by: shaoyun liu <shaoyun.liu@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
This commit is contained in:
shaoyunl 2018-07-11 22:32:50 -04:00 committed by Oded Gabbay
parent b97dfa27ef
commit 2640c3facb
7 changed files with 98 additions and 7 deletions

View File

@ -48,18 +48,19 @@ static bool cik_event_interrupt_isr(struct kfd_dev *dev,
return ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE ||
ihre->source_id == CIK_INTSRC_SDMA_TRAP ||
ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG ||
ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE;
ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE ||
ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT ||
ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT;
}
static void cik_event_interrupt_wq(struct kfd_dev *dev,
const uint32_t *ih_ring_entry)
{
unsigned int pasid;
const struct cik_ih_ring_entry *ihre =
(const struct cik_ih_ring_entry *)ih_ring_entry;
uint32_t context_id = ihre->data & 0xfffffff;
pasid = (ihre->ring_id & 0xffff0000) >> 16;
unsigned int vmid = (ihre->ring_id & 0x0000ff00) >> 8;
unsigned int pasid = (ihre->ring_id & 0xffff0000) >> 16;
if (pasid == 0)
return;
@ -72,6 +73,22 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
kfd_signal_event_interrupt(pasid, context_id & 0xff, 8);
else if (ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE)
kfd_signal_hw_exception_event(pasid);
else if (ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT ||
ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
struct kfd_vm_fault_info info;
kfd_process_vm_fault(dev->dqm, pasid);
memset(&info, 0, sizeof(info));
dev->kfd2kgd->get_vm_fault_info(dev->kgd, &info);
if (!info.page_addr && !info.status)
return;
if (info.vmid == vmid)
kfd_signal_vm_fault_event(dev, pasid, &info);
else
kfd_signal_vm_fault_event(dev, pasid, NULL);
}
}
const struct kfd_event_interrupt_class event_interrupt_class_cik = {

View File

@ -37,6 +37,8 @@ struct cik_ih_ring_entry {
#define CIK_INTSRC_DEQUEUE_COMPLETE 0xC6
#define CIK_INTSRC_SDMA_TRAP 0xE0
#define CIK_INTSRC_SQ_INTERRUPT_MSG 0xEF
#define CIK_INTSRC_GFX_PAGE_INV_FAULT 0x92
#define CIK_INTSRC_GFX_MEM_PROT_FAULT 0x93
#endif

View File

@ -1684,6 +1684,23 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm)
kfree(dqm);
}
/*
 * kfd_process_vm_fault - evict the queues of the process identified by @pasid
 * @dqm:   device queue manager whose evict_process_queues op is invoked
 * @pasid: PASID used to look up the faulting process
 *
 * Looks the process up by PASID and fetches its per-device data for
 * dqm->dev. When per-device data exists, its queues are evicted through
 * dqm->ops.evict_process_queues(). The process obtained from the lookup is
 * always handed back via kfd_unref_process() before returning.
 *
 * Return: -EINVAL if no process matches @pasid; otherwise the result of
 * evict_process_queues(), or 0 when the process has no per-device data.
 */
int kfd_process_vm_fault(struct device_queue_manager *dqm,
			 unsigned int pasid)
{
	struct kfd_process *proc;
	struct kfd_process_device *dev_data;
	int status = 0;

	proc = kfd_lookup_process_by_pasid(pasid);
	if (!proc)
		return -EINVAL;

	dev_data = kfd_get_process_device_data(dqm->dev, proc);
	if (dev_data)
		status = dqm->ops.evict_process_queues(dqm, &dev_data->qpd);

	kfd_unref_process(proc);

	return status;
}
#if defined(CONFIG_DEBUG_FS)
static void seq_reg_dump(struct seq_file *m,

View File

@ -963,3 +963,40 @@ void kfd_signal_hw_exception_event(unsigned int pasid)
mutex_unlock(&p->event_mutex);
kfd_unref_process(p);
}
void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid,
struct kfd_vm_fault_info *info)
{
struct kfd_event *ev;
uint32_t id;
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
struct kfd_hsa_memory_exception_data memory_exception_data;
if (!p)
return; /* Presumably process exited. */
memset(&memory_exception_data, 0, sizeof(memory_exception_data));
memory_exception_data.gpu_id = dev->id;
memory_exception_data.failure.imprecise = 1;
/* Set failure reason */
if (info) {
memory_exception_data.va = (info->page_addr) << PAGE_SHIFT;
memory_exception_data.failure.NotPresent =
info->prot_valid ? 1 : 0;
memory_exception_data.failure.NoExecute =
info->prot_exec ? 1 : 0;
memory_exception_data.failure.ReadOnly =
info->prot_write ? 1 : 0;
memory_exception_data.failure.imprecise = 0;
}
mutex_lock(&p->event_mutex);
id = KFD_FIRST_NONSIGNAL_EVENT_ID;
idr_for_each_entry_continue(&p->event_idr, ev, id)
if (ev->type == KFD_EVENT_TYPE_MEMORY) {
ev->memory_exception_data = memory_exception_data;
set_event(ev);
}
mutex_unlock(&p->event_mutex);
kfd_unref_process(p);
}

View File

@ -57,7 +57,9 @@ static bool event_interrupt_isr_v9(struct kfd_dev *dev,
return source_id == SOC15_INTSRC_CP_END_OF_PIPE ||
source_id == SOC15_INTSRC_SDMA_TRAP ||
source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG ||
source_id == SOC15_INTSRC_CP_BAD_OPCODE;
source_id == SOC15_INTSRC_CP_BAD_OPCODE ||
client_id == SOC15_IH_CLIENTID_VMC ||
client_id == SOC15_IH_CLIENTID_UTCL2;
}
static void event_interrupt_wq_v9(struct kfd_dev *dev,
@ -82,7 +84,19 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
kfd_signal_hw_exception_event(pasid);
else if (client_id == SOC15_IH_CLIENTID_VMC ||
client_id == SOC15_IH_CLIENTID_UTCL2) {
/* TODO */
struct kfd_vm_fault_info info = {0};
uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry);
info.vmid = vmid;
info.mc_id = client_id;
info.page_addr = ih_ring_entry[4] |
(uint64_t)(ih_ring_entry[5] & 0xf) << 32;
info.prot_valid = ring_id & 0x08;
info.prot_read = ring_id & 0x10;
info.prot_write = ring_id & 0x20;
kfd_process_vm_fault(dev->dqm, pasid);
kfd_signal_vm_fault_event(dev, pasid, &info);
}
}

View File

@ -838,6 +838,7 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm);
struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
enum kfd_queue_type type);
void kernel_queue_uninit(struct kernel_queue *kq);
int kfd_process_vm_fault(struct device_queue_manager *dqm, unsigned int pasid);
/* Process Queue Manager */
struct process_queue_node {
@ -964,6 +965,9 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p,
uint64_t *event_page_offset, uint32_t *event_slot_index);
int kfd_event_destroy(struct kfd_process *p, uint32_t event_id);
void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid,
struct kfd_vm_fault_info *info);
void kfd_flush_tlb(struct kfd_process_device *pdd);
int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p);

View File

@ -219,7 +219,7 @@ struct kfd_memory_exception_failure {
__u32 NotPresent; /* Page not present or supervisor privilege */
__u32 ReadOnly; /* Write access to a read-only page */
__u32 NoExecute; /* Execute access to a page marked NX */
__u32 pad;
__u32 imprecise; /* Can't determine the exact fault address */
};
/* memory exception data*/