drm/amdkfd: Add thermal throttling SMI event

Add support for reporting thermal throttling events through SMI.
Also, add a counter to count the number of throttling interrupts
observed and report the count in the SMI event message.

Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
Mukul Joshi 2020-07-23 23:09:57 -04:00 committed by Alex Deucher
parent df9c8d1aa2
commit 2c2b0d880f
10 changed files with 74 additions and 18 deletions

View File

@ -789,4 +789,8 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry)
/*
 * Empty stub: accepts @kfd and does nothing.
 * NOTE(review): sits in a preprocessor branch whose condition is outside
 * this view; presumably the fallback used when the real KFD implementation
 * is compiled out -- confirm against the enclosing #if.
 */
void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd)
{
}
/*
 * Empty stub: accepts @kfd and @throttle_bitmask and does nothing, so
 * callers can invoke it unconditionally.
 * NOTE(review): sits just before the closing #endif of a preprocessor
 * branch whose condition is outside this view; presumably the fallback
 * used when the real KFD implementation is compiled out -- confirm.
 */
void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t throttle_bitmask)
{
}
#endif

View File

@ -270,5 +270,6 @@ int kgd2kfd_resume_mm(struct mm_struct *mm);
int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
struct dma_fence *fence);
void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd);
void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t throttle_bitmask);
#endif /* AMDGPU_AMDKFD_H_INCLUDED */

View File

@ -29,6 +29,7 @@
#include "cwsr_trap_handler.h"
#include "kfd_iommu.h"
#include "amdgpu_amdkfd.h"
#include "kfd_smi_events.h"
#define MQD_SIZE_ALIGNED 768
@ -1245,6 +1246,12 @@ void kfd_dec_compute_active(struct kfd_dev *kfd)
WARN_ONCE(count < 0, "Compute profile ref. count error");
}
/*
 * Forward a thermal throttling notification to the SMI event layer.
 *
 * @kfd:              KFD device the event belongs to; a NULL pointer is
 *                    tolerated and turns the call into a no-op.
 * @throttle_bitmask: throttler status bits, passed through unchanged.
 */
void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t throttle_bitmask)
{
	/* Nothing to report to when there is no KFD device. */
	if (!kfd)
		return;

	kfd_smi_event_update_thermal_throttling(kfd, throttle_bitmask);
}
#if defined(CONFIG_DEBUG_FS)
/* This function will send a package to HIQ to hang the HWS

View File

@ -24,6 +24,7 @@
#include <linux/wait.h>
#include <linux/anon_inodes.h>
#include <uapi/linux/kfd_ioctl.h>
#include "amdgpu.h"
#include "amdgpu_vm.h"
#include "kfd_priv.h"
#include "kfd_smi_events.h"
@ -148,6 +149,54 @@ static int kfd_smi_ev_release(struct inode *inode, struct file *filep)
return 0;
}
/*
 * Deliver an already formatted SMI event message to the clients of @dev.
 *
 * @dev:       device whose smi_clients list is walked (under RCU).
 * @smi_event: event bit; only clients whose events mask contains it
 *             receive the message.
 * @event_msg: message bytes to copy into each client's FIFO.
 * @len:       number of bytes of @event_msg to copy.
 *
 * For each subscribed client the FIFO is updated under the client's
 * spinlock. The message is queued only if the FIFO has room for all of
 * it, after which every waiter on the client's wait queue is woken;
 * otherwise the message is dropped for that client with a debug log.
 */
static void add_event_to_kfifo(struct kfd_dev *dev, unsigned long long smi_event,
			       char *event_msg, int len)
{
	struct kfd_smi_client *subscriber;

	rcu_read_lock();

	list_for_each_entry_rcu(subscriber, &dev->smi_clients, list) {
		/* Only clients that enabled this event type are served. */
		if (READ_ONCE(subscriber->events) & smi_event) {
			spin_lock(&subscriber->lock);
			if (kfifo_avail(&subscriber->fifo) < len) {
				/* Never queue a partial message. */
				pr_debug("smi_event(EventID: %llu): no space left\n",
					 smi_event);
			} else {
				kfifo_in(&subscriber->fifo, event_msg, len);
				wake_up_all(&subscriber->wait_queue);
			}
			spin_unlock(&subscriber->lock);
		}
	}

	rcu_read_unlock();
}
/*
 * Publish a thermal throttling SMI event to the clients of @dev.
 *
 * The message text is "<event id> <throttle_bitmask>:<interrupt count>\n"
 * with every field printed in hexadecimal; the interrupt count is read
 * from the SMU throttle_int_counter of the underlying amdgpu device.
 * Returns early, without formatting anything, when @dev has no SMI
 * clients.
 *
 * @dev:              KFD device the event is reported for.
 * @throttle_bitmask: throttler status bits placed in the message.
 */
void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
					     uint32_t throttle_bitmask)
{
	/*
	 * Buffer budget:
	 * 16 bytes event + 1 byte space + 8 bytes throttle_bitmask +
	 * 1 byte ':' + 16 bytes thermal_interrupt_counter + 1 byte '\n' +
	 * 1 byte '\0' = 44
	 */
	char msg[44];
	struct amdgpu_device *adev;
	int msg_len;

	if (list_empty(&dev->smi_clients))
		return;

	adev = (struct amdgpu_device *)dev->kgd;
	msg_len = snprintf(msg, sizeof(msg), "%x %x:%llx\n",
			   KFD_SMI_EVENT_THERMAL_THROTTLE, throttle_bitmask,
			   atomic64_read(&adev->smu.throttle_int_counter));

	add_event_to_kfifo(dev, KFD_SMI_EVENT_THERMAL_THROTTLE, msg, msg_len);
}
void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
{
struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd;
@ -156,7 +205,6 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
/* 16 bytes event + 1 byte space + 25 bytes msg + 1 byte \n = 43
*/
char fifo_in[43];
struct kfd_smi_client *client;
int len;
if (list_empty(&dev->smi_clients))
@ -171,22 +219,7 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
len = snprintf(fifo_in, 43, "%x %x:%s\n", KFD_SMI_EVENT_VMFAULT,
task_info.pid, task_info.task_name);
rcu_read_lock();
list_for_each_entry_rcu(client, &dev->smi_clients, list) {
if (!(READ_ONCE(client->events) & KFD_SMI_EVENT_VMFAULT))
continue;
spin_lock(&client->lock);
if (kfifo_avail(&client->fifo) >= len) {
kfifo_in(&client->fifo, fifo_in, len);
wake_up_all(&client->wait_queue);
}
else
pr_debug("smi_event(vmfault): no space left\n");
spin_unlock(&client->lock);
}
rcu_read_unlock();
add_event_to_kfifo(dev, KFD_SMI_EVENT_VMFAULT, fifo_in, len);
}
int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd)

View File

@ -25,5 +25,7 @@
int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd);
void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid);
void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
uint32_t throttle_bitmask);
#endif

View File

@ -640,6 +640,7 @@ static int smu_sw_init(void *handle)
mutex_init(&smu->message_lock);
INIT_WORK(&smu->throttling_logging_work, smu_throttling_logging_work_fn);
atomic64_set(&smu->throttle_int_counter, 0);
smu->watermarks_bitmap = 0;
smu->power_profile_mode = PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT;
smu->default_power_profile_mode = PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT;

View File

@ -2251,6 +2251,7 @@ static void arcturus_log_thermal_throttling_event(struct smu_context *smu)
dev_warn(adev->dev, "WARN: GPU thermal throttling temperature reached, expect performance decrease. %s.\n",
log_buf);
kgd2kfd_smi_event_throttle(smu->adev->kfd.dev, throttler_status);
}
static const struct pptable_funcs arcturus_ppt_funcs = {

View File

@ -446,6 +446,7 @@ struct smu_context
bool dc_controlled_by_gpio;
struct work_struct throttling_logging_work;
atomic64_t throttle_int_counter;
};
struct i2c_adapter;

View File

@ -1311,6 +1311,11 @@ static int smu_v11_0_irq_process(struct amdgpu_device *adev,
smu_v11_0_ack_ac_dc_interrupt(&adev->smu);
break;
case 0x7:
/*
* Increment the throttle interrupt counter
*/
atomic64_inc(&smu->throttle_int_counter);
if (!atomic_read(&adev->throttling_logging_enabled))
return 0;

View File

@ -450,7 +450,8 @@ struct kfd_ioctl_import_dmabuf_args {
* KFD SMI(System Management Interface) events
*/
/* Event type (defined by bitmask) */
#define KFD_SMI_EVENT_VMFAULT 0x0000000000000001
#define KFD_SMI_EVENT_VMFAULT 0x0000000000000001
#define KFD_SMI_EVENT_THERMAL_THROTTLE 0x0000000000000002
struct kfd_ioctl_smi_events_args {
__u32 gpuid; /* to KFD */