summaryrefslogtreecommitdiffstats
path: root/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
diff options
context:
space:
mode:
authorShaoyun Liu2018-07-12 04:32:58 +0200
committerOded Gabbay2018-07-12 04:32:58 +0200
commit73ea648d921ed5c58895c8592321456fe12b697f (patch)
tree5d02a5722b37ad2da07d8bffb14d81e99bc3f19b /drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
parentdrm/amdgpu: Enable the gpu reset from KFD (diff)
downloadkernel-qcow2-linux-73ea648d921ed5c58895c8592321456fe12b697f.tar.gz
kernel-qcow2-linux-73ea648d921ed5c58895c8592321456fe12b697f.tar.xz
kernel-qcow2-linux-73ea648d921ed5c58895c8592321456fe12b697f.zip
drm/amdkfd: Implement hang detection in KFD and call amdgpu
The reset will be performed in a new hw_exception work thread to handle HWS hang without blocking the thread that detected the hang. Signed-off-by: Shaoyun Liu <Shaoyun.Liu@amd.com> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com> Acked-by: Christian König <christian.koenig@amd.com> Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c')
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c21
1 files changed, 20 insertions, 1 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 44fc2038770e..6b59eab39fbe 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -61,6 +61,8 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
static void deallocate_sdma_queue(struct device_queue_manager *dqm,
unsigned int sdma_queue_id);
+static void kfd_process_hw_exception(struct work_struct *work);
+
static inline
enum KFD_MQD_TYPE get_mqd_type_from_queue_type(enum kfd_queue_type type)
{
@@ -1010,6 +1012,8 @@ static int initialize_cpsch(struct device_queue_manager *dqm)
dqm->active_runlist = false;
dqm->sdma_bitmap = (1 << CIK_SDMA_QUEUES) - 1;
+ INIT_WORK(&dqm->hw_exception_work, kfd_process_hw_exception);
+
return 0;
}
@@ -1042,6 +1046,8 @@ static int start_cpsch(struct device_queue_manager *dqm)
init_interrupts(dqm);
dqm_lock(dqm);
+ /* clear hang status when driver try to start the hw scheduler */
+ dqm->is_hws_hang = false;
execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
dqm_unlock(dqm);
@@ -1255,6 +1261,8 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
{
int retval = 0;
+ if (dqm->is_hws_hang)
+ return -EIO;
if (!dqm->active_runlist)
return retval;
@@ -1293,9 +1301,13 @@ static int execute_queues_cpsch(struct device_queue_manager *dqm,
{
int retval;
+ if (dqm->is_hws_hang)
+ return -EIO;
retval = unmap_queues_cpsch(dqm, filter, filter_param);
if (retval) {
pr_err("The cp might be in an unrecoverable state due to an unsuccessful queues preemption\n");
+ dqm->is_hws_hang = true;
+ schedule_work(&dqm->hw_exception_work);
return retval;
}
@@ -1543,7 +1555,7 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
}
retval = execute_queues_cpsch(dqm, filter, 0);
- if (retval || qpd->reset_wavefronts) {
+ if ((!dqm->is_hws_hang) && (retval || qpd->reset_wavefronts)) {
pr_warn("Resetting wave fronts (cpsch) on dev %p\n", dqm->dev);
dbgdev_wave_reset_wavefronts(dqm->dev, qpd->pqm->process);
qpd->reset_wavefronts = false;
@@ -1701,6 +1713,13 @@ int kfd_process_vm_fault(struct device_queue_manager *dqm,
return ret;
}
+static void kfd_process_hw_exception(struct work_struct *work)
+{
+ struct device_queue_manager *dqm = container_of(work,
+ struct device_queue_manager, hw_exception_work);
+ dqm->dev->kfd2kgd->gpu_recover(dqm->dev->kgd);
+}
+
#if defined(CONFIG_DEBUG_FS)
static void seq_reg_dump(struct seq_file *m,