summaryrefslogtreecommitdiffstats
path: root/drivers/gpu/drm/amd/amdkfd
diff options
context:
space:
mode:
authorEric Huang2019-01-11 20:38:51 +0100
committerAlex Deucher2019-03-19 21:36:51 +0100
commit9b54d2017687df9fa827faf9e4022973b87fc0ff (patch)
treebbe1df5476b9ba2c9292279a0fa23c75262e2d04 /drivers/gpu/drm/amd/amdkfd
parentdrm/amdkfd: add RAS capabilities in topology for Vega20 (v2) (diff)
downloadkernel-qcow2-linux-9b54d2017687df9fa827faf9e4022973b87fc0ff.tar.gz
kernel-qcow2-linux-9b54d2017687df9fa827faf9e4022973b87fc0ff.tar.xz
kernel-qcow2-linux-9b54d2017687df9fa827faf9e4022973b87fc0ff.zip
drm/amdkfd: add RAS ECC event support (v3)
RAS ECC event will combine with GPU reset event, due to ECC interrupts are caused by uncorrectable error that triggers GPU reset. v2: Fix misleading-indentation warning v3: fix build with CONFIG_HSA_AMD disabled Signed-off-by: Eric Huang <JinhuiEric.Huang@amd.com> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com> Reviewed-by: Alex Deucher <alexander.deucher@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd')
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_device.c11
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_events.c18
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_priv.h3
3 files changed, 31 insertions, 1 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 8be9677c0c07..b3cdbf79f47b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -466,6 +466,8 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd,
memset(&kfd->doorbell_available_index, 0,
sizeof(kfd->doorbell_available_index));
+ atomic_set(&kfd->sram_ecc_flag, 0);
+
return kfd;
}
@@ -661,6 +663,9 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
return ret;
count = atomic_dec_return(&kfd_locked);
WARN_ONCE(count != 0, "KFD reset ref. error");
+
+ atomic_set(&kfd->sram_ecc_flag, 0);
+
return 0;
}
@@ -1024,6 +1029,12 @@ int kfd_gtt_sa_free(struct kfd_dev *kfd, struct kfd_mem_obj *mem_obj)
return 0;
}
+void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd)
+{
+ if (kfd)
+ atomic_inc(&kfd->sram_ecc_flag);
+}
+
#if defined(CONFIG_DEBUG_FS)
/* This function will send a package to HIQ to hang the HWS
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index e9f0e0a1b41c..6e1d41c5bf86 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -1011,25 +1011,41 @@ void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid,
void kfd_signal_reset_event(struct kfd_dev *dev)
{
struct kfd_hsa_hw_exception_data hw_exception_data;
+ struct kfd_hsa_memory_exception_data memory_exception_data;
struct kfd_process *p;
struct kfd_event *ev;
unsigned int temp;
uint32_t id, idx;
+ int reset_cause = atomic_read(&dev->sram_ecc_flag) ?
+ KFD_HW_EXCEPTION_ECC :
+ KFD_HW_EXCEPTION_GPU_HANG;
/* Whole gpu reset caused by GPU hang and memory is lost */
memset(&hw_exception_data, 0, sizeof(hw_exception_data));
hw_exception_data.gpu_id = dev->id;
hw_exception_data.memory_lost = 1;
+ hw_exception_data.reset_cause = reset_cause;
+
+ memset(&memory_exception_data, 0, sizeof(memory_exception_data));
+ memory_exception_data.ErrorType = KFD_MEM_ERR_SRAM_ECC;
+ memory_exception_data.gpu_id = dev->id;
+ memory_exception_data.failure.imprecise = true;
idx = srcu_read_lock(&kfd_processes_srcu);
hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
mutex_lock(&p->event_mutex);
id = KFD_FIRST_NONSIGNAL_EVENT_ID;
- idr_for_each_entry_continue(&p->event_idr, ev, id)
+ idr_for_each_entry_continue(&p->event_idr, ev, id) {
if (ev->type == KFD_EVENT_TYPE_HW_EXCEPTION) {
ev->hw_exception_data = hw_exception_data;
set_event(ev);
}
+ if (ev->type == KFD_EVENT_TYPE_MEMORY &&
+ reset_cause == KFD_HW_EXCEPTION_ECC) {
+ ev->memory_exception_data = memory_exception_data;
+ set_event(ev);
+ }
+ }
mutex_unlock(&p->event_mutex);
}
srcu_read_unlock(&kfd_processes_srcu, idx);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 0eeee3c6d6dc..9e0230965675 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -276,6 +276,9 @@ struct kfd_dev {
uint64_t hive_id;
bool pci_atomic_requested;
+
+ /* SRAM ECC flag */
+ atomic_t sram_ecc_flag;
};
enum kfd_mempool {