summaryrefslogtreecommitdiffstats
path: root/drivers/gpu/drm/amd/amdkfd
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd')
-rw-r--r--drivers/gpu/drm/amd/amdkfd/Kconfig1
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_device.c32
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c11
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_events.c18
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_priv.h10
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_topology.c19
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_topology.h4
7 files changed, 85 insertions, 10 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/Kconfig b/drivers/gpu/drm/amd/amdkfd/Kconfig
index c3613604a4f8..a1a35d4d594b 100644
--- a/drivers/gpu/drm/amd/amdkfd/Kconfig
+++ b/drivers/gpu/drm/amd/amdkfd/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Heterogenous system architecture configuration
#
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index cf9a49f49d3a..765b58a17dc7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -355,6 +355,7 @@ static const struct kfd_deviceid supported_devices[] = {
{ 0x67CF, &polaris10_device_info }, /* Polaris10 */
{ 0x67D0, &polaris10_vf_device_info }, /* Polaris10 vf*/
{ 0x67DF, &polaris10_device_info }, /* Polaris10 */
+ { 0x6FDF, &polaris10_device_info }, /* Polaris10 */
{ 0x67E0, &polaris11_device_info }, /* Polaris11 */
{ 0x67E1, &polaris11_device_info }, /* Polaris11 */
{ 0x67E3, &polaris11_device_info }, /* Polaris11 */
@@ -462,11 +463,14 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd,
kfd->pdev = pdev;
kfd->init_complete = false;
kfd->kfd2kgd = f2g;
+ atomic_set(&kfd->compute_profile, 0);
mutex_init(&kfd->doorbell_mutex);
memset(&kfd->doorbell_available_index, 0,
sizeof(kfd->doorbell_available_index));
+ atomic_set(&kfd->sram_ecc_flag, 0);
+
return kfd;
}
@@ -492,9 +496,9 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
{
unsigned int size;
- kfd->mec_fw_version = kfd->kfd2kgd->get_fw_version(kfd->kgd,
+ kfd->mec_fw_version = amdgpu_amdkfd_get_fw_version(kfd->kgd,
KGD_ENGINE_MEC1);
- kfd->sdma_fw_version = kfd->kfd2kgd->get_fw_version(kfd->kgd,
+ kfd->sdma_fw_version = amdgpu_amdkfd_get_fw_version(kfd->kgd,
KGD_ENGINE_SDMA1);
kfd->shared_resources = *gpu_resources;
@@ -662,6 +666,9 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
return ret;
count = atomic_dec_return(&kfd_locked);
WARN_ONCE(count != 0, "KFD reset ref. error");
+
+ atomic_set(&kfd->sram_ecc_flag, 0);
+
return 0;
}
@@ -1025,6 +1032,27 @@ int kfd_gtt_sa_free(struct kfd_dev *kfd, struct kfd_mem_obj *mem_obj)
return 0;
}
+void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd)
+{
+ if (kfd)
+ atomic_inc(&kfd->sram_ecc_flag);
+}
+
+void kfd_inc_compute_active(struct kfd_dev *kfd)
+{
+ if (atomic_inc_return(&kfd->compute_profile) == 1)
+ amdgpu_amdkfd_set_compute_idle(kfd->kgd, false);
+}
+
+void kfd_dec_compute_active(struct kfd_dev *kfd)
+{
+ int count = atomic_dec_return(&kfd->compute_profile);
+
+ if (count == 0)
+ amdgpu_amdkfd_set_compute_idle(kfd->kgd, true);
+ WARN_ONCE(count < 0, "Compute profile ref. count error");
+}
+
#if defined(CONFIG_DEBUG_FS)
/* This function will send a package to HIQ to hang the HWS
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index c6c9530e704e..ae381450601c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -811,8 +811,8 @@ static int register_process(struct device_queue_manager *dqm,
retval = dqm->asic_ops.update_qpd(dqm, qpd);
- if (dqm->processes_count++ == 0)
- amdgpu_amdkfd_set_compute_idle(dqm->dev->kgd, false);
+ dqm->processes_count++;
+ kfd_inc_compute_active(dqm->dev);
dqm_unlock(dqm);
@@ -835,9 +835,8 @@ static int unregister_process(struct device_queue_manager *dqm,
if (qpd == cur->qpd) {
list_del(&cur->list);
kfree(cur);
- if (--dqm->processes_count == 0)
- amdgpu_amdkfd_set_compute_idle(
- dqm->dev->kgd, true);
+ dqm->processes_count--;
+ kfd_dec_compute_active(dqm->dev);
goto out;
}
}
@@ -1539,6 +1538,7 @@ static int process_termination_nocpsch(struct device_queue_manager *dqm,
list_del(&cur->list);
kfree(cur);
dqm->processes_count--;
+ kfd_dec_compute_active(dqm->dev);
break;
}
}
@@ -1626,6 +1626,7 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
list_del(&cur->list);
kfree(cur);
dqm->processes_count--;
+ kfd_dec_compute_active(dqm->dev);
break;
}
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index e9f0e0a1b41c..6e1d41c5bf86 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -1011,25 +1011,41 @@ void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid,
void kfd_signal_reset_event(struct kfd_dev *dev)
{
struct kfd_hsa_hw_exception_data hw_exception_data;
+ struct kfd_hsa_memory_exception_data memory_exception_data;
struct kfd_process *p;
struct kfd_event *ev;
unsigned int temp;
uint32_t id, idx;
+ int reset_cause = atomic_read(&dev->sram_ecc_flag) ?
+ KFD_HW_EXCEPTION_ECC :
+ KFD_HW_EXCEPTION_GPU_HANG;
/* Whole gpu reset caused by GPU hang and memory is lost */
memset(&hw_exception_data, 0, sizeof(hw_exception_data));
hw_exception_data.gpu_id = dev->id;
hw_exception_data.memory_lost = 1;
+ hw_exception_data.reset_cause = reset_cause;
+
+ memset(&memory_exception_data, 0, sizeof(memory_exception_data));
+ memory_exception_data.ErrorType = KFD_MEM_ERR_SRAM_ECC;
+ memory_exception_data.gpu_id = dev->id;
+ memory_exception_data.failure.imprecise = true;
idx = srcu_read_lock(&kfd_processes_srcu);
hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
mutex_lock(&p->event_mutex);
id = KFD_FIRST_NONSIGNAL_EVENT_ID;
- idr_for_each_entry_continue(&p->event_idr, ev, id)
+ idr_for_each_entry_continue(&p->event_idr, ev, id) {
if (ev->type == KFD_EVENT_TYPE_HW_EXCEPTION) {
ev->hw_exception_data = hw_exception_data;
set_event(ev);
}
+ if (ev->type == KFD_EVENT_TYPE_MEMORY &&
+ reset_cause == KFD_HW_EXCEPTION_ECC) {
+ ev->memory_exception_data = memory_exception_data;
+ set_event(ev);
+ }
+ }
mutex_unlock(&p->event_mutex);
}
srcu_read_unlock(&kfd_processes_srcu, idx);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 0eeee3c6d6dc..487d5da337c1 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -276,6 +276,12 @@ struct kfd_dev {
uint64_t hive_id;
bool pci_atomic_requested;
+
+ /* SRAM ECC flag */
+ atomic_t sram_ecc_flag;
+
+ /* Compute Profile ref. count */
+ atomic_t compute_profile;
};
enum kfd_mempool {
@@ -975,6 +981,10 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p);
bool kfd_is_locked(void);
+/* Compute profile */
+void kfd_inc_compute_active(struct kfd_dev *dev);
+void kfd_dec_compute_active(struct kfd_dev *dev);
+
/* Debugfs */
#if defined(CONFIG_DEBUG_FS)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index 09da91644f9f..769dbc7be8cb 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -37,6 +37,7 @@
#include "kfd_device_queue_manager.h"
#include "kfd_iommu.h"
#include "amdgpu_amdkfd.h"
+#include "amdgpu_ras.h"
/* topology_device_list - Master list of all topology devices */
static struct list_head topology_device_list;
@@ -1197,6 +1198,7 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
void *crat_image = NULL;
size_t image_size = 0;
int proximity_domain;
+ struct amdgpu_ras *ctx;
INIT_LIST_HEAD(&temp_topology_device_list);
@@ -1270,8 +1272,7 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
dev->node_props.vendor_id = gpu->pdev->vendor;
dev->node_props.device_id = gpu->pdev->device;
- dev->node_props.location_id = PCI_DEVID(gpu->pdev->bus->number,
- gpu->pdev->devfn);
+ dev->node_props.location_id = pci_dev_id(gpu->pdev);
dev->node_props.max_engine_clk_fcompute =
amdgpu_amdkfd_get_max_engine_clock_in_mhz(dev->gpu->kgd);
dev->node_props.max_engine_clk_ccompute =
@@ -1328,6 +1329,20 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
dev->node_props.capability |= HSA_CAP_ATS_PRESENT;
}
+ ctx = amdgpu_ras_get_context((struct amdgpu_device *)(dev->gpu->kgd));
+ if (ctx) {
+ /* kfd only concerns sram ecc on GFX/SDMA and HBM ecc on UMC */
+ dev->node_props.capability |=
+ (((ctx->features & BIT(AMDGPU_RAS_BLOCK__SDMA)) != 0) ||
+ ((ctx->features & BIT(AMDGPU_RAS_BLOCK__GFX)) != 0)) ?
+ HSA_CAP_SRAM_EDCSUPPORTED : 0;
+ dev->node_props.capability |= ((ctx->features & BIT(AMDGPU_RAS_BLOCK__UMC)) != 0) ?
+ HSA_CAP_MEM_EDCSUPPORTED : 0;
+
+ dev->node_props.capability |= (ctx->features != 0) ?
+ HSA_CAP_RASEVENTNOTIFY : 0;
+ }
+
kfd_debug_print_topology();
if (!res)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
index 92a19be07344..84710cfd23c2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
@@ -48,6 +48,10 @@
#define HSA_CAP_DOORBELL_TYPE_2_0 0x2
#define HSA_CAP_AQL_QUEUE_DOUBLE_MAP 0x00004000
+#define HSA_CAP_SRAM_EDCSUPPORTED 0x00080000
+#define HSA_CAP_MEM_EDCSUPPORTED 0x00100000
+#define HSA_CAP_RASEVENTNOTIFY 0x00200000
+
struct kfd_node_properties {
uint64_t hive_id;
uint32_t cpu_cores_count;