Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd')
22 files changed, 436 insertions, 220 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/Kconfig b/drivers/gpu/drm/amd/amdkfd/Kconfig
index 3858820a0055..fbf0ee5201c3 100644
--- a/drivers/gpu/drm/amd/amdkfd/Kconfig
+++ b/drivers/gpu/drm/amd/amdkfd/Kconfig
@@ -3,7 +3,7 @@
 #
 
 config HSA_AMD
-	tristate "HSA kernel driver for AMD GPU devices"
+	bool "HSA kernel driver for AMD GPU devices"
 	depends on DRM_AMDGPU && X86_64
 	imply AMD_IOMMU_V2
 	select MMU_NOTIFIER
diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile
index ffd096fffc1c..69ec96998bb9 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -23,26 +23,41 @@
 # Makefile for Heterogenous System Architecture support for AMD GPU devices
 #
 
-ccflags-y := -Idrivers/gpu/drm/amd/include/ \
-		-Idrivers/gpu/drm/amd/include/asic_reg
-
-amdkfd-y	:= kfd_module.o kfd_device.o kfd_chardev.o kfd_topology.o \
-		kfd_pasid.o kfd_doorbell.o kfd_flat_memory.o \
-		kfd_process.o kfd_queue.o kfd_mqd_manager.o \
-		kfd_mqd_manager_cik.o kfd_mqd_manager_vi.o \
-		kfd_mqd_manager_v9.o \
-		kfd_kernel_queue.o kfd_kernel_queue_cik.o \
-		kfd_kernel_queue_vi.o kfd_kernel_queue_v9.o \
-		kfd_packet_manager.o kfd_process_queue_manager.o \
-		kfd_device_queue_manager.o kfd_device_queue_manager_cik.o \
-		kfd_device_queue_manager_vi.o kfd_device_queue_manager_v9.o \
-		kfd_interrupt.o kfd_events.o cik_event_interrupt.o \
-		kfd_int_process_v9.o kfd_dbgdev.o kfd_dbgmgr.o kfd_crat.o
+AMDKFD_FILES	:= $(AMDKFD_PATH)/kfd_module.o \
+		$(AMDKFD_PATH)/kfd_device.o \
+		$(AMDKFD_PATH)/kfd_chardev.o \
+		$(AMDKFD_PATH)/kfd_topology.o \
+		$(AMDKFD_PATH)/kfd_pasid.o \
+		$(AMDKFD_PATH)/kfd_doorbell.o \
+		$(AMDKFD_PATH)/kfd_flat_memory.o \
+		$(AMDKFD_PATH)/kfd_process.o \
+		$(AMDKFD_PATH)/kfd_queue.o \
+		$(AMDKFD_PATH)/kfd_mqd_manager.o \
+		$(AMDKFD_PATH)/kfd_mqd_manager_cik.o \
+		$(AMDKFD_PATH)/kfd_mqd_manager_vi.o \
+		$(AMDKFD_PATH)/kfd_mqd_manager_v9.o \
+		$(AMDKFD_PATH)/kfd_kernel_queue.o \
+		$(AMDKFD_PATH)/kfd_kernel_queue_cik.o \
+		$(AMDKFD_PATH)/kfd_kernel_queue_vi.o \
+		$(AMDKFD_PATH)/kfd_kernel_queue_v9.o \
+		$(AMDKFD_PATH)/kfd_packet_manager.o \
+		$(AMDKFD_PATH)/kfd_process_queue_manager.o \
+		$(AMDKFD_PATH)/kfd_device_queue_manager.o \
+		$(AMDKFD_PATH)/kfd_device_queue_manager_cik.o \
+		$(AMDKFD_PATH)/kfd_device_queue_manager_vi.o \
+		$(AMDKFD_PATH)/kfd_device_queue_manager_v9.o \
+		$(AMDKFD_PATH)/kfd_interrupt.o \
+		$(AMDKFD_PATH)/kfd_events.o \
+		$(AMDKFD_PATH)/cik_event_interrupt.o \
+		$(AMDKFD_PATH)/kfd_int_process_v9.o \
+		$(AMDKFD_PATH)/kfd_dbgdev.o \
+		$(AMDKFD_PATH)/kfd_dbgmgr.o \
+		$(AMDKFD_PATH)/kfd_crat.o
 
 ifneq ($(CONFIG_AMD_IOMMU_V2),)
-amdkfd-y += kfd_iommu.o
+AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o
 endif
 
-amdkfd-$(CONFIG_DEBUG_FS) += kfd_debugfs.o
-
-obj-$(CONFIG_HSA_AMD)	+= amdkfd.o
+ifneq ($(CONFIG_DEBUG_FS),)
+AMDKFD_FILES += $(AMDKFD_PATH)/kfd_debugfs.o
+endif
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 297b36c26a05..14d5b5fa822d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -447,6 +447,24 @@ static int kfd_ioctl_set_cu_mask(struct file *filp, struct kfd_process *p,
 	return retval;
 }
 
+static int kfd_ioctl_get_queue_wave_state(struct file *filep,
+					struct kfd_process *p, void *data)
+{
+	struct kfd_ioctl_get_queue_wave_state_args *args = data;
+	int r;
+
+	mutex_lock(&p->mutex);
+
+	r = pqm_get_wave_state(&p->pqm, args->queue_id,
+			(void __user *)args->ctl_stack_address,
+			&args->ctl_stack_used_size,
+			&args->save_area_used_size);
+
+	mutex_unlock(&p->mutex);
+
+	return r;
+}
+
 static int kfd_ioctl_set_memory_policy(struct file *filep,
 					struct kfd_process *p, void *data)
 {
@@ -1210,7 +1228,7 @@ err_unlock:
 	return ret;
 }
 
-static bool kfd_dev_is_large_bar(struct kfd_dev *dev)
+bool kfd_dev_is_large_bar(struct kfd_dev *dev)
 {
 	struct kfd_local_mem_info mem_info;
 
@@ -1615,6 +1633,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
 	AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_CU_MASK,
 			kfd_ioctl_set_cu_mask, 0),
 
+	AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_QUEUE_WAVE_STATE,
+			kfd_ioctl_get_queue_wave_state, 0)
+
 };
 
 #define AMDKFD_CORE_IOCTL_COUNT	ARRAY_SIZE(amdkfd_ioctls)
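For reference, a minimal user-space caller of the new ioctl could look like the sketch below. It assumes the uapi struct in linux/kfd_ioctl.h mirrors the fields referenced in the handler above; the queue id and buffer size are placeholders, and the target must be a CWSR-capable compute queue that is not currently active (see the get_wave_state checks later in this diff).

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kfd_ioctl.h>

int main(void)
{
	struct kfd_ioctl_get_queue_wave_state_args args = {0};
	void *ctl_stack = malloc(64 * 1024);	/* placeholder buffer size */
	int fd = open("/dev/kfd", O_RDWR);

	if (fd < 0 || !ctl_stack)
		return 1;

	args.queue_id = 0;			/* placeholder queue id */
	args.ctl_stack_address = (uintptr_t)ctl_stack;

	if (ioctl(fd, AMDKFD_IOC_GET_QUEUE_WAVE_STATE, &args) == 0)
		printf("ctl stack used: %u bytes, save area used: %u bytes\n",
		       args.ctl_stack_used_size, args.save_area_used_size);

	free(ctl_stack);
	return 0;
}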
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
index ee4996029a86..56412b0e7e1c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
@@ -346,15 +346,15 @@ static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink,
 			struct list_head *device_list)
 {
 	struct kfd_iolink_properties *props = NULL, *props2;
-	struct kfd_topology_device *dev, *cpu_dev;
+	struct kfd_topology_device *dev, *to_dev;
 	uint32_t id_from;
 	uint32_t id_to;
 
 	id_from = iolink->proximity_domain_from;
 	id_to = iolink->proximity_domain_to;
 
-	pr_debug("Found IO link entry in CRAT table with id_from=%d\n",
-			id_from);
+	pr_debug("Found IO link entry in CRAT table with id_from=%d, id_to %d\n",
+			id_from, id_to);
 	list_for_each_entry(dev, device_list, list) {
 		if (id_from == dev->proximity_domain) {
 			props = kfd_alloc_struct(props);
@@ -369,6 +369,8 @@ static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink,
 
 		if (props->iolink_type == CRAT_IOLINK_TYPE_PCIEXPRESS)
 			props->weight = 20;
+		else if (props->iolink_type == CRAT_IOLINK_TYPE_XGMI)
+			props->weight = 15;
 		else
 			props->weight = node_distance(id_from, id_to);
 
@@ -389,20 +391,23 @@ static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink,
 	/* CPU topology is created before GPUs are detected, so CPU->GPU
 	 * links are not built at that time. If a PCIe type is discovered, it
 	 * means a GPU is detected and we are adding GPU->CPU to the topology.
-	 * At this time, also add the corresponded CPU->GPU link.
+	 * At this time, also add the corresponded CPU->GPU link if GPU
+	 * is large bar.
+	 * For xGMI, we only added the link with one direction in the crat
+	 * table, add corresponded reversed direction link now.
 	 */
-	if (props && props->iolink_type == CRAT_IOLINK_TYPE_PCIEXPRESS) {
-		cpu_dev = kfd_topology_device_by_proximity_domain(id_to);
-		if (!cpu_dev)
+	if (props && (iolink->flags & CRAT_IOLINK_FLAGS_BI_DIRECTIONAL)) {
+		to_dev = kfd_topology_device_by_proximity_domain(id_to);
+		if (!to_dev)
 			return -ENODEV;
 		/* same everything but the other direction */
 		props2 = kmemdup(props, sizeof(*props2), GFP_KERNEL);
 		props2->node_from = id_to;
 		props2->node_to = id_from;
 		props2->kobj = NULL;
-		cpu_dev->io_link_count++;
-		cpu_dev->node_props.io_links_count++;
-		list_add_tail(&props2->list, &cpu_dev->io_link_props);
+		to_dev->io_link_count++;
+		to_dev->node_props.io_links_count++;
+		list_add_tail(&props2->list, &to_dev->io_link_props);
 	}
 
 	return 0;
@@ -642,6 +647,7 @@ static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev,
 		num_of_cache_types = ARRAY_SIZE(polaris11_cache_info);
 		break;
 	case CHIP_VEGA10:
+	case CHIP_VEGA20:
 		pcache_info = vega10_cache_info;
 		num_of_cache_types = ARRAY_SIZE(vega10_cache_info);
 		break;
@@ -1037,7 +1043,7 @@ static int kfd_fill_gpu_memory_affinity(int *avail_size,
 *
 *	Return 0 if successful else return -ve value
 */
-static int kfd_fill_gpu_direct_io_link(int *avail_size,
+static int kfd_fill_gpu_direct_io_link_to_cpu(int *avail_size,
 			struct kfd_dev *kdev,
 			struct crat_subtype_iolink *sub_type_hdr,
 			uint32_t proximity_domain)
@@ -1052,6 +1058,8 @@ static int kfd_fill_gpu_direct_io_link(int *avail_size,
 	sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY;
 	sub_type_hdr->length = sizeof(struct crat_subtype_iolink);
 	sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED;
+	if (kfd_dev_is_large_bar(kdev))
+		sub_type_hdr->flags |= CRAT_IOLINK_FLAGS_BI_DIRECTIONAL;
 
 	/* Fill in IOLINK subtype.
 	 * TODO: Fill-in other fields of iolink subtype
@@ -1069,6 +1077,29 @@ static int kfd_fill_gpu_direct_io_link(int *avail_size,
 	return 0;
 }
 
+static int kfd_fill_gpu_xgmi_link_to_gpu(int *avail_size,
+			struct kfd_dev *kdev,
+			struct crat_subtype_iolink *sub_type_hdr,
+			uint32_t proximity_domain_from,
+			uint32_t proximity_domain_to)
+{
+	*avail_size -= sizeof(struct crat_subtype_iolink);
+	if (*avail_size < 0)
+		return -ENOMEM;
+
+	memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_iolink));
+
+	sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY;
+	sub_type_hdr->length = sizeof(struct crat_subtype_iolink);
+	sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED |
+			CRAT_IOLINK_FLAGS_BI_DIRECTIONAL;
+
+	sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_XGMI;
+	sub_type_hdr->proximity_domain_from = proximity_domain_from;
+	sub_type_hdr->proximity_domain_to = proximity_domain_to;
+	return 0;
+}
+
 /* kfd_create_vcrat_image_gpu - Create Virtual CRAT for CPU
 *
 *	@pcrat_image: Fill in VCRAT for GPU
@@ -1081,14 +1112,16 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
 {
 	struct crat_header *crat_table = (struct crat_header *)pcrat_image;
 	struct crat_subtype_generic *sub_type_hdr;
+	struct kfd_local_mem_info local_mem_info;
+	struct kfd_topology_device *peer_dev;
 	struct crat_subtype_computeunit *cu;
 	struct kfd_cu_info cu_info;
 	int avail_size = *size;
 	uint32_t total_num_of_cu;
 	int num_of_cache_entries = 0;
 	int cache_mem_filled = 0;
+	uint32_t nid = 0;
 	int ret = 0;
-	struct kfd_local_mem_info local_mem_info;
 
 	if (!pcrat_image || avail_size < VCRAT_SIZE_FOR_GPU)
 		return -EINVAL;
@@ -1212,7 +1245,7 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
 	 */
 	sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
 		cache_mem_filled);
-	ret = kfd_fill_gpu_direct_io_link(&avail_size, kdev,
+	ret = kfd_fill_gpu_direct_io_link_to_cpu(&avail_size, kdev,
 		(struct crat_subtype_iolink *)sub_type_hdr,
 		proximity_domain);
 	if (ret < 0)
@@ -1221,6 +1254,35 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
 	crat_table->length += sub_type_hdr->length;
 	crat_table->total_entries++;
 
+	/* Fill in Subtype: IO_LINKS
+	 * Direct links from GPU to other GPUs through xGMI.
+	 * We will loop GPUs that already be processed (with lower value
+	 * of proximity_domain), add the link for the GPUs with same
+	 * hive id (from this GPU to other GPU) . The reversed iolink
+	 * (from other GPU to this GPU) will be added
+	 * in kfd_parse_subtype_iolink.
+	 */
+	if (kdev->hive_id) {
+		for (nid = 0; nid < proximity_domain; ++nid) {
+			peer_dev = kfd_topology_device_by_proximity_domain(nid);
+			if (!peer_dev->gpu)
+				continue;
+			if (peer_dev->gpu->hive_id != kdev->hive_id)
+				continue;
+			sub_type_hdr = (typeof(sub_type_hdr))(
+				(char *)sub_type_hdr +
+				sizeof(struct crat_subtype_iolink));
+			ret = kfd_fill_gpu_xgmi_link_to_gpu(
+				&avail_size, kdev,
+				(struct crat_subtype_iolink *)sub_type_hdr,
+				proximity_domain, nid);
+			if (ret < 0)
+				return ret;
+			crat_table->length += sub_type_hdr->length;
+			crat_table->total_entries++;
+		}
+	}
+
 	*size = crat_table->length;
 	pr_info("Virtual CRAT table created for GPU\n");
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
index b5cd182b9edd..7c3f192fe25f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
@@ -232,7 +232,8 @@ struct crat_subtype_ccompute {
 #define CRAT_IOLINK_FLAGS_NO_ATOMICS_32_BIT	(1 << 2)
 #define CRAT_IOLINK_FLAGS_NO_ATOMICS_64_BIT	(1 << 3)
 #define CRAT_IOLINK_FLAGS_NO_PEER_TO_PEER_DMA	(1 << 4)
-#define CRAT_IOLINK_FLAGS_RESERVED_MASK		0xffffffe0
+#define CRAT_IOLINK_FLAGS_BI_DIRECTIONAL	(1 << 31)
+#define CRAT_IOLINK_FLAGS_RESERVED_MASK		0x7fffffe0
 
 /*
  * IO interface types
@@ -248,7 +249,12 @@ struct crat_subtype_ccompute {
 #define CRAT_IOLINK_TYPE_RAPID_IO	8
 #define CRAT_IOLINK_TYPE_INFINIBAND	9
 #define CRAT_IOLINK_TYPE_RESERVED3	10
-#define CRAT_IOLINK_TYPE_OTHER		11
+#define CRAT_IOLINK_TYPE_XGMI		11
+#define CRAT_IOLINK_TYPE_XGOP		12
+#define CRAT_IOLINK_TYPE_GZ		13
+#define CRAT_IOLINK_TYPE_ETHERNET_RDMA	14
+#define CRAT_IOLINK_TYPE_RDMA_OTHER	15
+#define CRAT_IOLINK_TYPE_OTHER		16
 #define CRAT_IOLINK_TYPE_MAX		255
 
 #define CRAT_IOLINK_RESERVED_LENGTH	24
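Since the new BI_DIRECTIONAL flag takes the top bit out of the previously reserved mask, a quick host-side check (not part of the kernel build; macros copied from the header above, with 1u used instead of 1 to keep the 31-bit shift well defined in a standalone program) can confirm the bit assignments stay disjoint:

#include <assert.h>

#define CRAT_IOLINK_FLAGS_NO_ATOMICS_32_BIT	(1u << 2)
#define CRAT_IOLINK_FLAGS_NO_ATOMICS_64_BIT	(1u << 3)
#define CRAT_IOLINK_FLAGS_NO_PEER_TO_PEER_DMA	(1u << 4)
#define CRAT_IOLINK_FLAGS_BI_DIRECTIONAL	(1u << 31)
#define CRAT_IOLINK_FLAGS_RESERVED_MASK		0x7fffffe0u

int main(void)
{
	/* bit 31 must no longer be covered by the reserved mask ... */
	assert(!(CRAT_IOLINK_FLAGS_BI_DIRECTIONAL &
		 CRAT_IOLINK_FLAGS_RESERVED_MASK));
	/* ... and must not collide with any existing flag bit */
	assert(!(CRAT_IOLINK_FLAGS_BI_DIRECTIONAL &
		 (CRAT_IOLINK_FLAGS_NO_ATOMICS_32_BIT |
		  CRAT_IOLINK_FLAGS_NO_ATOMICS_64_BIT |
		  CRAT_IOLINK_FLAGS_NO_PEER_TO_PEER_DMA)));
	return 0;
}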
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 29ac74f40dce..a9f18ea7e354 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -53,6 +53,7 @@ static const struct kfd_device_info kaveri_device_info = {
 	.needs_iommu_device = true,
 	.needs_pci_atomics = false,
 	.num_sdma_engines = 2,
+	.num_sdma_queues_per_engine = 2,
 };
 
 static const struct kfd_device_info carrizo_device_info = {
@@ -69,6 +70,7 @@ static const struct kfd_device_info carrizo_device_info = {
 	.needs_iommu_device = true,
 	.needs_pci_atomics = false,
 	.num_sdma_engines = 2,
+	.num_sdma_queues_per_engine = 2,
 };
 
 static const struct kfd_device_info raven_device_info = {
@@ -84,6 +86,7 @@ static const struct kfd_device_info raven_device_info = {
 	.needs_iommu_device = true,
 	.needs_pci_atomics = true,
 	.num_sdma_engines = 1,
+	.num_sdma_queues_per_engine = 2,
 };
 #endif
 
@@ -101,6 +104,7 @@ static const struct kfd_device_info hawaii_device_info = {
 	.needs_iommu_device = false,
 	.needs_pci_atomics = false,
 	.num_sdma_engines = 2,
+	.num_sdma_queues_per_engine = 2,
 };
 
 static const struct kfd_device_info tonga_device_info = {
@@ -116,21 +120,7 @@ static const struct kfd_device_info tonga_device_info = {
 	.needs_iommu_device = false,
 	.needs_pci_atomics = true,
 	.num_sdma_engines = 2,
-};
-
-static const struct kfd_device_info tonga_vf_device_info = {
-	.asic_family = CHIP_TONGA,
-	.max_pasid_bits = 16,
-	.max_no_of_hqd = 24,
-	.doorbell_size = 4,
-	.ih_ring_entry_size = 4 * sizeof(uint32_t),
-	.event_interrupt_class = &event_interrupt_class_cik,
-	.num_of_watch_points = 4,
-	.mqd_size_aligned = MQD_SIZE_ALIGNED,
-	.supports_cwsr = false,
-	.needs_iommu_device = false,
-	.needs_pci_atomics = false,
-	.num_sdma_engines = 2,
+	.num_sdma_queues_per_engine = 2,
 };
 
 static const struct kfd_device_info fiji_device_info = {
@@ -146,6 +136,7 @@ static const struct kfd_device_info fiji_device_info = {
 	.needs_iommu_device = false,
 	.needs_pci_atomics = true,
 	.num_sdma_engines = 2,
+	.num_sdma_queues_per_engine = 2,
 };
 
 static const struct kfd_device_info fiji_vf_device_info = {
@@ -161,6 +152,7 @@ static const struct kfd_device_info fiji_vf_device_info = {
 	.needs_iommu_device = false,
 	.needs_pci_atomics = false,
 	.num_sdma_engines = 2,
+	.num_sdma_queues_per_engine = 2,
 };
 
 
@@ -177,6 +169,7 @@ static const struct kfd_device_info polaris10_device_info = {
 	.needs_iommu_device = false,
 	.needs_pci_atomics = true,
 	.num_sdma_engines = 2,
+	.num_sdma_queues_per_engine = 2,
 };
 
 static const struct kfd_device_info polaris10_vf_device_info = {
@@ -192,6 +185,7 @@ static const struct kfd_device_info polaris10_vf_device_info = {
 	.needs_iommu_device = false,
 	.needs_pci_atomics = false,
 	.num_sdma_engines = 2,
+	.num_sdma_queues_per_engine = 2,
 };
 
 static const struct kfd_device_info polaris11_device_info = {
@@ -207,6 +201,7 @@ static const struct kfd_device_info polaris11_device_info = {
 	.needs_iommu_device = false,
 	.needs_pci_atomics = true,
 	.num_sdma_engines = 2,
+	.num_sdma_queues_per_engine = 2,
 };
 
 static const struct kfd_device_info vega10_device_info = {
@@ -222,6 +217,7 @@ static const struct kfd_device_info vega10_device_info = {
 	.needs_iommu_device = false,
 	.needs_pci_atomics = false,
 	.num_sdma_engines = 2,
+	.num_sdma_queues_per_engine = 2,
 };
 
 static const struct kfd_device_info vega10_vf_device_info = {
@@ -237,8 +233,24 @@ static const struct kfd_device_info vega10_vf_device_info = {
 	.needs_iommu_device = false,
 	.needs_pci_atomics = false,
 	.num_sdma_engines = 2,
+	.num_sdma_queues_per_engine = 2,
 };
 
+static const struct kfd_device_info vega20_device_info = {
+	.asic_family = CHIP_VEGA20,
+	.max_pasid_bits = 16,
+	.max_no_of_hqd	= 24,
+	.doorbell_size	= 8,
+	.ih_ring_entry_size = 8 * sizeof(uint32_t),
+	.event_interrupt_class = &event_interrupt_class_v9,
+	.num_of_watch_points = 4,
+	.mqd_size_aligned = MQD_SIZE_ALIGNED,
+	.supports_cwsr = true,
+	.needs_iommu_device = false,
+	.needs_pci_atomics = false,
+	.num_sdma_engines = 2,
+	.num_sdma_queues_per_engine = 8,
+};
 
 struct kfd_deviceid {
 	unsigned short did;
@@ -293,7 +305,6 @@ static const struct kfd_deviceid supported_devices[] = {
 	{ 0x6928, &tonga_device_info },		/* Tonga */
 	{ 0x6929, &tonga_device_info },		/* Tonga */
 	{ 0x692B, &tonga_device_info },		/* Tonga */
-	{ 0x692F, &tonga_vf_device_info },	/* Tonga vf */
 	{ 0x6938, &tonga_device_info },		/* Tonga */
 	{ 0x6939, &tonga_device_info },		/* Tonga */
 	{ 0x7300, &fiji_device_info },		/* Fiji */
@@ -328,6 +339,12 @@ static const struct kfd_deviceid supported_devices[] = {
 	{ 0x6868, &vega10_device_info },	/* Vega10 */
 	{ 0x686C, &vega10_vf_device_info },	/* Vega10 vf*/
 	{ 0x687F, &vega10_device_info },	/* Vega10 */
+	{ 0x66a0, &vega20_device_info },	/* Vega20 */
+	{ 0x66a1, &vega20_device_info },	/* Vega20 */
+	{ 0x66a2, &vega20_device_info },	/* Vega20 */
+	{ 0x66a3, &vega20_device_info },	/* Vega20 */
+	{ 0x66a7, &vega20_device_info },	/* Vega20 */
+	{ 0x66af, &vega20_device_info }		/* Vega20 */
 };
 
 static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size,
@@ -366,6 +383,10 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd,
 		return NULL;
 	}
 
+	kfd = kzalloc(sizeof(*kfd), GFP_KERNEL);
+	if (!kfd)
+		return NULL;
+
 	/* Allow BIF to recode atomics to PCIe 3.0 AtomicOps.
 	 * 32 and 64-bit requests are possible and must be
 	 * supported.
@@ -377,12 +398,10 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd,
 		dev_info(kfd_device,
 			 "skipped device %x:%x, PCI rejects atomics\n",
 			 pdev->vendor, pdev->device);
+		kfree(kfd);
 		return NULL;
-	}
-
-	kfd = kzalloc(sizeof(*kfd), GFP_KERNEL);
-	if (!kfd)
-		return NULL;
+	} else if (!ret)
+		kfd->pci_atomic_requested = true;
 
 	kfd->kgd = kgd;
 	kfd->device_info = device_info;
@@ -419,6 +438,10 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
 {
 	unsigned int size;
 
+	kfd->mec_fw_version = kfd->kfd2kgd->get_fw_version(kfd->kgd,
+			KGD_ENGINE_MEC1);
+	kfd->sdma_fw_version = kfd->kfd2kgd->get_fw_version(kfd->kgd,
+			KGD_ENGINE_SDMA1);
 	kfd->shared_resources = *gpu_resources;
 
 	kfd->vm_info.first_vmid_kfd = ffs(gpu_resources->compute_vmid_bitmap)-1;
@@ -477,6 +500,9 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
 		goto kfd_doorbell_error;
 	}
 
+	if (kfd->kfd2kgd->get_hive_id)
+		kfd->hive_id = kfd->kfd2kgd->get_hive_id(kfd->kgd);
+
 	if (kfd_topology_add_device(kfd)) {
 		dev_err(kfd_device, "Error adding device to topology\n");
 		goto kfd_topology_add_device_error;
 	}
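The new num_sdma_queues_per_engine field replaces the old global KFD_SDMA_QUEUES_PER_ENGINE constant (removed later in this diff), so Vega20 can expose 2 engines x 8 queues = 16 SDMA queues while every earlier ASIC keeps 2 x 2 = 4 (Raven: 1 x 2 = 2). A trivial standalone model of the computation, with the struct trimmed to the two fields the helper reads:

#include <stdio.h>

struct kfd_device_info_sketch {	/* reduced model of the real struct */
	unsigned int num_sdma_engines;
	unsigned int num_sdma_queues_per_engine;
};

static unsigned int num_sdma_queues(const struct kfd_device_info_sketch *info)
{
	return info->num_sdma_engines * info->num_sdma_queues_per_engine;
}

int main(void)
{
	struct kfd_device_info_sketch vega10 = { 2, 2 }, vega20 = { 2, 8 };

	printf("vega10: %u SDMA queues\n", num_sdma_queues(&vega10)); /* 4 */
	printf("vega20: %u SDMA queues\n", num_sdma_queues(&vega20)); /* 16 */
	return 0;
}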
"should only run in user thread")) + retval = -EFAULT; + else + retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, + q->pipe, q->queue, + &q->properties, current->mm); + } out_unlock: dqm_unlock(dqm); @@ -653,10 +663,11 @@ out: static int restore_process_queues_nocpsch(struct device_queue_manager *dqm, struct qcm_process_device *qpd) { + struct mm_struct *mm = NULL; struct queue *q; struct mqd_manager *mqd_mgr; struct kfd_process_device *pdd; - uint32_t pd_base; + uint64_t pd_base; int retval = 0; pdd = qpd_to_pdd(qpd); @@ -676,7 +687,7 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm, /* Update PD Base in QPD */ qpd->page_table_base = pd_base; - pr_debug("Updated PD address to 0x%08x\n", pd_base); + pr_debug("Updated PD address to 0x%llx\n", pd_base); if (!list_empty(&qpd->queues_list)) { dqm->dev->kfd2kgd->set_vm_context_page_table_base( @@ -686,6 +697,15 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm, kfd_flush_tlb(pdd); } + /* Take a safe reference to the mm_struct, which may otherwise + * disappear even while the kfd_process is still referenced. + */ + mm = get_task_mm(pdd->process->lead_thread); + if (!mm) { + retval = -EFAULT; + goto out; + } + /* activate all active queues on the qpd */ list_for_each_entry(q, &qpd->queues_list, list) { if (!q->properties.is_evicted) @@ -700,14 +720,15 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm, q->properties.is_evicted = false; q->properties.is_active = true; retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe, - q->queue, &q->properties, - q->process->mm); + q->queue, &q->properties, mm); if (retval) goto out; dqm->queue_count++; } qpd->evicted = 0; out: + if (mm) + mmput(mm); dqm_unlock(dqm); return retval; } @@ -717,7 +738,7 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm, { struct queue *q; struct kfd_process_device *pdd; - uint32_t pd_base; + uint64_t pd_base; int retval = 0; pdd = qpd_to_pdd(qpd); @@ -737,7 +758,7 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm, /* Update PD Base in QPD */ qpd->page_table_base = pd_base; - pr_debug("Updated PD address to 0x%08x\n", pd_base); + pr_debug("Updated PD address to 0x%llx\n", pd_base); /* activate all active queues on the qpd */ list_for_each_entry(q, &qpd->queues_list, list) { @@ -761,7 +782,7 @@ static int register_process(struct device_queue_manager *dqm, { struct device_process_node *n; struct kfd_process_device *pdd; - uint32_t pd_base; + uint64_t pd_base; int retval; n = kzalloc(sizeof(*n), GFP_KERNEL); @@ -779,6 +800,7 @@ static int register_process(struct device_queue_manager *dqm, /* Update PD Base in QPD */ qpd->page_table_base = pd_base; + pr_debug("Updated PD address to 0x%llx\n", pd_base); retval = dqm->asic_ops.update_qpd(dqm, qpd); @@ -1342,9 +1364,6 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, { int retval; struct mqd_manager *mqd_mgr; - bool preempt_all_queues; - - preempt_all_queues = false; retval = 0; @@ -1528,6 +1547,41 @@ static int process_termination_nocpsch(struct device_queue_manager *dqm, return retval; } +static int get_wave_state(struct device_queue_manager *dqm, + struct queue *q, + void __user *ctl_stack, + u32 *ctl_stack_used_size, + u32 *save_area_used_size) +{ + struct mqd_manager *mqd; + int r; + + dqm_lock(dqm); + + if (q->properties.type != KFD_QUEUE_TYPE_COMPUTE || + q->properties.is_active || !q->device->cwsr_enabled) { + r = -EINVAL; + goto dqm_unlock; + } + + mqd = dqm->ops.get_mqd_manager(dqm, 
KFD_MQD_TYPE_COMPUTE); + if (!mqd) { + r = -ENOMEM; + goto dqm_unlock; + } + + if (!mqd->get_wave_state) { + r = -EINVAL; + goto dqm_unlock; + } + + r = mqd->get_wave_state(mqd, q->mqd, ctl_stack, ctl_stack_used_size, + save_area_used_size); + +dqm_unlock: + dqm_unlock(dqm); + return r; +} static int process_termination_cpsch(struct device_queue_manager *dqm, struct qcm_process_device *qpd) @@ -1649,6 +1703,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) dqm->ops.process_termination = process_termination_cpsch; dqm->ops.evict_process_queues = evict_process_queues_cpsch; dqm->ops.restore_process_queues = restore_process_queues_cpsch; + dqm->ops.get_wave_state = get_wave_state; break; case KFD_SCHED_POLICY_NO_HWS: /* initialize dqm for no cp scheduling */ @@ -1668,6 +1723,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) dqm->ops.evict_process_queues = evict_process_queues_nocpsch; dqm->ops.restore_process_queues = restore_process_queues_nocpsch; + dqm->ops.get_wave_state = get_wave_state; break; default: pr_err("Invalid scheduling policy %d\n", dqm->sched_policy); @@ -1695,6 +1751,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) break; case CHIP_VEGA10: + case CHIP_VEGA20: case CHIP_RAVEN: device_queue_manager_init_v9(&dqm->asic_ops); break; @@ -1806,7 +1863,9 @@ int dqm_debugfs_hqds(struct seq_file *m, void *data) } for (pipe = 0; pipe < get_num_sdma_engines(dqm); pipe++) { - for (queue = 0; queue < KFD_SDMA_QUEUES_PER_ENGINE; queue++) { + for (queue = 0; + queue < dqm->dev->device_info->num_sdma_queues_per_engine; + queue++) { r = dqm->dev->kfd2kgd->hqd_sdma_dump( dqm->dev->kgd, pipe, queue, &dump, &n_regs); if (r) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h index 00da3169a004..70e38a2e23b9 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h @@ -33,7 +33,6 @@ #define KFD_UNMAP_LATENCY_MS (4000) #define QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS (2 * KFD_UNMAP_LATENCY_MS + 1000) -#define KFD_SDMA_QUEUES_PER_ENGINE (2) struct device_process_node { struct qcm_process_device *qpd; @@ -82,6 +81,8 @@ struct device_process_node { * * @restore_process_queues: Restore all evicted queues queues of a process * + * @get_wave_state: Retrieves context save state and optionally copies the + * control stack, if kept in the MQD, to the given userspace address. 
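Note that the dqm-level get_wave_state() only services CWSR-enabled compute queues that are not currently active; everything else is rejected with -EINVAL before the MQD is touched. A compact user-space model of that gate, with the types reduced to just the fields the check reads:

#include <assert.h>
#include <stdbool.h>

enum queue_type_model { QUEUE_TYPE_COMPUTE, QUEUE_TYPE_SDMA };

struct queue_model {
	enum queue_type_model type;
	bool is_active;
	bool cwsr_enabled;	/* stands in for q->device->cwsr_enabled */
};

static bool wave_state_allowed(const struct queue_model *q)
{
	return q->type == QUEUE_TYPE_COMPUTE && !q->is_active &&
	       q->cwsr_enabled;
}

int main(void)
{
	struct queue_model idle = { QUEUE_TYPE_COMPUTE, false, true };
	struct queue_model running = { QUEUE_TYPE_COMPUTE, true, true };

	assert(wave_state_allowed(&idle));
	assert(!wave_state_allowed(&running));	/* -EINVAL in the driver */
	return 0;
}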
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index 00da3169a004..70e38a2e23b9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -33,7 +33,6 @@
 
 #define KFD_UNMAP_LATENCY_MS			(4000)
 #define QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS (2 * KFD_UNMAP_LATENCY_MS + 1000)
-#define KFD_SDMA_QUEUES_PER_ENGINE		(2)
 
 struct device_process_node {
 	struct qcm_process_device *qpd;
@@ -82,6 +81,8 @@ struct device_process_node {
  *
  * @restore_process_queues: Restore all evicted queues queues of a process
  *
+ * @get_wave_state: Retrieves context save state and optionally copies the
+ * control stack, if kept in the MQD, to the given userspace address.
  */
 
 struct device_queue_manager_ops {
@@ -137,6 +138,12 @@ struct device_queue_manager_ops {
 				       struct qcm_process_device *qpd);
 	int	(*restore_process_queues)(struct device_queue_manager *dqm,
 					  struct qcm_process_device *qpd);
+
+	int	(*get_wave_state)(struct device_queue_manager *dqm,
+				  struct queue *q,
+				  void __user *ctl_stack,
+				  u32 *ctl_stack_used_size,
+				  u32 *save_area_used_size);
 };
 
 struct device_queue_manager_asic_ops {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
index 97d5423c5673..3d66cec414af 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
@@ -400,6 +400,7 @@ int kfd_init_apertures(struct kfd_process *process)
 				kfd_init_apertures_vi(pdd, id);
 				break;
 			case CHIP_VEGA10:
+			case CHIP_VEGA20:
 			case CHIP_RAVEN:
 				kfd_init_apertures_v9(pdd, id);
 				break;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
index 9f84b4d9fb88..6c31f7370193 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
@@ -322,6 +322,7 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
 		break;
 
 	case CHIP_VEGA10:
+	case CHIP_VEGA20:
 	case CHIP_RAVEN:
 		kernel_queue_init_v9(&kq->ops_asic_specific);
 		break;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c
index 684a3bf07efd..33830b1a5a54 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c
@@ -71,8 +71,7 @@ static int pm_map_process_v9(struct packet_manager *pm,
 		uint32_t *buffer, struct qcm_process_device *qpd)
 {
 	struct pm4_mes_map_process *packet;
-	uint64_t vm_page_table_base_addr =
-		(uint64_t)(qpd->page_table_base) << 12;
+	uint64_t vm_page_table_base_addr = qpd->page_table_base;
 
 	packet = (struct pm4_mes_map_process *)buffer;
 	memset(buffer, 0, sizeof(struct pm4_mes_map_process));
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/drivers/gpu/drm/amd/amdkfd/kfd_module.c
index 6e1f5c7c2d4b..8018163414ff 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_module.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c
@@ -20,21 +20,10 @@
  * OTHER DEALINGS IN THE SOFTWARE.
  */
 
-#include <linux/module.h>
 #include <linux/sched.h>
-#include <linux/moduleparam.h>
 #include <linux/device.h>
-#include <linux/printk.h>
 #include "kfd_priv.h"
 
-#define KFD_DRIVER_AUTHOR	"AMD Inc. and others"
-
-#define KFD_DRIVER_DESC		"Standalone HSA driver for AMD's GPUs"
-#define KFD_DRIVER_DATE		"20150421"
-#define KFD_DRIVER_MAJOR	0
-#define KFD_DRIVER_MINOR	7
-#define KFD_DRIVER_PATCHLEVEL	2
-
 static const struct kgd2kfd_calls kgd2kfd = {
 	.exit		= kgd2kfd_exit,
 	.probe		= kgd2kfd_probe,
@@ -51,77 +40,7 @@ static const struct kgd2kfd_calls kgd2kfd = {
 	.post_reset = kgd2kfd_post_reset,
 };
 
-int sched_policy = KFD_SCHED_POLICY_HWS;
-module_param(sched_policy, int, 0444);
-MODULE_PARM_DESC(sched_policy,
-	"Scheduling policy (0 = HWS (Default), 1 = HWS without over-subscription, 2 = Non-HWS (Used for debugging only)");
-
-int hws_max_conc_proc = 8;
-module_param(hws_max_conc_proc, int, 0444);
-MODULE_PARM_DESC(hws_max_conc_proc,
-	"Max # processes HWS can execute concurrently when sched_policy=0 (0 = no concurrency, #VMIDs for KFD = Maximum(default))");
-
-int cwsr_enable = 1;
-module_param(cwsr_enable, int, 0444);
-MODULE_PARM_DESC(cwsr_enable, "CWSR enable (0 = off, 1 = on (default))");
-
-int max_num_of_queues_per_device = KFD_MAX_NUM_OF_QUEUES_PER_DEVICE_DEFAULT;
-module_param(max_num_of_queues_per_device, int, 0444);
-MODULE_PARM_DESC(max_num_of_queues_per_device,
-	"Maximum number of supported queues per device (1 = Minimum, 4096 = default)");
-
-int send_sigterm;
-module_param(send_sigterm, int, 0444);
-MODULE_PARM_DESC(send_sigterm,
-	"Send sigterm to HSA process on unhandled exception (0 = disable, 1 = enable)");
-
-int debug_largebar;
-module_param(debug_largebar, int, 0444);
-MODULE_PARM_DESC(debug_largebar,
-	"Debug large-bar flag used to simulate large-bar capability on non-large bar machine (0 = disable, 1 = enable)");
-
-int ignore_crat;
-module_param(ignore_crat, int, 0444);
-MODULE_PARM_DESC(ignore_crat,
-	"Ignore CRAT table during KFD initialization (0 = use CRAT (default), 1 = ignore CRAT)");
-
-int noretry;
-module_param(noretry, int, 0644);
-MODULE_PARM_DESC(noretry,
-	"Set sh_mem_config.retry_disable on GFXv9+ dGPUs (0 = retry enabled (default), 1 = retry disabled)");
-
-int halt_if_hws_hang;
-module_param(halt_if_hws_hang, int, 0644);
-MODULE_PARM_DESC(halt_if_hws_hang, "Halt if HWS hang is detected (0 = off (default), 1 = on)");
-
-
-static int amdkfd_init_completed;
-
-
-int kgd2kfd_init(unsigned int interface_version,
-		const struct kgd2kfd_calls **g2f)
-{
-	if (!amdkfd_init_completed)
-		return -EPROBE_DEFER;
-
-	/*
-	 * Only one interface version is supported,
-	 * no kfd/kgd version skew allowed.
-	 */
-	if (interface_version != KFD_INTERFACE_VERSION)
-		return -EINVAL;
-
-	*g2f = &kgd2kfd;
-
-	return 0;
-}
-EXPORT_SYMBOL(kgd2kfd_init);
-
-void kgd2kfd_exit(void)
-{
-}
-
-static int __init kfd_module_init(void)
+static int kfd_init(void)
 {
 	int err;
 
@@ -129,7 +48,7 @@ static int __init kfd_module_init(void)
 	if ((sched_policy < KFD_SCHED_POLICY_HWS) ||
 	    (sched_policy > KFD_SCHED_POLICY_NO_HWS)) {
 		pr_err("sched_policy has invalid value\n");
-		return -1;
+		return -EINVAL;
 	}
 
 	/* Verify module parameters */
@@ -137,7 +56,7 @@ static int __init kfd_module_init(void)
 	    (max_num_of_queues_per_device >
 			KFD_MAX_NUM_OF_QUEUES_PER_DEVICE)) {
 		pr_err("max_num_of_queues_per_device must be between 1 to KFD_MAX_NUM_OF_QUEUES_PER_DEVICE\n");
-		return -1;
+		return -EINVAL;
 	}
 
 	err = kfd_chardev_init();
@@ -154,10 +73,6 @@ static int __init kfd_module_init(void)
 
 	kfd_debugfs_init();
 
-	amdkfd_init_completed = 1;
-
-	dev_info(kfd_device, "Initialized module\n");
-
 	return 0;
 
 err_create_wq:
@@ -168,23 +83,30 @@ err_ioctl:
 	return err;
 }
 
-static void __exit kfd_module_exit(void)
+static void kfd_exit(void)
 {
-	amdkfd_init_completed = 0;
-
 	kfd_debugfs_fini();
 	kfd_process_destroy_wq();
 	kfd_topology_shutdown();
 	kfd_chardev_exit();
-	pr_info("amdkfd: Removed module\n");
 }
 
-module_init(kfd_module_init);
-module_exit(kfd_module_exit);
+int kgd2kfd_init(unsigned int interface_version,
+		const struct kgd2kfd_calls **g2f)
+{
+	int err;
+
+	err = kfd_init();
+	if (err)
+		return err;
+
+	*g2f = &kgd2kfd;
+
+	return 0;
+}
+EXPORT_SYMBOL(kgd2kfd_init);
 
-MODULE_AUTHOR(KFD_DRIVER_AUTHOR);
-MODULE_DESCRIPTION(KFD_DRIVER_DESC);
-MODULE_LICENSE("GPL and additional rights");
-MODULE_VERSION(__stringify(KFD_DRIVER_MAJOR) "."
-	       __stringify(KFD_DRIVER_MINOR) "."
-	       __stringify(KFD_DRIVER_PATCHLEVEL));
+void kgd2kfd_exit(void)
+{
+	kfd_exit();
+}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
index 3bc25ab84f34..e33019a7a883 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
@@ -39,6 +39,7 @@ struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type,
 	case CHIP_POLARIS11:
 		return mqd_manager_init_vi_tonga(type, dev);
 	case CHIP_VEGA10:
+	case CHIP_VEGA20:
 	case CHIP_RAVEN:
 		return mqd_manager_init_v9(type, dev);
 	default:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
index 4e84052d4e21..f8261313ae7b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
@@ -43,6 +43,9 @@
  *
  * @is_occupied: Checks if the relevant HQD slot is occupied.
  *
+ * @get_wave_state: Retrieves context save state and optionally copies the
+ * control stack, if kept in the MQD, to the given userspace address.
+ *
  * @mqd_mutex: Mqd manager mutex.
  *
  * @dev: The kfd device structure coupled with this module.
@@ -85,6 +88,11 @@ struct mqd_manager {
 				uint64_t queue_address,	uint32_t pipe_id,
 				uint32_t queue_id);
 
+	int	(*get_wave_state)(struct mqd_manager *mm, void *mqd,
+				  void __user *ctl_stack,
+				  u32 *ctl_stack_used_size,
+				  u32 *save_area_used_size);
+
 #if defined(CONFIG_DEBUG_FS)
 	int	(*debugfs_show_mqd)(struct seq_file *m, void *data);
 #endif
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
index 0cedb37cf513..f381c1cb27bd 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
@@ -266,6 +266,28 @@ static bool is_occupied(struct mqd_manager *mm, void *mqd,
 			pipe_id, queue_id);
 }
 
+static int get_wave_state(struct mqd_manager *mm, void *mqd,
+			  void __user *ctl_stack,
+			  u32 *ctl_stack_used_size,
+			  u32 *save_area_used_size)
+{
+	struct v9_mqd *m;
+
+	/* Control stack is located one page after MQD. */
+	void *mqd_ctl_stack = (void *)((uintptr_t)mqd + PAGE_SIZE);
+
+	m = get_mqd(mqd);
+
+	*ctl_stack_used_size = m->cp_hqd_cntl_stack_size -
+		m->cp_hqd_cntl_stack_offset;
+	*save_area_used_size = m->cp_hqd_wg_state_offset;
+
+	if (copy_to_user(ctl_stack, mqd_ctl_stack, m->cp_hqd_cntl_stack_size))
+		return -EFAULT;
+
+	return 0;
+}
+
 static int init_mqd_hiq(struct mqd_manager *mm, void **mqd,
 			struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
 			struct queue_properties *q)
@@ -435,6 +457,7 @@ struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type,
 		mqd->update_mqd = update_mqd;
 		mqd->destroy_mqd = destroy_mqd;
 		mqd->is_occupied = is_occupied;
+		mqd->get_wave_state = get_wave_state;
 #if defined(CONFIG_DEBUG_FS)
 		mqd->debugfs_show_mqd = debugfs_show_mqd;
 #endif
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
index b81fda3754da..6469b3456f00 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
@@ -269,6 +269,28 @@ static bool is_occupied(struct mqd_manager *mm, void *mqd,
 			pipe_id, queue_id);
 }
 
+static int get_wave_state(struct mqd_manager *mm, void *mqd,
+			  void __user *ctl_stack,
+			  u32 *ctl_stack_used_size,
+			  u32 *save_area_used_size)
+{
+	struct vi_mqd *m;
+
+	m = get_mqd(mqd);
+
+	*ctl_stack_used_size = m->cp_hqd_cntl_stack_size -
+		m->cp_hqd_cntl_stack_offset;
+	*save_area_used_size = m->cp_hqd_wg_state_offset -
+		m->cp_hqd_cntl_stack_size;
+
+	/* Control stack is not copied to user mode for GFXv8 because
+	 * it's part of the context save area that is already
+	 * accessible to user mode
+	 */
+
+	return 0;
+}
+
 static int init_mqd_hiq(struct mqd_manager *mm, void **mqd,
 			struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
 			struct queue_properties *q)
@@ -436,6 +458,7 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type,
 		mqd->update_mqd = update_mqd;
 		mqd->destroy_mqd = destroy_mqd;
 		mqd->is_occupied = is_occupied;
+		mqd->get_wave_state = get_wave_state;
 #if defined(CONFIG_DEBUG_FS)
 		mqd->debugfs_show_mqd = debugfs_show_mqd;
 #endif
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
index 1092631765cb..c6080ed3b6a7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
@@ -229,6 +229,7 @@ int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm)
 		pm->pmf = &kfd_vi_pm_funcs;
 		break;
 	case CHIP_VEGA10:
+	case CHIP_VEGA20:
 	case CHIP_RAVEN:
 		pm->pmf = &kfd_v9_pm_funcs;
 		break;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 92b285ca73aa..53ff86d45d91 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -103,7 +103,6 @@
  */
 extern int max_num_of_queues_per_device;
 
-#define KFD_MAX_NUM_OF_QUEUES_PER_DEVICE_DEFAULT 4096
 #define KFD_MAX_NUM_OF_QUEUES_PER_DEVICE		\
 	(KFD_MAX_NUM_OF_PROCESSES *			\
 			KFD_MAX_NUM_OF_QUEUES_PER_PROCESS)
@@ -149,33 +148,6 @@ extern int noretry;
  */
 extern int halt_if_hws_hang;
 
-/**
- * enum kfd_sched_policy
- *
- * @KFD_SCHED_POLICY_HWS: H/W scheduling policy known as command processor (cp)
- * scheduling. In this scheduling mode we're using the firmware code to
- * schedule the user mode queues and kernel queues such as HIQ and DIQ.
- * the HIQ queue is used as a special queue that dispatches the configuration
- * to the cp and the user mode queues list that are currently running.
- * the DIQ queue is a debugging queue that dispatches debugging commands to the
- * firmware.
- * in this scheduling mode user mode queues over subscription feature is
- * enabled.
- *
- * @KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION: The same as above but the over
- * subscription feature disabled.
- *
- * @KFD_SCHED_POLICY_NO_HWS: no H/W scheduling policy is a mode which directly
- * set the command processor registers and sets the queues "manually". This
- * mode is used *ONLY* for debugging proposes.
- *
- */
-enum kfd_sched_policy {
-	KFD_SCHED_POLICY_HWS = 0,
-	KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION,
-	KFD_SCHED_POLICY_NO_HWS
-};
-
 enum cache_policy {
 	cache_policy_coherent,
 	cache_policy_noncoherent
@@ -204,6 +176,7 @@ struct kfd_device_info {
 	bool needs_iommu_device;
 	bool needs_pci_atomics;
 	unsigned int num_sdma_engines;
+	unsigned int num_sdma_queues_per_engine;
 };
 
 struct kfd_mem_obj {
@@ -275,6 +248,10 @@ struct kfd_dev {
 	/* Debug manager */
 	struct kfd_dbgmgr           *dbgmgr;
 
+	/* Firmware versions */
+	uint16_t mec_fw_version;
+	uint16_t sdma_fw_version;
+
 	/* Maximum process number mapped to HW scheduler */
 	unsigned int max_proc_per_quantum;
 
@@ -282,6 +259,11 @@ struct kfd_dev {
 	bool cwsr_enabled;
 	const void *cwsr_isa;
 	unsigned int cwsr_isa_size;
+
+	/* xGMI */
+	uint64_t hive_id;
+
+	bool pci_atomic_requested;
 };
 
 /* KGD2KFD callbacks */
@@ -525,11 +507,11 @@ struct qcm_process_device {
 	 * All the memory management data should be here too
 	 */
 	uint64_t gds_context_area;
+	uint64_t page_table_base;
 	uint32_t sh_mem_config;
 	uint32_t sh_mem_bases;
 	uint32_t sh_mem_ape1_base;
 	uint32_t sh_mem_ape1_limit;
-	uint32_t page_table_base;
 	uint32_t gds_size;
 	uint32_t num_gws;
 	uint32_t num_oac;
@@ -721,6 +703,7 @@ struct amdkfd_ioctl_desc {
 	unsigned int cmd_drv;
 	const char *name;
 };
+bool kfd_dev_is_large_bar(struct kfd_dev *dev);
 
 int kfd_process_create_wq(void);
 void kfd_process_destroy_wq(void);
@@ -880,6 +863,11 @@ int pqm_set_cu_mask(struct process_queue_manager *pqm, unsigned int qid,
 			struct queue_properties *p);
 struct kernel_queue *pqm_get_kernel_queue(struct process_queue_manager *pqm,
 						unsigned int qid);
+int pqm_get_wave_state(struct process_queue_manager *pqm,
+		       unsigned int qid,
+		       void __user *ctl_stack,
+		       u32 *ctl_stack_used_size,
+		       u32 *save_area_used_size);
 
 int amdkfd_fence_wait_timeout(unsigned int *fence_addr,
 			      unsigned int fence_value,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 4694386cc623..0039e451d9af 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -322,8 +322,10 @@ static void kfd_process_destroy_pdds(struct kfd_process *p)
 		pr_debug("Releasing pdd (topology id %d) for process (pasid %d)\n",
 				pdd->dev->id, p->pasid);
 
-		if (pdd->drm_file)
+		if (pdd->drm_file) {
+			pdd->dev->kfd2kgd->release_process_vm(pdd->dev->kgd, pdd->vm);
 			fput(pdd->drm_file);
+		}
 		else if (pdd->vm)
 			pdd->dev->kfd2kgd->destroy_process_vm(
 				pdd->dev->kgd, pdd->vm);
@@ -687,11 +689,11 @@ int kfd_process_device_init_vm(struct kfd_process_device *pdd,
 
 	if (drm_file)
 		ret = dev->kfd2kgd->acquire_process_vm(
-			dev->kgd, drm_file,
+			dev->kgd, drm_file, p->pasid,
 			&pdd->vm, &p->kgd_process_info, &p->ef);
 	else
 		ret = dev->kfd2kgd->create_process_vm(
-			dev->kgd, &pdd->vm, &p->kgd_process_info, &p->ef);
+			dev->kgd, p->pasid, &pdd->vm, &p->kgd_process_info, &p->ef);
 	if (ret) {
 		pr_err("Failed to create process VM object\n");
 		return ret;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index c8cad9c078ae..fcaaf93681ac 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -408,6 +408,28 @@ struct kernel_queue *pqm_get_kernel_queue(
 	return NULL;
 }
 
+int pqm_get_wave_state(struct process_queue_manager *pqm,
+		       unsigned int qid,
+		       void __user *ctl_stack,
+		       u32 *ctl_stack_used_size,
+		       u32 *save_area_used_size)
+{
+	struct process_queue_node *pqn;
+
+	pqn = get_queue_by_qid(pqm, qid);
+	if (!pqn) {
+		pr_debug("amdkfd: No queue %d exists for operation\n",
+			 qid);
+		return -EFAULT;
+	}
+
+	return pqn->q->device->dqm->ops.get_wave_state(pqn->q->device->dqm,
+						       pqn->q,
+						       ctl_stack,
+						       ctl_stack_used_size,
+						       save_area_used_size);
+}
+
 #if defined(CONFIG_DEBUG_FS)
 
 int pqm_debugfs_mqds(struct seq_file *m, void *data)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index 80f5db4ef75f..e3843c5929ed 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -454,6 +454,8 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr,
 			dev->node_props.location_id);
 	sysfs_show_32bit_prop(buffer, "drm_render_minor",
 			dev->node_props.drm_render_minor);
+	sysfs_show_64bit_prop(buffer, "hive_id",
+			dev->node_props.hive_id);
 
 	if (dev->gpu) {
 		log_max_watch_addr =
@@ -480,11 +482,11 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr,
 				(unsigned long long int) 0);
 
 		sysfs_show_32bit_prop(buffer, "fw_version",
-			dev->gpu->kfd2kgd->get_fw_version(
-						dev->gpu->kgd,
-						KGD_ENGINE_MEC1));
+				dev->gpu->mec_fw_version);
 		sysfs_show_32bit_prop(buffer, "capability",
 				dev->node_props.capability);
+		sysfs_show_32bit_prop(buffer, "sdma_fw_version",
+				dev->gpu->sdma_fw_version);
 	}
 
 	return sysfs_show_32bit_prop(buffer, "max_engine_clk_ccompute",
@@ -1125,17 +1127,40 @@ static void kfd_fill_mem_clk_max_info(struct kfd_topology_device *dev)
 
 static void kfd_fill_iolink_non_crat_info(struct kfd_topology_device *dev)
 {
-	struct kfd_iolink_properties *link;
+	struct kfd_iolink_properties *link, *cpu_link;
+	struct kfd_topology_device *cpu_dev;
+	uint32_t cap;
+	uint32_t cpu_flag = CRAT_IOLINK_FLAGS_ENABLED;
+	uint32_t flag = CRAT_IOLINK_FLAGS_ENABLED;
 
 	if (!dev || !dev->gpu)
 		return;
 
-	/* GPU only creates direck links so apply flags setting to all */
-	if (dev->gpu->device_info->asic_family == CHIP_HAWAII)
-		list_for_each_entry(link, &dev->io_link_props, list)
-			link->flags = CRAT_IOLINK_FLAGS_ENABLED |
-				CRAT_IOLINK_FLAGS_NO_ATOMICS_32_BIT |
-				CRAT_IOLINK_FLAGS_NO_ATOMICS_64_BIT;
+	pcie_capability_read_dword(dev->gpu->pdev,
+			PCI_EXP_DEVCAP2, &cap);
+
+	if (!(cap & (PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
+		     PCI_EXP_DEVCAP2_ATOMIC_COMP64)))
+		cpu_flag |= CRAT_IOLINK_FLAGS_NO_ATOMICS_32_BIT |
+			CRAT_IOLINK_FLAGS_NO_ATOMICS_64_BIT;
+
+	if (!dev->gpu->pci_atomic_requested ||
+	    dev->gpu->device_info->asic_family == CHIP_HAWAII)
+		flag |= CRAT_IOLINK_FLAGS_NO_ATOMICS_32_BIT |
+			CRAT_IOLINK_FLAGS_NO_ATOMICS_64_BIT;
+
+	/* GPU only creates direct links so apply flags setting to all */
+	list_for_each_entry(link, &dev->io_link_props, list) {
+		link->flags = flag;
+		cpu_dev = kfd_topology_device_by_proximity_domain(
+				link->node_to);
+		if (cpu_dev) {
+			list_for_each_entry(cpu_link,
+					    &cpu_dev->io_link_props, list)
+				if (cpu_link->node_to == link->node_from)
+					cpu_link->flags = cpu_flag;
+		}
+	}
 }
 
 int kfd_topology_add_device(struct kfd_dev *gpu)
@@ -1230,6 +1255,8 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
 	dev->node_props.drm_render_minor =
 		gpu->shared_resources.drm_render_minor;
 
+	dev->node_props.hive_id = gpu->hive_id;
+
 	kfd_fill_mem_clk_max_info(dev);
 	kfd_fill_iolink_non_crat_info(dev);
 
@@ -1251,6 +1278,7 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
 			HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK);
 		break;
 	case CHIP_VEGA10:
+	case CHIP_VEGA20:
 	case CHIP_RAVEN:
 		dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_2_0 <<
 			HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) &
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
index 7d9c3f948dff..92a19be07344 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
@@ -49,6 +49,7 @@
 #define HSA_CAP_AQL_QUEUE_DOUBLE_MAP	0x00004000
 
 struct kfd_node_properties {
+	uint64_t hive_id;
 	uint32_t cpu_cores_count;
 	uint32_t simd_count;
 	uint32_t mem_banks_count;
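The two new per-node properties (hive_id from the xGMI work, and sdma_fw_version alongside the now-cached fw_version) surface through the existing KFD topology sysfs files, which hold one name/value pair per line. A small reader sketch; the node path is a placeholder, since node numbering varies by system:

#include <stdio.h>

int main(void)
{
	/* placeholder node; enumerate /sys/class/kfd/kfd/topology/nodes/ in practice */
	FILE *f = fopen("/sys/class/kfd/kfd/topology/nodes/1/properties", "r");
	char name[64];
	unsigned long long val;

	if (!f)
		return 1;
	while (fscanf(f, "%63s %llu", name, &val) == 2)
		printf("%s = %llu\n", name, val); /* hive_id, sdma_fw_version, ... */
	fclose(f);
	return 0;
}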