-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu.h             1
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c     43
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c         4
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c       6
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h        1
-rw-r--r--  drivers/gpu/drm/amd/scheduler/gpu_scheduler.c  11
-rw-r--r--  drivers/gpu/drm/amd/scheduler/gpu_scheduler.h   7
7 files changed, 63 insertions(+), 10 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index e2cafbd690c0..a2dd218e35b9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -109,6 +109,7 @@ extern int amdgpu_prim_buf_per_se;
extern int amdgpu_pos_buf_per_se;
extern int amdgpu_cntl_sb_buf_per_se;
extern int amdgpu_param_buf_per_se;
+extern int amdgpu_job_hang_limit;
#define AMDGPU_DEFAULT_GTT_SIZE_MB 3072ULL /* 3GB by default */
#define AMDGPU_WAIT_IDLE_TIMEOUT_IN_MS 3000
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 41c18700e275..8b0f4864a885 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2617,7 +2617,7 @@ err:
*/
int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, struct amdgpu_job *job)
{
- int i, r = 0;
+ int i, j, r = 0;
int resched;
struct amdgpu_bo *bo, *tmp;
struct amdgpu_ring *ring;
@@ -2630,19 +2630,36 @@ int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, struct amdgpu_job *job)
/* block TTM */
resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
- /* block scheduler */
- for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
- ring = adev->rings[i];
+ /* start from the ring that triggered the GPU hang */
+ j = job ? job->ring->idx : 0;
+ /* block scheduler */
+ for (i = j; i < j + AMDGPU_MAX_RINGS; ++i) {
+ ring = adev->rings[i % AMDGPU_MAX_RINGS];
if (!ring || !ring->sched.thread)
continue;
kthread_park(ring->sched.thread);
+
+ if (job && j != i)
+ continue;
+
+ /* give the fence one last chance to signal, since we already
+ * paid some time in kthread_park */
+ if (job && dma_fence_is_signaled(&job->base.s_fence->finished)) {
+ kthread_unpark(ring->sched.thread);
+ goto give_up_reset;
+ }
+
+ if (amd_sched_invalidate_job(&job->base, amdgpu_job_hang_limit))
+ amd_sched_job_kickout(&job->base);
+
+ /* only do job_reset on the hung ring when @job is not NULL */
amd_sched_hw_job_reset(&ring->sched);
- }
- /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
- amdgpu_fence_driver_force_completion(adev);
+ /* once this ring's hw jobs are reset, its hw fence is meaningless, so force_completion */
+ amdgpu_fence_driver_force_completion_ring(ring);
+ }
/* request to take full control of GPU before re-initialization */
if (job)
@@ -2695,20 +2712,28 @@ int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, struct amdgpu_job *job)
}
dma_fence_put(fence);
- for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
- struct amdgpu_ring *ring = adev->rings[i];
+ for (i = j; i < j + AMDGPU_MAX_RINGS; ++i) {
+ ring = adev->rings[i % AMDGPU_MAX_RINGS];
if (!ring || !ring->sched.thread)
continue;
+ if (job && j != i) {
+ kthread_unpark(ring->sched.thread);
+ continue;
+ }
+
amd_sched_job_recovery(&ring->sched);
kthread_unpark(ring->sched.thread);
}
drm_helper_resume_force_mode(adev->ddev);
+give_up_reset:
ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);
if (r) {
/* bad news, how to tell it to userspace ? */
dev_info(adev->dev, "GPU reset failed\n");
+ } else {
+ dev_info(adev->dev, "GPU reset succeeded!\n");
}
adev->gfx.in_reset = false;
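
The reset path above now walks the rings starting at the one that triggered the hang and wraps around with a modulo, so the guilty ring is handled first while every ring is still visited exactly once. A minimal userspace sketch of that iteration pattern, with MAX_RINGS and hang_idx as illustrative stand-ins for the kernel symbols:

    #include <stdio.h>

    #define MAX_RINGS 8                 /* stand-in for AMDGPU_MAX_RINGS */

    int main(void)
    {
        int hang_idx = 5;               /* ring that reported the hang */
        int i;

        /* start at the hung ring, wrap, and visit each ring once */
        for (i = hang_idx; i < hang_idx + MAX_RINGS; ++i)
            printf("visiting ring %d\n", i % MAX_RINGS);
        return 0;
    }
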
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 2b746e078b9a..7b07ac2c52b6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -111,6 +111,7 @@ int amdgpu_prim_buf_per_se = 0;
int amdgpu_pos_buf_per_se = 0;
int amdgpu_cntl_sb_buf_per_se = 0;
int amdgpu_param_buf_per_se = 0;
+int amdgpu_job_hang_limit = 0;
MODULE_PARM_DESC(vramlimit, "Restrict VRAM for testing, in megabytes");
module_param_named(vramlimit, amdgpu_vram_limit, int, 0600);
@@ -233,6 +234,9 @@ module_param_named(cntl_sb_buf_per_se, amdgpu_cntl_sb_buf_per_se, int, 0444);
MODULE_PARM_DESC(param_buf_per_se, "the size of Off-Chip Parameter Cache per Shader Engine (default depending on gfx)");
module_param_named(param_buf_per_se, amdgpu_param_buf_per_se, int, 0444);
+MODULE_PARM_DESC(job_hang_limit, "how many times to allow a job to hang before it is dropped (default 0)");
+module_param_named(job_hang_limit, amdgpu_job_hang_limit, int, 0444);
+
static const struct pci_device_id pciidlist[] = {
#ifdef CONFIG_DRM_AMDGPU_SI
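
Because the parameter is registered with mode 0444 it is read-only through sysfs, so the limit has to be chosen at module load time (e.g. modprobe amdgpu job_hang_limit=2). With the default of 0, the first hang attributed to a job already exceeds the threshold and the job is kicked out.
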
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index ef6c643115b8..333bad749067 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -541,6 +541,12 @@ void amdgpu_fence_driver_force_completion(struct amdgpu_device *adev)
}
}
+void amdgpu_fence_driver_force_completion_ring(struct amdgpu_ring *ring)
+{
+ if (ring)
+ amdgpu_fence_write(ring, ring->fence_drv.sync_seq);
+}
+
/*
* Common fence implementation
*/
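
amdgpu_fence_driver_force_completion_ring() writes the ring's highest emitted sequence number (sync_seq) into the fence location, so every fence still outstanding on that ring reads back as signaled. A rough userspace model of the idea, assuming the usual wrap-safe sequence comparison; ring_model and last_signaled are illustrative names, not kernel symbols:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    struct ring_model {
        uint32_t last_signaled; /* what amdgpu_fence_write would store */
        uint32_t sync_seq;      /* highest sequence number emitted so far */
    };

    static bool fence_signaled(const struct ring_model *r, uint32_t seq)
    {
        /* wrap-safe "last_signaled >= seq" comparison */
        return (int32_t)(r->last_signaled - seq) >= 0;
    }

    int main(void)
    {
        struct ring_model r = { .last_signaled = 3, .sync_seq = 7 };

        printf("fence 5 before: %d\n", fence_signaled(&r, 5)); /* 0 */
        r.last_signaled = r.sync_seq;   /* the forced-completion write */
        printf("fence 5 after:  %d\n", fence_signaled(&r, 5)); /* 1 */
        return 0;
    }
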
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 2b7b3c56d446..fc7329b468f0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -76,6 +76,7 @@ struct amdgpu_fence_driver {
int amdgpu_fence_driver_init(struct amdgpu_device *adev);
void amdgpu_fence_driver_fini(struct amdgpu_device *adev);
void amdgpu_fence_driver_force_completion(struct amdgpu_device *adev);
+void amdgpu_fence_driver_force_completion_ring(struct amdgpu_ring *ring);
int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
unsigned num_hw_submission);
diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
index fea96a765cf1..38cea6fb25a8 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
@@ -409,9 +409,18 @@ void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched)
&s_job->s_fence->cb)) {
dma_fence_put(s_job->s_fence->parent);
s_job->s_fence->parent = NULL;
+ atomic_dec(&sched->hw_rq_count);
}
}
- atomic_set(&sched->hw_rq_count, 0);
+ spin_unlock(&sched->job_list_lock);
+}
+
+void amd_sched_job_kickout(struct amd_sched_job *s_job)
+{
+ struct amd_gpu_scheduler *sched = s_job->sched;
+
+ spin_lock(&sched->job_list_lock);
+ list_del_init(&s_job->node);
spin_unlock(&sched->job_list_lock);
}
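
Two details in this hunk are worth spelling out: hw_rq_count is now decremented once per job whose hardware-fence callback was actually removed, rather than being zeroed wholesale, so jobs whose fence had already signaled keep their accounting; and amd_sched_job_kickout() unlinks the guilty job from the mirror list under job_list_lock, which is what prevents amd_sched_job_recovery() from resubmitting it after the reset.
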
diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
index 924d4a5899e1..f9d8f28efd16 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
@@ -81,6 +81,7 @@ struct amd_sched_job {
struct list_head node;
struct delayed_work work_tdr;
uint64_t id;
+ atomic_t karma;
};
extern const struct dma_fence_ops amd_sched_fence_ops_scheduled;
@@ -96,6 +97,11 @@ static inline struct amd_sched_fence *to_amd_sched_fence(struct dma_fence *f)
return NULL;
}
+static inline bool amd_sched_invalidate_job(struct amd_sched_job *s_job, int threshold)
+{
+ return (s_job && atomic_inc_return(&s_job->karma) > threshold);
+}
+
/**
* Define the backend operations called by the scheduler,
* these functions should be implemented in driver side
@@ -160,4 +166,5 @@ void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched);
void amd_sched_job_recovery(struct amd_gpu_scheduler *sched);
bool amd_sched_dependency_optimized(struct dma_fence* fence,
struct amd_sched_entity *entity);
+void amd_sched_job_kickout(struct amd_sched_job *s_job);
#endif
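
The karma counter gives each job a hang budget: every time a job is blamed for a hang, amd_sched_invalidate_job() bumps karma and compares it against the job_hang_limit module parameter, and only a job over the threshold gets kicked out. A small userspace model of that policy; invalidate_job and job_hang_limit below are illustrative mirrors of the kernel names:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static int job_hang_limit = 0;      /* mirrors the module parameter default */

    struct job {
        atomic_int karma;               /* hangs attributed to this job so far */
    };

    static bool invalidate_job(struct job *j, int threshold)
    {
        /* atomic_fetch_add returns the old value, so add 1 back */
        return atomic_fetch_add(&j->karma, 1) + 1 > threshold;
    }

    int main(void)
    {
        struct job j = { .karma = 0 };

        /* with the default limit of 0, the first hang already drops the job */
        printf("drop after first hang: %s\n",
               invalidate_job(&j, job_hang_limit) ? "yes" : "no");
        return 0;
    }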