path: root/drivers/gpu/drm/i915/i915_reset.c
author		Chris Wilson	2019-01-25 14:22:28 +0100
committer	Chris Wilson	2019-01-25 15:27:22 +0100
commit		eb8d0f5af4ec2d172baf8b4b9a2199cd916b4e54 (patch)
tree		28293a5cdfd09863ce764d181c5039cce25b79a2 /drivers/gpu/drm/i915/i915_reset.c
parent		drm/i915/guc: Disable global reset (diff)
drm/i915: Remove GPU reset dependence on struct_mutex
Now that the submission backends are controlled via their own spinlocks, with
a wave of a magic wand we can lift the struct_mutex requirement around GPU
reset. That is we allow the submission frontend (userspace) to keep on
submitting while we process the GPU reset as we can suspend the backend
independently.

The major change is around the backoff/handoff strategy for performing the
reset. With no mutex deadlock, we no longer have to coordinate with any
waiter, and just perform the reset immediately.

Testcase: igt/gem_mmap_gtt/hang # regresses
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190125132230.22221-3-chris@chris-wilson.co.uk
Diffstat (limited to 'drivers/gpu/drm/i915/i915_reset.c')
-rw-r--r--	drivers/gpu/drm/i915/i915_reset.c	392
1 file changed, 179 insertions(+), 213 deletions(-)
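
For orientation before reading the diff: the patch drops the old inline retry loop from i915_reset() and routes the whole reset through a new do_reset() helper, which (optionally under stop_machine(), see RESET_UNDER_STOP_MACHINE below) performs the global reset and retries up to RESET_MAX_RETRIES further times with a 100ms pause between attempts. The following is a minimal userspace C model of just that retry policy, with a stub standing in for intel_gpu_reset(); it is illustrative only and not kernel code.

/* Standalone model of the retry policy in the new do_reset() below. */
#include <stdio.h>
#include <unistd.h>

#define RESET_MAX_RETRIES 3

static int failures_left = 2;           /* pretend the first two attempts fail */

static int mock_gpu_reset(void)         /* stand-in for intel_gpu_reset() */
{
	return failures_left-- > 0 ? -1 : 0;
}

static int do_reset_model(void)
{
	int err, i;

	err = mock_gpu_reset();
	for (i = 0; err && i < RESET_MAX_RETRIES; i++) {
		usleep(100 * 1000);     /* the kernel code uses msleep(100) */
		err = mock_gpu_reset();
	}

	return err;                     /* 0 on success, last error otherwise */
}

int main(void)
{
	printf("do_reset_model() -> %d\n", do_reset_model());
	return 0;
}

As in the patch, a first attempt is always made before the retry loop, so at most 1 + RESET_MAX_RETRIES attempts are made in total.
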
diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
index 33408c4e6358..68af017ee548 100644
--- a/drivers/gpu/drm/i915/i915_reset.c
+++ b/drivers/gpu/drm/i915/i915_reset.c
@@ -5,6 +5,7 @@
*/
#include <linux/sched/mm.h>
+#include <linux/stop_machine.h>
#include "i915_drv.h"
#include "i915_gpu_error.h"
@@ -14,27 +15,31 @@
#define RESET_MAX_RETRIES 3
+/* XXX How to handle concurrent GGTT updates using tiling registers? */
+#define RESET_UNDER_STOP_MACHINE 0
+
static void engine_skip_context(struct i915_request *rq)
{
struct intel_engine_cs *engine = rq->engine;
struct i915_gem_context *hung_ctx = rq->gem_context;
struct i915_timeline *timeline = rq->timeline;
- unsigned long flags;
+ lockdep_assert_held(&engine->timeline.lock);
GEM_BUG_ON(timeline == &engine->timeline);
- spin_lock_irqsave(&engine->timeline.lock, flags);
spin_lock(&timeline->lock);
- list_for_each_entry_continue(rq, &engine->timeline.requests, link)
- if (rq->gem_context == hung_ctx)
- i915_request_skip(rq, -EIO);
+ if (rq->global_seqno) {
+ list_for_each_entry_continue(rq,
+ &engine->timeline.requests, link)
+ if (rq->gem_context == hung_ctx)
+ i915_request_skip(rq, -EIO);
+ }
list_for_each_entry(rq, &timeline->requests, link)
i915_request_skip(rq, -EIO);
spin_unlock(&timeline->lock);
- spin_unlock_irqrestore(&engine->timeline.lock, flags);
}
static void client_mark_guilty(struct drm_i915_file_private *file_priv,
@@ -61,7 +66,7 @@ static void client_mark_guilty(struct drm_i915_file_private *file_priv,
}
}
-static void context_mark_guilty(struct i915_gem_context *ctx)
+static bool context_mark_guilty(struct i915_gem_context *ctx)
{
unsigned int score;
bool banned, bannable;
@@ -74,7 +79,7 @@ static void context_mark_guilty(struct i915_gem_context *ctx)
/* Cool contexts don't accumulate client ban score */
if (!bannable)
- return;
+ return false;
if (banned) {
DRM_DEBUG_DRIVER("context %s: guilty %d, score %u, banned\n",
@@ -85,6 +90,8 @@ static void context_mark_guilty(struct i915_gem_context *ctx)
if (!IS_ERR_OR_NULL(ctx->file_priv))
client_mark_guilty(ctx->file_priv, ctx);
+
+ return banned;
}
static void context_mark_innocent(struct i915_gem_context *ctx)
@@ -92,6 +99,21 @@ static void context_mark_innocent(struct i915_gem_context *ctx)
atomic_inc(&ctx->active_count);
}
+void i915_reset_request(struct i915_request *rq, bool guilty)
+{
+ lockdep_assert_held(&rq->engine->timeline.lock);
+ GEM_BUG_ON(i915_request_completed(rq));
+
+ if (guilty) {
+ i915_request_skip(rq, -EIO);
+ if (context_mark_guilty(rq->gem_context))
+ engine_skip_context(rq);
+ } else {
+ dma_fence_set_error(&rq->fence, -EAGAIN);
+ context_mark_innocent(rq->gem_context);
+ }
+}
+
static void gen3_stop_engine(struct intel_engine_cs *engine)
{
struct drm_i915_private *dev_priv = engine->i915;
@@ -604,11 +626,8 @@ int intel_reset_guc(struct drm_i915_private *i915)
* Ensure irq handler finishes, and not run again.
* Also return the active request so that we only search for it once.
*/
-static struct i915_request *
-reset_prepare_engine(struct intel_engine_cs *engine)
+static void reset_prepare_engine(struct intel_engine_cs *engine)
{
- struct i915_request *rq;
-
/*
* During the reset sequence, we must prevent the engine from
* entering RC6. As the context state is undefined until we restart
@@ -617,162 +636,85 @@ reset_prepare_engine(struct intel_engine_cs *engine)
* GPU state upon resume, i.e. fail to restart after a reset.
*/
intel_uncore_forcewake_get(engine->i915, FORCEWAKE_ALL);
-
- rq = engine->reset.prepare(engine);
- if (rq && rq->fence.error == -EIO)
- rq = ERR_PTR(-EIO); /* Previous reset failed! */
-
- return rq;
+ engine->reset.prepare(engine);
}
-static int reset_prepare(struct drm_i915_private *i915)
+static void reset_prepare(struct drm_i915_private *i915)
{
struct intel_engine_cs *engine;
- struct i915_request *rq;
enum intel_engine_id id;
- int err = 0;
- for_each_engine(engine, i915, id) {
- rq = reset_prepare_engine(engine);
- if (IS_ERR(rq)) {
- err = PTR_ERR(rq);
- continue;
- }
-
- engine->hangcheck.active_request = rq;
- }
+ for_each_engine(engine, i915, id)
+ reset_prepare_engine(engine);
- i915_gem_revoke_fences(i915);
intel_uc_sanitize(i915);
-
- return err;
}
-/* Returns the request if it was guilty of the hang */
-static struct i915_request *
-reset_request(struct intel_engine_cs *engine,
- struct i915_request *rq,
- bool stalled)
+static int gt_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
{
+ struct intel_engine_cs *engine;
+ enum intel_engine_id id;
+ int err;
+
/*
- * The guilty request will get skipped on a hung engine.
- *
- * Users of client default contexts do not rely on logical
- * state preserved between batches so it is safe to execute
- * queued requests following the hang. Non default contexts
- * rely on preserved state, so skipping a batch loses the
- * evolution of the state and it needs to be considered corrupted.
- * Executing more queued batches on top of corrupted state is
- * risky. But we take the risk by trying to advance through
- * the queued requests in order to make the client behaviour
- * more predictable around resets, by not throwing away random
- * amount of batches it has prepared for execution. Sophisticated
- * clients can use gem_reset_stats_ioctl and dma fence status
- * (exported via sync_file info ioctl on explicit fences) to observe
- * when it loses the context state and should rebuild accordingly.
- *
- * The context ban, and ultimately the client ban, mechanism are safety
- * valves if client submission ends up resulting in nothing more than
- * subsequent hangs.
+ * Everything depends on having the GTT running, so we need to start
+ * there.
*/
+ err = i915_ggtt_enable_hw(i915);
+ if (err)
+ return err;
- if (i915_request_completed(rq)) {
- GEM_TRACE("%s pardoned global=%d (fence %llx:%lld), current %d\n",
- engine->name, rq->global_seqno,
- rq->fence.context, rq->fence.seqno,
- intel_engine_get_seqno(engine));
- stalled = false;
- }
-
- if (stalled) {
- context_mark_guilty(rq->gem_context);
- i915_request_skip(rq, -EIO);
+ for_each_engine(engine, i915, id)
+ intel_engine_reset(engine, stalled_mask & ENGINE_MASK(id));
- /* If this context is now banned, skip all pending requests. */
- if (i915_gem_context_is_banned(rq->gem_context))
- engine_skip_context(rq);
- } else {
- /*
- * Since this is not the hung engine, it may have advanced
- * since the hang declaration. Double check by refinding
- * the active request at the time of the reset.
- */
- rq = i915_gem_find_active_request(engine);
- if (rq) {
- unsigned long flags;
-
- context_mark_innocent(rq->gem_context);
- dma_fence_set_error(&rq->fence, -EAGAIN);
-
- /* Rewind the engine to replay the incomplete rq */
- spin_lock_irqsave(&engine->timeline.lock, flags);
- rq = list_prev_entry(rq, link);
- if (&rq->link == &engine->timeline.requests)
- rq = NULL;
- spin_unlock_irqrestore(&engine->timeline.lock, flags);
- }
- }
+ i915_gem_restore_fences(i915);
- return rq;
+ return err;
}
-static void reset_engine(struct intel_engine_cs *engine,
- struct i915_request *rq,
- bool stalled)
+static void reset_finish_engine(struct intel_engine_cs *engine)
{
- if (rq)
- rq = reset_request(engine, rq, stalled);
-
- /* Setup the CS to resume from the breadcrumb of the hung request */
- engine->reset.reset(engine, rq);
+ engine->reset.finish(engine);
+ intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL);
}
-static void gt_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
+struct i915_gpu_restart {
+ struct work_struct work;
+ struct drm_i915_private *i915;
+};
+
+static void restart_work(struct work_struct *work)
{
+ struct i915_gpu_restart *arg = container_of(work, typeof(*arg), work);
+ struct drm_i915_private *i915 = arg->i915;
struct intel_engine_cs *engine;
enum intel_engine_id id;
+ intel_wakeref_t wakeref;
- lockdep_assert_held(&i915->drm.struct_mutex);
-
- i915_retire_requests(i915);
+ wakeref = intel_runtime_pm_get(i915);
+ mutex_lock(&i915->drm.struct_mutex);
+ WRITE_ONCE(i915->gpu_error.restart, NULL);
for_each_engine(engine, i915, id) {
- struct intel_context *ce;
-
- reset_engine(engine,
- engine->hangcheck.active_request,
- stalled_mask & ENGINE_MASK(id));
- ce = fetch_and_zero(&engine->last_retired_context);
- if (ce)
- intel_context_unpin(ce);
+ struct i915_request *rq;
/*
* Ostensibly, we always want a context loaded for powersaving,
* so if the engine is idle after the reset, send a request
* to load our scratch kernel_context.
- *
- * More mysteriously, if we leave the engine idle after a reset,
- * the next userspace batch may hang, with what appears to be
- * an incoherent read by the CS (presumably stale TLB). An
- * empty request appears sufficient to paper over the glitch.
*/
- if (intel_engine_is_idle(engine)) {
- struct i915_request *rq;
+ if (!intel_engine_is_idle(engine))
+ continue;
- rq = i915_request_alloc(engine, i915->kernel_context);
- if (!IS_ERR(rq))
- i915_request_add(rq);
- }
+ rq = i915_request_alloc(engine, i915->kernel_context);
+ if (!IS_ERR(rq))
+ i915_request_add(rq);
}
- i915_gem_restore_fences(i915);
-}
-
-static void reset_finish_engine(struct intel_engine_cs *engine)
-{
- engine->reset.finish(engine);
+ mutex_unlock(&i915->drm.struct_mutex);
+ intel_runtime_pm_put(i915, wakeref);
- intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL);
+ kfree(arg);
}
static void reset_finish(struct drm_i915_private *i915)
@@ -780,11 +722,30 @@ static void reset_finish(struct drm_i915_private *i915)
struct intel_engine_cs *engine;
enum intel_engine_id id;
- lockdep_assert_held(&i915->drm.struct_mutex);
-
- for_each_engine(engine, i915, id) {
- engine->hangcheck.active_request = NULL;
+ for_each_engine(engine, i915, id)
reset_finish_engine(engine);
+}
+
+static void reset_restart(struct drm_i915_private *i915)
+{
+ struct i915_gpu_restart *arg;
+
+ /*
+ * Following the reset, ensure that we always reload context for
+ * powersaving, and to correct engine->last_retired_context. Since
+ * this requires us to submit a request, queue a worker to do that
+ * task for us to evade any locking here.
+ */
+ if (READ_ONCE(i915->gpu_error.restart))
+ return;
+
+ arg = kmalloc(sizeof(*arg), GFP_KERNEL);
+ if (arg) {
+ arg->i915 = i915;
+ INIT_WORK(&arg->work, restart_work);
+
+ WRITE_ONCE(i915->gpu_error.restart, arg);
+ queue_work(i915->wq, &arg->work);
}
}
@@ -873,8 +834,6 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
struct i915_timeline *tl;
bool ret = false;
- lockdep_assert_held(&i915->drm.struct_mutex);
-
if (!test_bit(I915_WEDGED, &error->flags))
return true;
@@ -897,9 +856,9 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
*/
list_for_each_entry(tl, &i915->gt.timelines, link) {
struct i915_request *rq;
+ long timeout;
- rq = i915_gem_active_peek(&tl->last_request,
- &i915->drm.struct_mutex);
+ rq = i915_gem_active_get_unlocked(&tl->last_request);
if (!rq)
continue;
@@ -914,12 +873,12 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
* and when the seqno passes the fence, the signaler
* then signals the fence waking us up).
*/
- if (dma_fence_default_wait(&rq->fence, true,
- MAX_SCHEDULE_TIMEOUT) < 0)
+ timeout = dma_fence_default_wait(&rq->fence, true,
+ MAX_SCHEDULE_TIMEOUT);
+ i915_request_put(rq);
+ if (timeout < 0)
goto unlock;
}
- i915_retire_requests(i915);
- GEM_BUG_ON(i915->gt.active_requests);
intel_engines_sanitize(i915, false);
@@ -933,7 +892,6 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
* context and do not require stop_machine().
*/
intel_engines_reset_default_submission(i915);
- i915_gem_contexts_lost(i915);
GEM_TRACE("end\n");
@@ -946,6 +904,52 @@ unlock:
return ret;
}
+struct __i915_reset {
+ struct drm_i915_private *i915;
+ unsigned int stalled_mask;
+};
+
+static int __i915_reset__BKL(void *data)
+{
+ struct __i915_reset *arg = data;
+ int err;
+
+ err = intel_gpu_reset(arg->i915, ALL_ENGINES);
+ if (err)
+ return err;
+
+ return gt_reset(arg->i915, arg->stalled_mask);
+}
+
+#if RESET_UNDER_STOP_MACHINE
+/*
+ * XXX An alternative to using stop_machine would be to park only the
+ * processes that have a GGTT mmap. By remote parking the threads (SIGSTOP)
+ * we should be able to prevent their memory accesses via the lost fence
+ * registers over the course of the reset without the potential recursive
+ * locking of mutexes between the pagefault handler and reset.
+ *
+ * See igt/gem_mmap_gtt/hang
+ */
+#define __do_reset(fn, arg) stop_machine(fn, arg, NULL)
+#else
+#define __do_reset(fn, arg) fn(arg)
+#endif
+
+static int do_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
+{
+ struct __i915_reset arg = { i915, stalled_mask };
+ int err, i;
+
+ err = __do_reset(__i915_reset__BKL, &arg);
+ for (i = 0; err && i < RESET_MAX_RETRIES; i++) {
+ msleep(100);
+ err = __do_reset(__i915_reset__BKL, &arg);
+ }
+
+ return err;
+}
+
/**
* i915_reset - reset chip after a hang
* @i915: #drm_i915_private to reset
@@ -971,31 +975,22 @@ void i915_reset(struct drm_i915_private *i915,
{
struct i915_gpu_error *error = &i915->gpu_error;
int ret;
- int i;
GEM_TRACE("flags=%lx\n", error->flags);
might_sleep();
- lockdep_assert_held(&i915->drm.struct_mutex);
assert_rpm_wakelock_held(i915);
GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));
- if (!test_bit(I915_RESET_HANDOFF, &error->flags))
- return;
-
/* Clear any previous failed attempts at recovery. Time to try again. */
if (!i915_gem_unset_wedged(i915))
- goto wakeup;
+ return;
if (reason)
dev_notice(i915->drm.dev, "Resetting chip for %s\n", reason);
error->reset_count++;
- ret = reset_prepare(i915);
- if (ret) {
- dev_err(i915->drm.dev, "GPU recovery failed\n");
- goto taint;
- }
+ reset_prepare(i915);
if (!intel_has_gpu_reset(i915)) {
if (i915_modparams.reset)
@@ -1005,32 +1000,11 @@ void i915_reset(struct drm_i915_private *i915,
goto error;
}
- for (i = 0; i < RESET_MAX_RETRIES; i++) {
- ret = intel_gpu_reset(i915, ALL_ENGINES);
- if (ret == 0)
- break;
-
- msleep(100);
- }
- if (ret) {
+ if (do_reset(i915, stalled_mask)) {
dev_err(i915->drm.dev, "Failed to reset chip\n");
goto taint;
}
- /* Ok, now get things going again... */
-
- /*
- * Everything depends on having the GTT running, so we need to start
- * there.
- */
- ret = i915_ggtt_enable_hw(i915);
- if (ret) {
- DRM_ERROR("Failed to re-enable GGTT following reset (%d)\n",
- ret);
- goto error;
- }
-
- gt_reset(i915, stalled_mask);
intel_overlay_reset(i915);
/*
@@ -1052,9 +1026,8 @@ void i915_reset(struct drm_i915_private *i915,
finish:
reset_finish(i915);
-wakeup:
- clear_bit(I915_RESET_HANDOFF, &error->flags);
- wake_up_bit(&error->flags, I915_RESET_HANDOFF);
+ if (!i915_terminally_wedged(error))
+ reset_restart(i915);
return;
taint:
@@ -1073,7 +1046,6 @@ taint:
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
error:
i915_gem_set_wedged(i915);
- i915_retire_requests(i915);
goto finish;
}
@@ -1099,18 +1071,16 @@ static inline int intel_gt_reset_engine(struct drm_i915_private *i915,
int i915_reset_engine(struct intel_engine_cs *engine, const char *msg)
{
struct i915_gpu_error *error = &engine->i915->gpu_error;
- struct i915_request *active_request;
int ret;
GEM_TRACE("%s flags=%lx\n", engine->name, error->flags);
GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &error->flags));
- active_request = reset_prepare_engine(engine);
- if (IS_ERR_OR_NULL(active_request)) {
- /* Either the previous reset failed, or we pardon the reset. */
- ret = PTR_ERR(active_request);
- goto out;
- }
+ if (i915_seqno_passed(intel_engine_get_seqno(engine),
+ intel_engine_last_submit(engine)))
+ return 0;
+
+ reset_prepare_engine(engine);
if (msg)
dev_notice(engine->i915->drm.dev,
@@ -1134,7 +1104,7 @@ int i915_reset_engine(struct intel_engine_cs *engine, const char *msg)
* active request and can drop it, adjust head to skip the offending
* request to resume executing remaining requests in the queue.
*/
- reset_engine(engine, active_request, true);
+ intel_engine_reset(engine, true);
/*
* The engine and its registers (and workarounds in case of render)
@@ -1171,30 +1141,7 @@ static void i915_reset_device(struct drm_i915_private *i915,
i915_wedge_on_timeout(&w, i915, 5 * HZ) {
intel_prepare_reset(i915);
- error->reason = reason;
- error->stalled_mask = engine_mask;
-
- /* Signal that locked waiters should reset the GPU */
- smp_mb__before_atomic();
- set_bit(I915_RESET_HANDOFF, &error->flags);
- wake_up_all(&error->wait_queue);
-
- /*
- * Wait for anyone holding the lock to wakeup, without
- * blocking indefinitely on struct_mutex.
- */
- do {
- if (mutex_trylock(&i915->drm.struct_mutex)) {
- i915_reset(i915, engine_mask, reason);
- mutex_unlock(&i915->drm.struct_mutex);
- }
- } while (wait_on_bit_timeout(&error->flags,
- I915_RESET_HANDOFF,
- TASK_UNINTERRUPTIBLE,
- 1));
-
- error->stalled_mask = 0;
- error->reason = NULL;
+ i915_reset(i915, engine_mask, reason);
intel_finish_reset(i915);
}
@@ -1350,6 +1297,25 @@ out:
intel_runtime_pm_put(i915, wakeref);
}
+bool i915_reset_flush(struct drm_i915_private *i915)
+{
+ int err;
+
+ cancel_delayed_work_sync(&i915->gpu_error.hangcheck_work);
+
+ flush_workqueue(i915->wq);
+ GEM_BUG_ON(READ_ONCE(i915->gpu_error.restart));
+
+ mutex_lock(&i915->drm.struct_mutex);
+ err = i915_gem_wait_for_idle(i915,
+ I915_WAIT_LOCKED |
+ I915_WAIT_FOR_IDLE_BOOST,
+ MAX_SCHEDULE_TIMEOUT);
+ mutex_unlock(&i915->drm.struct_mutex);
+
+ return !err;
+}
+
static void i915_wedge_me(struct work_struct *work)
{
struct i915_wedge_me *w = container_of(work, typeof(*w), work.work);