summaryrefslogtreecommitdiffstats
path: root/drivers/gpu/drm/i915/gem/i915_gem_context.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/i915/gem/i915_gem_context.c')
-rw-r--r--drivers/gpu/drm/i915/gem/i915_gem_context.c210
1 files changed, 209 insertions, 1 deletions
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
index 7b01f4605f21..de6e55af82cf 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
@@ -69,8 +69,10 @@
#include <drm/i915_drm.h>
-#include "gt/intel_lrc_reg.h"
+#include "gt/intel_engine_heartbeat.h"
#include "gt/intel_engine_user.h"
+#include "gt/intel_lrc_reg.h"
+#include "gt/intel_ring.h"
#include "i915_gem_context.h"
#include "i915_globals.h"
@@ -276,6 +278,153 @@ void i915_gem_context_release(struct kref *ref)
schedule_work(&gc->free_work);
}
+static inline struct i915_gem_engines *
+__context_engines_static(const struct i915_gem_context *ctx)
+{
+ return rcu_dereference_protected(ctx->engines, true);
+}
+
+static bool __reset_engine(struct intel_engine_cs *engine)
+{
+ struct intel_gt *gt = engine->gt;
+ bool success = false;
+
+ if (!intel_has_reset_engine(gt))
+ return false;
+
+ if (!test_and_set_bit(I915_RESET_ENGINE + engine->id,
+ &gt->reset.flags)) {
+ success = intel_engine_reset(engine, NULL) == 0;
+ clear_and_wake_up_bit(I915_RESET_ENGINE + engine->id,
+ &gt->reset.flags);
+ }
+
+ return success;
+}
+
+static void __reset_context(struct i915_gem_context *ctx,
+ struct intel_engine_cs *engine)
+{
+ intel_gt_handle_error(engine->gt, engine->mask, 0,
+ "context closure in %s", ctx->name);
+}
+
+static bool __cancel_engine(struct intel_engine_cs *engine)
+{
+ /*
+ * Send a "high priority pulse" down the engine to cause the
+ * current request to be momentarily preempted. (If it fails to
+ * be preempted, it will be reset). As we have marked our context
+ * as banned, any incomplete request, including any running, will
+ * be skipped following the preemption.
+ *
+ * If there is no hangchecking (one of the reasons why we try to
+ * cancel the context) and no forced preemption, there may be no
+ * means by which we reset the GPU and evict the persistent hog.
+ * Ergo if we are unable to inject a preemptive pulse that can
+ * kill the banned context, we fallback to doing a local reset
+ * instead.
+ */
+ if (IS_ACTIVE(CONFIG_DRM_I915_PREEMPT_TIMEOUT) &&
+ !intel_engine_pulse(engine))
+ return true;
+
+ /* If we are unable to send a pulse, try resetting this engine. */
+ return __reset_engine(engine);
+}
+
+static struct intel_engine_cs *__active_engine(struct i915_request *rq)
+{
+ struct intel_engine_cs *engine, *locked;
+
+ /*
+ * Serialise with __i915_request_submit() so that it sees
+ * is-banned?, or we know the request is already inflight.
+ */
+ locked = READ_ONCE(rq->engine);
+ spin_lock_irq(&locked->active.lock);
+ while (unlikely(locked != (engine = READ_ONCE(rq->engine)))) {
+ spin_unlock(&locked->active.lock);
+ spin_lock(&engine->active.lock);
+ locked = engine;
+ }
+
+ engine = NULL;
+ if (i915_request_is_active(rq) && !rq->fence.error)
+ engine = rq->engine;
+
+ spin_unlock_irq(&locked->active.lock);
+
+ return engine;
+}
+
+static struct intel_engine_cs *active_engine(struct intel_context *ce)
+{
+ struct intel_engine_cs *engine = NULL;
+ struct i915_request *rq;
+
+ if (!ce->timeline)
+ return NULL;
+
+ rcu_read_lock();
+ list_for_each_entry_reverse(rq, &ce->timeline->requests, link) {
+ if (i915_request_completed(rq))
+ break;
+
+ /* Check with the backend if the request is inflight */
+ engine = __active_engine(rq);
+ if (engine)
+ break;
+ }
+ rcu_read_unlock();
+
+ return engine;
+}
+
+static void kill_context(struct i915_gem_context *ctx)
+{
+ struct i915_gem_engines_iter it;
+ struct intel_context *ce;
+
+ /*
+ * If we are already banned, it was due to a guilty request causing
+ * a reset and the entire context being evicted from the GPU.
+ */
+ if (i915_gem_context_is_banned(ctx))
+ return;
+
+ i915_gem_context_set_banned(ctx);
+
+ /*
+ * Map the user's engine back to the actual engines; one virtual
+ * engine will be mapped to multiple engines, and using ctx->engine[]
+ * the same engine may be have multiple instances in the user's map.
+ * However, we only care about pending requests, so only include
+ * engines on which there are incomplete requests.
+ */
+ for_each_gem_engine(ce, __context_engines_static(ctx), it) {
+ struct intel_engine_cs *engine;
+
+ /*
+ * Check the current active state of this context; if we
+ * are currently executing on the GPU we need to evict
+ * ourselves. On the other hand, if we haven't yet been
+ * submitted to the GPU or if everything is complete,
+ * we have nothing to do.
+ */
+ engine = active_engine(ce);
+
+ /* First attempt to gracefully cancel the context */
+ if (engine && !__cancel_engine(engine))
+ /*
+ * If we are unable to send a preemptive pulse to bump
+ * the context from the GPU, we have to resort to a full
+ * reset. We hope the collateral damage is worth it.
+ */
+ __reset_context(ctx, engine);
+ }
+}
+
static void context_close(struct i915_gem_context *ctx)
{
struct i915_address_space *vm;
@@ -298,9 +447,47 @@ static void context_close(struct i915_gem_context *ctx)
lut_close(ctx);
mutex_unlock(&ctx->mutex);
+
+ /*
+ * If the user has disabled hangchecking, we can not be sure that
+ * the batches will ever complete after the context is closed,
+ * keeping the context and all resources pinned forever. So in this
+ * case we opt to forcibly kill off all remaining requests on
+ * context close.
+ */
+ if (!i915_gem_context_is_persistent(ctx) ||
+ !i915_modparams.enable_hangcheck)
+ kill_context(ctx);
+
i915_gem_context_put(ctx);
}
+static int __context_set_persistence(struct i915_gem_context *ctx, bool state)
+{
+ if (i915_gem_context_is_persistent(ctx) == state)
+ return 0;
+
+ if (state) {
+ /*
+ * Only contexts that are short-lived [that will expire or be
+ * reset] are allowed to survive past termination. We require
+ * hangcheck to ensure that the persistent requests are healthy.
+ */
+ if (!i915_modparams.enable_hangcheck)
+ return -EINVAL;
+
+ i915_gem_context_set_persistence(ctx);
+ } else {
+ /* To cancel a context we use "preempt-to-idle" */
+ if (!(ctx->i915->caps.scheduler & I915_SCHEDULER_CAP_PREEMPTION))
+ return -ENODEV;
+
+ i915_gem_context_clear_persistence(ctx);
+ }
+
+ return 0;
+}
+
static struct i915_gem_context *
__create_context(struct drm_i915_private *i915)
{
@@ -335,6 +522,7 @@ __create_context(struct drm_i915_private *i915)
i915_gem_context_set_bannable(ctx);
i915_gem_context_set_recoverable(ctx);
+ __context_set_persistence(ctx, true /* cgroup hook? */);
for (i = 0; i < ARRAY_SIZE(ctx->hang_timestamp); i++)
ctx->hang_timestamp[i] = jiffies - CONTEXT_FAST_HANG_JIFFIES;
@@ -491,6 +679,7 @@ i915_gem_context_create_kernel(struct drm_i915_private *i915, int prio)
return ctx;
i915_gem_context_clear_bannable(ctx);
+ i915_gem_context_set_persistence(ctx);
ctx->sched.priority = I915_USER_PRIORITY(prio);
GEM_BUG_ON(!i915_gem_context_is_kernel(ctx));
@@ -1601,6 +1790,16 @@ err_free:
return err;
}
+static int
+set_persistence(struct i915_gem_context *ctx,
+ const struct drm_i915_gem_context_param *args)
+{
+ if (args->size)
+ return -EINVAL;
+
+ return __context_set_persistence(ctx, args->value);
+}
+
static int ctx_setparam(struct drm_i915_file_private *fpriv,
struct i915_gem_context *ctx,
struct drm_i915_gem_context_param *args)
@@ -1678,6 +1877,10 @@ static int ctx_setparam(struct drm_i915_file_private *fpriv,
ret = set_engines(ctx, args);
break;
+ case I915_CONTEXT_PARAM_PERSISTENCE:
+ ret = set_persistence(ctx, args);
+ break;
+
case I915_CONTEXT_PARAM_BAN_PERIOD:
default:
ret = -EINVAL;
@@ -2130,6 +2333,11 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
ret = get_engines(ctx, args);
break;
+ case I915_CONTEXT_PARAM_PERSISTENCE:
+ args->size = 0;
+ args->value = i915_gem_context_is_persistent(ctx);
+ break;
+
case I915_CONTEXT_PARAM_BAN_PERIOD:
default:
ret = -EINVAL;