From 742379c0c4001fd2a6e02005c1ffa1ff611b28fa Mon Sep 17 00:00:00 2001
From: Chris Wilson
Date: Fri, 10 Jan 2020 12:30:56 +0000
Subject: drm/i915: Start chopping up the GPU error capture

In the near future, we will want to start a GPU error capture from a
new context, from inside the softirq region of a forced preemption.
Doing so requires us to break up the monolithic error capture to
provide new entry points with finer control; in particular, focusing
on one engine/gt, and being able to compose an error state from little
pieces of HW capture.

Signed-off-by: Chris Wilson
Cc: Andi Shyti
Acked-by: Andi Shyti
Link: https://patchwork.freedesktop.org/patch/msgid/20200110123059.1348712-1-chris@chris-wilson.co.uk
---
 drivers/gpu/drm/i915/i915_gpu_error.c | 1184 ++++++++++++++++++---------
 1 file changed, 647 insertions(+), 537 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index fda0977d2059..730129ca4c17 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -41,6 +41,7 @@
 
 #include "gem/i915_gem_context.h"
 #include "gem/i915_gem_lmem.h"
+#include "gt/intel_gt_pm.h"
 
 #include "i915_drv.h"
 #include "i915_gpu_error.h"
@@ -232,14 +233,13 @@ static void pool_free(struct pagevec *pv, void *addr)
 
 #ifdef CONFIG_DRM_I915_COMPRESS_ERROR
 
-struct compress {
+struct i915_vma_compress {
 	struct pagevec pool;
 	struct z_stream_s zstream;
 	void *tmp;
-	bool wc;
 };
 
-static bool compress_init(struct compress *c)
+static bool compress_init(struct i915_vma_compress *c)
 {
 	struct z_stream_s *zstream = &c->zstream;
 
@@ -261,7 +261,7 @@ static bool compress_init(struct compress *c)
 	return true;
 }
 
-static bool compress_start(struct compress *c)
+static bool compress_start(struct i915_vma_compress *c)
 {
 	struct z_stream_s *zstream = &c->zstream;
 	void *workspace = zstream->workspace;
@@ -272,8 +272,8 @@
 	return zlib_deflateInit(zstream, Z_DEFAULT_COMPRESSION) == Z_OK;
 }
 
-static void *compress_next_page(struct compress *c,
-				struct drm_i915_error_object *dst)
+static void *compress_next_page(struct i915_vma_compress *c,
+				struct i915_vma_coredump *dst)
 {
 	void *page;
 
@@ -287,14 +287,15 @@ static void *compress_next_page(struct compress *c,
 	return dst->pages[dst->page_count++] = page;
 }
 
-static int compress_page(struct compress *c,
+static int compress_page(struct i915_vma_compress *c,
 			 void *src,
-			 struct drm_i915_error_object *dst)
+			 struct i915_vma_coredump *dst,
+			 bool wc)
 {
 	struct z_stream_s *zstream = &c->zstream;
 
 	zstream->next_in = src;
-	if (c->wc && c->tmp && i915_memcpy_from_wc(c->tmp, src, PAGE_SIZE))
+	if (wc && c->tmp && i915_memcpy_from_wc(c->tmp, src, PAGE_SIZE))
 		zstream->next_in = c->tmp;
 
 	zstream->avail_in = PAGE_SIZE;
@@ -318,8 +319,8 @@ static int compress_page(struct compress *c,
 	return 0;
 }
 
-static int compress_flush(struct compress *c,
-			  struct drm_i915_error_object *dst)
+static int compress_flush(struct i915_vma_compress *c,
+			  struct i915_vma_coredump *dst)
 {
 	struct z_stream_s *zstream = &c->zstream;
 
@@ -347,12 +348,12 @@ end:
 	return 0;
 }
 
-static void compress_finish(struct compress *c)
+static void compress_finish(struct i915_vma_compress *c)
 {
 	zlib_deflateEnd(&c->zstream);
 }
 
-static void compress_fini(struct compress *c)
+static void compress_fini(struct i915_vma_compress *c)
 {
 	kfree(c->zstream.workspace);
 	if (c->tmp)
@@ -367,24 +368,24 @@ static void err_compression_marker(struct drm_i915_error_state_buf *m)
 
 #else
 
-struct compress {
+struct i915_vma_compress {
 	struct pagevec pool;
-	bool wc;
 };
 
-static bool compress_init(struct compress *c)
+static bool compress_init(struct i915_vma_compress *c)
 {
 	return pool_init(&c->pool, ALLOW_FAIL) == 0;
 }
 
-static bool compress_start(struct compress *c)
+static bool compress_start(struct i915_vma_compress *c)
 {
 	return true;
 }
 
-static int compress_page(struct compress *c,
+static int compress_page(struct i915_vma_compress *c,
 			 void *src,
-			 struct drm_i915_error_object *dst)
+			 struct i915_vma_coredump *dst,
+			 bool wc)
 {
 	void *ptr;
 
@@ -392,24 +393,24 @@ static int compress_page(struct compress *c,
 	if (!ptr)
 		return -ENOMEM;
 
-	if (!(c->wc && i915_memcpy_from_wc(ptr, src, PAGE_SIZE)))
+	if (!(wc && i915_memcpy_from_wc(ptr, src, PAGE_SIZE)))
 		memcpy(ptr, src, PAGE_SIZE);
 	dst->pages[dst->page_count++] = ptr;
 
 	return 0;
 }
 
-static int compress_flush(struct compress *c,
-			  struct drm_i915_error_object *dst)
+static int compress_flush(struct i915_vma_compress *c,
+			  struct i915_vma_coredump *dst)
 {
 	return 0;
 }
 
-static void compress_finish(struct compress *c)
+static void compress_finish(struct i915_vma_compress *c)
 {
 }
 
-static void compress_fini(struct compress *c)
+static void compress_fini(struct i915_vma_compress *c)
 {
 	pool_fini(&c->pool);
 }
@@ -422,7 +423,7 @@ static void err_compression_marker(struct drm_i915_error_state_buf *m)
 
 #endif
 
 static void error_print_instdone(struct drm_i915_error_state_buf *m,
-				 const struct drm_i915_error_engine *ee)
+				 const struct intel_engine_coredump *ee)
 {
 	const struct sseu_dev_info *sseu = &RUNTIME_INFO(m->i915)->sseu;
 	int slice;
@@ -453,40 +454,56 @@ static void error_print_instdone(struct drm_i915_error_state_buf *m,
 
 static void error_print_request(struct drm_i915_error_state_buf *m,
 				const char *prefix,
-				const struct drm_i915_error_request *erq,
-				const unsigned long epoch)
+				const struct i915_request_coredump *erq)
 {
 	if (!erq->seqno)
 		return;
 
-	err_printf(m, "%s pid %d, seqno %8x:%08x%s%s, prio %d, emitted %dms, start %08x, head %08x, tail %08x\n",
+	err_printf(m, "%s pid %d, seqno %8x:%08x%s%s, prio %d, start %08x, head %08x, tail %08x\n",
 		   prefix, erq->pid, erq->context, erq->seqno,
 		   test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
 			    &erq->flags) ? "!" : "",
 		   test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
"+" : "", erq->sched_attr.priority, - jiffies_to_msecs(erq->jiffies - epoch), erq->start, erq->head, erq->tail); } static void error_print_context(struct drm_i915_error_state_buf *m, const char *header, - const struct drm_i915_error_context *ctx) + const struct i915_gem_context_coredump *ctx) { err_printf(m, "%s%s[%d] prio %d, guilty %d active %d\n", header, ctx->comm, ctx->pid, ctx->sched_attr.priority, ctx->guilty, ctx->active); } +static struct i915_vma_coredump * +__find_vma(struct i915_vma_coredump *vma, const char *name) +{ + while (vma) { + if (strcmp(vma->name, name) == 0) + return vma; + vma = vma->next; + } + + return NULL; +} + +static struct i915_vma_coredump * +find_batch(const struct intel_engine_coredump *ee) +{ + return __find_vma(ee->vma, "batch"); +} + static void error_print_engine(struct drm_i915_error_state_buf *m, - const struct drm_i915_error_engine *ee, - const unsigned long epoch) + const struct intel_engine_coredump *ee) { + struct i915_vma_coredump *batch; int n; err_printf(m, "%s command stream:\n", ee->engine->name); - err_printf(m, " IDLE?: %s\n", yesno(ee->idle)); + err_printf(m, " CCID: 0x%08x\n", ee->ccid); err_printf(m, " START: 0x%08x\n", ee->start); err_printf(m, " HEAD: 0x%08x [0x%08x]\n", ee->head, ee->rq_head); err_printf(m, " TAIL: 0x%08x [0x%08x, 0x%08x]\n", @@ -501,9 +518,10 @@ static void error_print_engine(struct drm_i915_error_state_buf *m, error_print_instdone(m, ee); - if (ee->batchbuffer) { - u64 start = ee->batchbuffer->gtt_offset; - u64 end = start + ee->batchbuffer->gtt_size; + batch = find_batch(ee); + if (batch) { + u64 start = batch->gtt_offset; + u64 end = start + batch->gtt_size; err_printf(m, " batch: [0x%08x_%08x, 0x%08x_%08x]\n", upper_32_bits(start), lower_32_bits(start), @@ -541,7 +559,7 @@ static void error_print_engine(struct drm_i915_error_state_buf *m, for (n = 0; n < ee->num_ports; n++) { err_printf(m, " ELSP[%d]:", n); - error_print_request(m, " ", &ee->execlist[n], epoch); + error_print_request(m, " ", &ee->execlist[n]); } error_print_context(m, " Active context: ", &ee->context); @@ -556,38 +574,35 @@ void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...) va_end(args); } -static void print_error_obj(struct drm_i915_error_state_buf *m, +static void print_error_vma(struct drm_i915_error_state_buf *m, const struct intel_engine_cs *engine, - const char *name, - const struct drm_i915_error_object *obj) + const struct i915_vma_coredump *vma) { char out[ASCII85_BUFSZ]; int page; - if (!obj) + if (!vma) return; - if (name) { - err_printf(m, "%s --- %s = 0x%08x %08x\n", - engine ? engine->name : "global", name, - upper_32_bits(obj->gtt_offset), - lower_32_bits(obj->gtt_offset)); - } + err_printf(m, "%s --- %s = 0x%08x %08x\n", + engine ? 
+		   engine ? engine->name : "global", vma->name,
+		   upper_32_bits(vma->gtt_offset),
+		   lower_32_bits(vma->gtt_offset));
 
-	if (obj->gtt_page_sizes > I915_GTT_PAGE_SIZE_4K)
-		err_printf(m, "gtt_page_sizes = 0x%08x\n", obj->gtt_page_sizes);
+	if (vma->gtt_page_sizes > I915_GTT_PAGE_SIZE_4K)
+		err_printf(m, "gtt_page_sizes = 0x%08x\n", vma->gtt_page_sizes);
 
 	err_compression_marker(m);
-	for (page = 0; page < obj->page_count; page++) {
+	for (page = 0; page < vma->page_count; page++) {
 		int i, len;
 
 		len = PAGE_SIZE;
-		if (page == obj->page_count - 1)
-			len -= obj->unused;
+		if (page == vma->page_count - 1)
+			len -= vma->unused;
 		len = ascii85_encode_len(len);
 
 		for (i = 0; i < len; i++)
-			err_puts(m, ascii85_encode(obj->pages[page][i], out));
+			err_puts(m, ascii85_encode(vma->pages[page][i], out));
 	}
 	err_puts(m, "\n");
 }
@@ -626,18 +641,13 @@ static void err_print_pciid(struct drm_i915_error_state_buf *m,
 }
 
 static void err_print_uc(struct drm_i915_error_state_buf *m,
-			 const struct i915_error_uc *error_uc)
+			 const struct intel_uc_coredump *error_uc)
 {
 	struct drm_printer p = i915_error_printer(m);
-	const struct i915_gpu_state *error =
-		container_of(error_uc, typeof(*error), uc);
-
-	if (!error->device_info.has_gt_uc)
-		return;
 
 	intel_uc_fw_dump(&error_uc->guc_fw, &p);
 	intel_uc_fw_dump(&error_uc->huc_fw, &p);
-	print_error_obj(m, NULL, "GuC log buffer", error_uc->guc_log);
+	print_error_vma(m, NULL, error_uc->guc_log);
 }
 
 static void err_free_sgl(struct scatterlist *sgl)
@@ -657,12 +667,78 @@ static void err_free_sgl(struct scatterlist *sgl)
 	}
 }
 
+static void err_print_gt(struct drm_i915_error_state_buf *m,
+			 struct intel_gt_coredump *gt)
+{
+	const struct intel_engine_coredump *ee;
+	int i, j;
+
+	err_printf(m, "GT awake: %s\n", yesno(gt->awake));
+	err_printf(m, "EIR: 0x%08x\n", gt->eir);
+	err_printf(m, "IER: 0x%08x\n", gt->ier);
+	for (i = 0; i < gt->ngtier; i++)
+		err_printf(m, "GTIER[%d]: 0x%08x\n", i, gt->gtier[i]);
+	err_printf(m, "PGTBL_ER: 0x%08x\n", gt->pgtbl_er);
+	err_printf(m, "FORCEWAKE: 0x%08x\n", gt->forcewake);
+	err_printf(m, "DERRMR: 0x%08x\n", gt->derrmr);
+
+	for (i = 0; i < gt->nfence; i++)
+		err_printf(m, " fence[%d] = %08llx\n", i, gt->fence[i]);
+
+	if (IS_GEN_RANGE(m->i915, 6, 11)) {
+		err_printf(m, "ERROR: 0x%08x\n", gt->error);
+		err_printf(m, "DONE_REG: 0x%08x\n", gt->done_reg);
+	}
+
+	if (INTEL_GEN(m->i915) >= 8)
+		err_printf(m, "FAULT_TLB_DATA: 0x%08x 0x%08x\n",
+			   gt->fault_data1, gt->fault_data0);
+
+	if (IS_GEN(m->i915, 7))
+		err_printf(m, "ERR_INT: 0x%08x\n", gt->err_int);
+
+	if (IS_GEN_RANGE(m->i915, 8, 11))
+		err_printf(m, "GTT_CACHE_EN: 0x%08x\n", gt->gtt_cache);
+
+	if (IS_GEN(m->i915, 12))
+		err_printf(m, "AUX_ERR_DBG: 0x%08x\n", gt->aux_err);
+
+	if (INTEL_GEN(m->i915) >= 12) {
+		int i;
+
+		for (i = 0; i < GEN12_SFC_DONE_MAX; i++)
+			err_printf(m, " SFC_DONE[%d]: 0x%08x\n", i,
+				   gt->sfc_done[i]);
+
+		err_printf(m, " GAM_DONE: 0x%08x\n", gt->gam_done);
+	}
+
+	for (ee = gt->engine; ee; ee = ee->next) {
+		const struct i915_vma_coredump *vma;
+
+		error_print_engine(m, ee);
+
+		for (vma = ee->vma; vma; vma = vma->next)
+			print_error_vma(m, ee->engine, vma);
+
+		if (ee->num_requests) {
+			err_printf(m, "%s --- %d requests\n",
+				   ee->engine->name,
+				   ee->num_requests);
+			for (j = 0; j < ee->num_requests; j++)
+				error_print_request(m, " ", &ee->requests[j]);
+		}
+	}
+
+	if (gt->uc)
+		err_print_uc(m, gt->uc);
+}
+
 static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
-			       struct i915_gpu_state *error)
+			       struct i915_gpu_coredump *error)
 {
-	const struct drm_i915_error_engine *ee;
+	const struct intel_engine_coredump *ee;
 	struct timespec64 ts;
-	int i, j;
 
 	if (*error->error_msg)
 		err_printf(m, "%s\n", error->error_msg);
@@ -682,7 +758,7 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
 	err_printf(m, "Capture: %lu jiffies; %d ms ago\n",
 		   error->capture, jiffies_to_msecs(jiffies - error->capture));
 
-	for (ee = error->engine; ee; ee = ee->next)
+	for (ee = error->gt ? error->gt->engine : NULL; ee; ee = ee->next)
 		err_printf(m, "Active process (on ring %s): %s [%d]\n",
 			   ee->engine->name,
 			   ee->context.comm,
@@ -708,90 +784,11 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
 			   CSR_VERSION_MINOR(csr->version));
 	}
 
-	err_printf(m, "GT awake: %s\n", yesno(error->awake));
 	err_printf(m, "RPM wakelock: %s\n", yesno(error->wakelock));
 	err_printf(m, "PM suspended: %s\n", yesno(error->suspended));
-	err_printf(m, "EIR: 0x%08x\n", error->eir);
-	err_printf(m, "IER: 0x%08x\n", error->ier);
-	for (i = 0; i < error->ngtier; i++)
-		err_printf(m, "GTIER[%d]: 0x%08x\n", i, error->gtier[i]);
-	err_printf(m, "PGTBL_ER: 0x%08x\n", error->pgtbl_er);
-	err_printf(m, "FORCEWAKE: 0x%08x\n", error->forcewake);
-	err_printf(m, "DERRMR: 0x%08x\n", error->derrmr);
-	err_printf(m, "CCID: 0x%08x\n", error->ccid);
-
-	for (i = 0; i < error->nfence; i++)
-		err_printf(m, " fence[%d] = %08llx\n", i, error->fence[i]);
-
-	if (IS_GEN_RANGE(m->i915, 6, 11)) {
-		err_printf(m, "ERROR: 0x%08x\n", error->error);
-		err_printf(m, "DONE_REG: 0x%08x\n", error->done_reg);
-	}
-
-	if (INTEL_GEN(m->i915) >= 8)
-		err_printf(m, "FAULT_TLB_DATA: 0x%08x 0x%08x\n",
-			   error->fault_data1, error->fault_data0);
-
-	if (IS_GEN(m->i915, 7))
-		err_printf(m, "ERR_INT: 0x%08x\n", error->err_int);
-
-	if (IS_GEN_RANGE(m->i915, 8, 11))
-		err_printf(m, "GTT_CACHE_EN: 0x%08x\n", error->gtt_cache);
-
-	if (IS_GEN(m->i915, 12))
-		err_printf(m, "AUX_ERR_DBG: 0x%08x\n", error->aux_err);
-
-	if (INTEL_GEN(m->i915) >= 12) {
-		int i;
-
-		for (i = 0; i < GEN12_SFC_DONE_MAX; i++)
-			err_printf(m, " SFC_DONE[%d]: 0x%08x\n", i,
-				   error->sfc_done[i]);
-
-		err_printf(m, " GAM_DONE: 0x%08x\n", error->gam_done);
-	}
-
-	for (ee = error->engine; ee; ee = ee->next)
-		error_print_engine(m, ee, error->capture);
-
-	for (ee = error->engine; ee; ee = ee->next) {
-		const struct drm_i915_error_object *obj;
-
-		obj = ee->batchbuffer;
-		if (obj) {
-			err_puts(m, ee->engine->name);
-			if (ee->context.pid)
-				err_printf(m, " (submitted by %s [%d])",
-					   ee->context.comm,
-					   ee->context.pid);
-			err_printf(m, " --- gtt_offset = 0x%08x %08x\n",
-				   upper_32_bits(obj->gtt_offset),
-				   lower_32_bits(obj->gtt_offset));
-			print_error_obj(m, ee->engine, NULL, obj);
-		}
-
-		for (j = 0; j < ee->user_bo_count; j++)
-			print_error_obj(m, ee->engine, "user", ee->user_bo[j]);
-
-		if (ee->num_requests) {
-			err_printf(m, "%s --- %d requests\n",
-				   ee->engine->name,
-				   ee->num_requests);
-			for (j = 0; j < ee->num_requests; j++)
-				error_print_request(m, " ",
-						    &ee->requests[j],
-						    error->capture);
-		}
-
-		print_error_obj(m, ee->engine, "ringbuffer", ee->ringbuffer);
-		print_error_obj(m, ee->engine, "HW Status", ee->hws_page);
-		print_error_obj(m, ee->engine, "HW context", ee->ctx);
-		print_error_obj(m, ee->engine, "WA context", ee->wa_ctx);
-		print_error_obj(m, ee->engine,
-				"WA batchbuffer", ee->wa_batchbuffer);
-		print_error_obj(m, ee->engine,
-				"NULL context", ee->default_state);
-	}
+	if (error->gt)
+		err_print_gt(m, error->gt);
 
 	if (error->overlay)
 		intel_overlay_print_error_state(m, error->overlay);
@@ -802,10 +799,9 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
 	err_print_capabilities(m, &error->device_info, &error->runtime_info,
 			       &error->driver_caps);
 	err_print_params(m, &error->params);
-	err_print_uc(m, &error->uc);
 }
 
-static int err_print_to_sgl(struct i915_gpu_state *error)
+static int err_print_to_sgl(struct i915_gpu_coredump *error)
 {
 	struct drm_i915_error_state_buf m;
 
@@ -842,8 +838,8 @@ static int err_print_to_sgl(struct i915_gpu_state *error)
 	return 0;
 }
 
-ssize_t i915_gpu_state_copy_to_buffer(struct i915_gpu_state *error,
-				      char *buf, loff_t off, size_t rem)
+ssize_t i915_gpu_coredump_copy_to_buffer(struct i915_gpu_coredump *error,
+					 char *buf, loff_t off, size_t rem)
 {
 	struct scatterlist *sg;
 	size_t count;
@@ -906,85 +902,89 @@ ssize_t i915_gpu_state_copy_to_buffer(struct i915_gpu_state *error,
 	return count;
 }
 
-static void i915_error_object_free(struct drm_i915_error_object *obj)
+static void i915_vma_coredump_free(struct i915_vma_coredump *vma)
 {
-	int page;
-
-	if (obj == NULL)
-		return;
+	while (vma) {
+		struct i915_vma_coredump *next = vma->next;
+		int page;
 
-	for (page = 0; page < obj->page_count; page++)
-		free_page((unsigned long)obj->pages[page]);
+		for (page = 0; page < vma->page_count; page++)
+			free_page((unsigned long)vma->pages[page]);
 
-	kfree(obj);
+		kfree(vma);
+		vma = next;
+	}
 }
-
-static void cleanup_params(struct i915_gpu_state *error)
+static void cleanup_params(struct i915_gpu_coredump *error)
 {
 	i915_params_free(&error->params);
 }
 
-static void cleanup_uc_state(struct i915_gpu_state *error)
+static void cleanup_uc(struct intel_uc_coredump *uc)
 {
-	struct i915_error_uc *error_uc = &error->uc;
+	kfree(uc->guc_fw.path);
+	kfree(uc->huc_fw.path);
+	i915_vma_coredump_free(uc->guc_log);
 
-	kfree(error_uc->guc_fw.path);
-	kfree(error_uc->huc_fw.path);
-	i915_error_object_free(error_uc->guc_log);
+	kfree(uc);
 }
 
-void __i915_gpu_state_free(struct kref *error_ref)
+static void cleanup_gt(struct intel_gt_coredump *gt)
 {
-	struct i915_gpu_state *error =
-		container_of(error_ref, typeof(*error), ref);
-	long i;
+	while (gt->engine) {
+		struct intel_engine_coredump *ee = gt->engine;
+
+		gt->engine = ee->next;
 
-	while (error->engine) {
-		struct drm_i915_error_engine *ee = error->engine;
+		i915_vma_coredump_free(ee->vma);
+		kfree(ee->requests);
+		kfree(ee);
+	}
 
-		error->engine = ee->next;
+	if (gt->uc)
+		cleanup_uc(gt->uc);
 
-		for (i = 0; i < ee->user_bo_count; i++)
-			i915_error_object_free(ee->user_bo[i]);
-		kfree(ee->user_bo);
+	kfree(gt);
+}
 
-		i915_error_object_free(ee->batchbuffer);
-		i915_error_object_free(ee->wa_batchbuffer);
-		i915_error_object_free(ee->ringbuffer);
-		i915_error_object_free(ee->hws_page);
-		i915_error_object_free(ee->ctx);
-		i915_error_object_free(ee->wa_ctx);
+void __i915_gpu_coredump_free(struct kref *error_ref)
+{
+	struct i915_gpu_coredump *error =
		container_of(error_ref, typeof(*error), ref);
 
-		kfree(ee->requests);
-		kfree(ee);
+	while (error->gt) {
+		struct intel_gt_coredump *gt = error->gt;
+
+		error->gt = gt->next;
+		cleanup_gt(gt);
 	}
 
 	kfree(error->overlay);
 	kfree(error->display);
 
 	cleanup_params(error);
-	cleanup_uc_state(error);
 
 	err_free_sgl(error->sgl);
 	kfree(error);
 }
 
-static struct drm_i915_error_object *
-i915_error_object_create(struct drm_i915_private *i915,
-			 struct i915_vma *vma,
-			 struct compress *compress)
+static struct i915_vma_coredump *
+i915_vma_coredump_create(const struct intel_gt *gt,
+			 const struct i915_vma *vma,
+			 const char *name,
+			 struct i915_vma_compress *compress)
 {
-	struct i915_ggtt *ggtt = &i915->ggtt;
+	struct i915_ggtt *ggtt = gt->ggtt;
 	const u64 slot = ggtt->error_capture.start;
-	struct drm_i915_error_object *dst;
+	struct i915_vma_coredump *dst;
 	unsigned long num_pages;
 	struct sgt_iter iter;
 	int ret;
 
 	might_sleep();
 
-	if (!vma || !vma->pages)
+	if (!vma || !vma->pages || !compress)
 		return NULL;
 
 	num_pages = min_t(u64, vma->size, vma->obj->base.size) >> PAGE_SHIFT;
@@ -998,6 +998,9 @@ i915_error_object_create(struct drm_i915_private *i915,
 		return NULL;
 	}
 
+	strcpy(dst->name, name);
+	dst->next = NULL;
+
 	dst->gtt_offset = vma->node.start;
 	dst->gtt_size = vma->node.size;
 	dst->gtt_page_sizes = vma->page_sizes.gtt;
@@ -1005,9 +1008,6 @@ i915_error_object_create(struct drm_i915_private *i915,
 	dst->page_count = 0;
 	dst->unused = 0;
 
-	compress->wc = i915_gem_object_is_lmem(vma->obj) ||
-		       drm_mm_node_allocated(&ggtt->error_capture);
-
 	ret = -EINVAL;
 	if (drm_mm_node_allocated(&ggtt->error_capture)) {
 		void __iomem *s;
@@ -1016,9 +1016,12 @@ i915_error_object_create(struct drm_i915_private *i915,
 		for_each_sgt_daddr(dma, iter, vma->pages) {
 			ggtt->vm.insert_page(&ggtt->vm, dma, slot,
 					     I915_CACHE_NONE, 0);
+			mb();
 
 			s = io_mapping_map_wc(&ggtt->iomap, slot, PAGE_SIZE);
-			ret = compress_page(compress, (void __force *)s, dst);
+			ret = compress_page(compress,
+					    (void __force *)s, dst,
+					    true);
 			io_mapping_unmap(s);
 			if (ret)
 				break;
@@ -1031,7 +1034,9 @@ i915_error_object_create(struct drm_i915_private *i915,
 			void __iomem *s;
 
 			s = io_mapping_map_wc(&mem->iomap, dma, PAGE_SIZE);
-			ret = compress_page(compress, (void __force *)s, dst);
+			ret = compress_page(compress,
+					    (void __force *)s, dst,
+					    true);
 			io_mapping_unmap(s);
 			if (ret)
 				break;
@@ -1045,7 +1050,7 @@ i915_error_object_create(struct drm_i915_private *i915,
 			drm_clflush_pages(&page, 1);
 
 			s = kmap(page);
-			ret = compress_page(compress, s, dst);
+			ret = compress_page(compress, s, dst, false);
 			kunmap(page);
 
 			drm_clflush_pages(&page, 1);
@@ -1066,77 +1071,56 @@ i915_error_object_create(struct drm_i915_private *i915,
 	return dst;
 }
 
-/*
- * Generate a semi-unique error code. The code is not meant to have meaning, The
- * code's only purpose is to try to prevent false duplicated bug reports by
- * grossly estimating a GPU error state.
- *
- * TODO Ideally, hashing the batchbuffer would be a very nice way to determine
- * the hang if we could strip the GTT offset information from it.
- *
- * It's only a small step better than a random number in its current form.
- */
-static u32 i915_error_generate_code(struct i915_gpu_state *error)
+static void gt_record_fences(struct intel_gt_coredump *gt)
 {
-	const struct drm_i915_error_engine *ee = error->engine;
-
-	/*
-	 * IPEHR would be an ideal way to detect errors, as it's the gross
-	 * measure of "the command that hung." However, has some very common
-	 * synchronization commands which almost always appear in the case
-	 * strictly a client bug. Use instdone to differentiate those some.
-	 */
-	return ee ? ee->ipehr ^ ee->instdone.instdone : 0;
-}
-
-static void gem_record_fences(struct i915_gpu_state *error)
-{
-	struct drm_i915_private *dev_priv = error->i915;
-	struct intel_uncore *uncore = &dev_priv->uncore;
+	struct i915_ggtt *ggtt = gt->_gt->ggtt;
+	struct intel_uncore *uncore = gt->_gt->uncore;
 	int i;
 
-	if (INTEL_GEN(dev_priv) >= 6) {
-		for (i = 0; i < dev_priv->ggtt.num_fences; i++)
-			error->fence[i] =
+	if (INTEL_GEN(uncore->i915) >= 6) {
+		for (i = 0; i < ggtt->num_fences; i++)
+			gt->fence[i] =
 				intel_uncore_read64(uncore,
 						    FENCE_REG_GEN6_LO(i));
-	} else if (INTEL_GEN(dev_priv) >= 4) {
-		for (i = 0; i < dev_priv->ggtt.num_fences; i++)
-			error->fence[i] =
+	} else if (INTEL_GEN(uncore->i915) >= 4) {
+		for (i = 0; i < ggtt->num_fences; i++)
+			gt->fence[i] =
 				intel_uncore_read64(uncore,
 						    FENCE_REG_965_LO(i));
 	} else {
-		for (i = 0; i < dev_priv->ggtt.num_fences; i++)
-			error->fence[i] =
+		for (i = 0; i < ggtt->num_fences; i++)
+			gt->fence[i] =
 				intel_uncore_read(uncore, FENCE_REG(i));
 	}
-	error->nfence = i;
+	gt->nfence = i;
 }
 
-static void error_record_engine_registers(struct i915_gpu_state *error,
-					  struct intel_engine_cs *engine,
-					  struct drm_i915_error_engine *ee)
+static void engine_record_registers(struct intel_engine_coredump *ee)
 {
-	struct drm_i915_private *dev_priv = engine->i915;
+	const struct intel_engine_cs *engine = ee->engine;
+	struct drm_i915_private *i915 = engine->i915;
 
-	if (INTEL_GEN(dev_priv) >= 6) {
+	if (INTEL_GEN(i915) >= 6) {
 		ee->rc_psmi = ENGINE_READ(engine, RING_PSMI_CTL);
 
-		if (INTEL_GEN(dev_priv) >= 12)
-			ee->fault_reg = I915_READ(GEN12_RING_FAULT_REG);
-		else if (INTEL_GEN(dev_priv) >= 8)
-			ee->fault_reg = I915_READ(GEN8_RING_FAULT_REG);
+		if (INTEL_GEN(i915) >= 12)
+			ee->fault_reg = intel_uncore_read(engine->uncore,
+							  GEN12_RING_FAULT_REG);
+		else if (INTEL_GEN(i915) >= 8)
+			ee->fault_reg = intel_uncore_read(engine->uncore,
+							  GEN8_RING_FAULT_REG);
 		else
 			ee->fault_reg = GEN6_RING_FAULT_REG_READ(engine);
 	}
 
-	if (INTEL_GEN(dev_priv) >= 4) {
+	if (INTEL_GEN(i915) >= 4) {
 		ee->faddr = ENGINE_READ(engine, RING_DMA_FADD);
 		ee->ipeir = ENGINE_READ(engine, RING_IPEIR);
 		ee->ipehr = ENGINE_READ(engine, RING_IPEHR);
 		ee->instps = ENGINE_READ(engine, RING_INSTPS);
 		ee->bbaddr = ENGINE_READ(engine, RING_BBADDR);
-		if (INTEL_GEN(dev_priv) >= 8) {
+		ee->ccid = ENGINE_READ(engine, CCID);
+		if (INTEL_GEN(i915) >= 8) {
 			ee->faddr |= (u64)ENGINE_READ(engine, RING_DMA_FADD_UDW) << 32;
 			ee->bbaddr |= (u64)ENGINE_READ(engine, RING_BBADDR_UDW) << 32;
 		}
@@ -1155,13 +1139,13 @@ static void error_record_engine_registers(struct i915_gpu_state *error,
 	ee->head = ENGINE_READ(engine, RING_HEAD);
 	ee->tail = ENGINE_READ(engine, RING_TAIL);
 	ee->ctl = ENGINE_READ(engine, RING_CTL);
-	if (INTEL_GEN(dev_priv) > 2)
+	if (INTEL_GEN(i915) > 2)
 		ee->mode = ENGINE_READ(engine, RING_MI_MODE);
 
-	if (!HWS_NEEDS_PHYSICAL(dev_priv)) {
+	if (!HWS_NEEDS_PHYSICAL(i915)) {
 		i915_reg_t mmio;
 
-		if (IS_GEN(dev_priv, 7)) {
+		if (IS_GEN(i915, 7)) {
 			switch (engine->id) {
 			default:
 				MISSING_CASE(engine->id);
@@ -1186,40 +1170,40 @@ static void error_record_engine_registers(struct i915_gpu_state *error,
 			mmio = RING_HWS_PGA(engine->mmio_base);
 		}
 
-		ee->hws = I915_READ(mmio);
+		ee->hws = intel_uncore_read(engine->uncore, mmio);
 	}
 
-	ee->idle = intel_engine_is_idle(engine);
-	ee->reset_count = i915_reset_engine_count(&dev_priv->gpu_error,
-						  engine);
+	ee->reset_count = i915_reset_engine_count(&i915->gpu_error, engine);
 
-	if (HAS_PPGTT(dev_priv)) {
+	if (HAS_PPGTT(i915)) {
 		int i;
 
 		ee->vm_info.gfx_mode = ENGINE_READ(engine, RING_MODE_GEN7);
 
-		if (IS_GEN(dev_priv, 6)) {
+		if (IS_GEN(i915, 6)) {
 			ee->vm_info.pp_dir_base =
 				ENGINE_READ(engine, RING_PP_DIR_BASE_READ);
-		} else if (IS_GEN(dev_priv, 7)) {
+		} else if (IS_GEN(i915, 7)) {
 			ee->vm_info.pp_dir_base =
 				ENGINE_READ(engine, RING_PP_DIR_BASE);
-		} else if (INTEL_GEN(dev_priv) >= 8) {
+		} else if (INTEL_GEN(i915) >= 8) {
 			u32 base = engine->mmio_base;
 
 			for (i = 0; i < 4; i++) {
 				ee->vm_info.pdp[i] =
-					I915_READ(GEN8_RING_PDP_UDW(base, i));
+					intel_uncore_read(engine->uncore,
+							  GEN8_RING_PDP_UDW(base, i));
 				ee->vm_info.pdp[i] <<= 32;
 				ee->vm_info.pdp[i] |=
-					I915_READ(GEN8_RING_PDP_LDW(base, i));
+					intel_uncore_read(engine->uncore,
+							  GEN8_RING_PDP_LDW(base, i));
 			}
 		}
 	}
 }
 
 static void record_request(const struct i915_request *request,
-			   struct drm_i915_error_request *erq)
+			   struct i915_request_coredump *erq)
 {
 	const struct i915_gem_context *ctx;
 
@@ -1227,7 +1211,6 @@ static void record_request(const struct i915_request *request,
 	erq->context = request->fence.context;
 	erq->seqno = request->fence.seqno;
 	erq->sched_attr = request->sched.attr;
-	erq->jiffies = request->emitted_jiffies;
 	erq->start = i915_ggtt_offset(request->ring->vma);
 	erq->head = request->head;
 	erq->tail = request->tail;
@@ -1240,9 +1223,9 @@ static void record_request(const struct i915_request *request,
 	rcu_read_unlock();
 }
 
-static void engine_record_requests(struct intel_engine_cs *engine,
+static void engine_record_requests(const struct intel_engine_cs *engine,
 				   struct i915_request *first,
-				   struct drm_i915_error_engine *ee)
+				   struct intel_engine_coredump *ee)
 {
 	struct i915_request *request;
 	int count;
@@ -1288,11 +1271,10 @@ static void engine_record_requests(struct intel_engine_cs *engine,
 	ee->num_requests = count;
 }
 
-static void error_record_engine_execlists(const struct intel_engine_cs *engine,
-					  struct drm_i915_error_engine *ee)
+static void engine_record_execlists(struct intel_engine_coredump *ee)
 {
-	const struct intel_engine_execlists * const execlists = &engine->execlists;
-	struct i915_request * const *port = execlists->active;
+	const struct intel_engine_execlists * const el = &ee->engine->execlists;
+	struct i915_request * const *port = el->active;
 	unsigned int n = 0;
 
 	while (*port)
@@ -1301,7 +1283,7 @@ static void error_record_engine_execlists(const struct intel_engine_cs *engine,
 	ee->num_ports = n;
 }
 
-static bool record_context(struct drm_i915_error_context *e,
+static bool record_context(struct i915_gem_context_coredump *e,
 			   const struct i915_request *rq)
 {
 	struct i915_gem_context *ctx;
@@ -1334,23 +1316,24 @@ static bool record_context(struct drm_i915_error_context *e,
 	return capture;
 }
 
-struct capture_vma {
-	struct capture_vma *next;
-	void **slot;
+struct intel_engine_capture_vma {
+	struct intel_engine_capture_vma *next;
+	struct i915_vma *vma;
+	char name[16];
 };
 
-static struct capture_vma *
-capture_vma(struct capture_vma *next,
+static struct intel_engine_capture_vma *
+capture_vma(struct intel_engine_capture_vma *next,
 	    struct i915_vma *vma,
-	    struct drm_i915_error_object **out)
+	    const char *name,
+	    gfp_t gfp)
 {
-	struct capture_vma *c;
+	struct intel_engine_capture_vma *c;
 
-	*out = NULL;
 	if (!vma)
 		return next;
 
-	c = kmalloc(sizeof(*c), ATOMIC_MAYFAIL);
+	c = kmalloc(sizeof(*c), gfp);
 	if (!c)
 		return next;
 
@@ -1359,54 +1342,31 @@ capture_vma(struct capture_vma *next,
 		return next;
 	}
 
-	c->slot = (void **)out;
-	*c->slot = i915_vma_get(vma);
+	strcpy(c->name, name);
+	c->vma = i915_vma_get(vma);
 
 	c->next = next;
 	return c;
 }
 
-static struct capture_vma *
-request_record_user_bo(struct i915_request *request,
-		       struct drm_i915_error_engine *ee,
-		       struct capture_vma *capture)
+static struct intel_engine_capture_vma *
+capture_user(struct intel_engine_capture_vma *capture,
+	     const struct i915_request *rq,
+	     gfp_t gfp)
 {
 	struct i915_capture_list *c;
-	struct drm_i915_error_object **bo;
-	long count, max;
-
-	max = 0;
-	for (c = request->capture_list; c; c = c->next)
-		max++;
-	if (!max)
-		return capture;
-
-	bo = kmalloc_array(max, sizeof(*bo), ATOMIC_MAYFAIL);
-	if (!bo) {
-		/* If we can't capture everything, try to capture something. */
-		max = min_t(long, max, PAGE_SIZE / sizeof(*bo));
-		bo = kmalloc_array(max, sizeof(*bo), ATOMIC_MAYFAIL);
-	}
-	if (!bo)
-		return capture;
-
-	count = 0;
-	for (c = request->capture_list; c; c = c->next) {
-		capture = capture_vma(capture, c->vma, &bo[count]);
-		if (++count == max)
-			break;
-	}
-
-	ee->user_bo = bo;
-	ee->user_bo_count = count;
+	for (c = rq->capture_list; c; c = c->next)
+		capture = capture_vma(capture, c->vma, "user", gfp);
 
 	return capture;
 }
 
-static struct drm_i915_error_object *
-capture_object(struct drm_i915_private *dev_priv,
+static struct i915_vma_coredump *
+capture_object(const struct intel_gt *gt,
 	       struct drm_i915_gem_object *obj,
-	       struct compress *compress)
+	       const char *name,
+	       struct i915_vma_compress *compress)
 {
 	if (obj && i915_gem_object_has_pages(obj)) {
 		struct i915_vma fake = {
@@ -1416,127 +1376,183 @@ capture_object(struct drm_i915_private *dev_priv,
 			.obj = obj,
 		};
 
-		return i915_error_object_create(dev_priv, &fake, compress);
+		return i915_vma_coredump_create(gt, &fake, name, compress);
 	} else {
 		return NULL;
 	}
 }
 
-static void
-gem_record_rings(struct i915_gpu_state *error, struct compress *compress)
+static void add_vma(struct intel_engine_coredump *ee,
+		    struct i915_vma_coredump *vma)
 {
-	struct drm_i915_private *i915 = error->i915;
-	struct intel_engine_cs *engine;
-	struct drm_i915_error_engine *ee;
+	if (vma) {
+		vma->next = ee->vma;
+		ee->vma = vma;
+	}
+}
+
+struct intel_engine_coredump *
+intel_engine_coredump_alloc(struct intel_engine_cs *engine, gfp_t gfp)
+{
+	struct intel_engine_coredump *ee;
 
-	ee = kzalloc(sizeof(*ee), GFP_KERNEL);
+	ee = kzalloc(sizeof(*ee), gfp);
 	if (!ee)
-		return;
+		return NULL;
 
-	for_each_uabi_engine(engine, i915) {
-		struct capture_vma *capture = NULL;
-		struct i915_request *request;
-		unsigned long flags;
+	ee->engine = engine;
 
-		/* Refill our page pool before entering atomic section */
-		pool_refill(&compress->pool, ALLOW_FAIL);
+	engine_record_registers(ee);
+	engine_record_execlists(ee);
 
-		spin_lock_irqsave(&engine->active.lock, flags);
-		request = intel_engine_find_active_request(engine);
-		if (!request) {
-			spin_unlock_irqrestore(&engine->active.lock, flags);
-			continue;
-		}
+	return ee;
+}
 
-		error->simulated |= record_context(&ee->context, request);
+struct intel_engine_capture_vma *
+intel_engine_coredump_add_request(struct intel_engine_coredump *ee,
+				  struct i915_request *rq,
+				  gfp_t gfp)
+{
+	struct intel_engine_capture_vma *vma = NULL;
 
-		/*
-		 * We need to copy these to an anonymous buffer
-		 * as the simplest method to avoid being overwritten
-		 * by userspace.
-		 */
-		capture = capture_vma(capture,
-				      request->batch,
-				      &ee->batchbuffer);
+	ee->simulated |= record_context(&ee->context, rq);
+	if (ee->simulated)
+		return NULL;
 
-		if (HAS_BROKEN_CS_TLB(i915))
-			capture = capture_vma(capture,
-					      engine->gt->scratch,
-					      &ee->wa_batchbuffer);
+	/*
+	 * We need to copy these to an anonymous buffer
+	 * as the simplest method to avoid being overwritten
+	 * by userspace.
+	 */
+	vma = capture_vma(vma, rq->batch, "batch", gfp);
+	vma = capture_user(vma, rq, gfp);
+	vma = capture_vma(vma, rq->ring->vma, "ring", gfp);
+	vma = capture_vma(vma, rq->context->state, "HW context", gfp);
+	if (HAS_BROKEN_CS_TLB(rq->i915))
+		vma = capture_vma(vma, ee->engine->gt->scratch, "WA batch", gfp);
 
-		capture = request_record_user_bo(request, ee, capture);
+	ee->cpu_ring_head = rq->ring->head;
+	ee->cpu_ring_tail = rq->ring->tail;
 
-		capture = capture_vma(capture,
-				      request->context->state,
-				      &ee->ctx);
+	ee->rq_head = rq->head;
+	ee->rq_post = rq->postfix;
+	ee->rq_tail = rq->tail;
 
-		capture = capture_vma(capture,
-				      request->ring->vma,
-				      &ee->ringbuffer);
+	return vma;
+}
 
-		ee->cpu_ring_head = request->ring->head;
-		ee->cpu_ring_tail = request->ring->tail;
+void
+intel_engine_coredump_add_vma(struct intel_engine_coredump *ee,
+			      struct intel_engine_capture_vma *capture,
			      struct i915_vma_compress *compress)
+{
+	const struct intel_engine_cs *engine = ee->engine;
 
-		ee->rq_head = request->head;
-		ee->rq_post = request->postfix;
-		ee->rq_tail = request->tail;
+	while (capture) {
+		struct intel_engine_capture_vma *this = capture;
+		struct i915_vma *vma = this->vma;
 
-		engine_record_requests(engine, request, ee);
-		spin_unlock_irqrestore(&engine->active.lock, flags);
+		add_vma(ee,
+			i915_vma_coredump_create(engine->gt,
+						 vma, this->name,
+						 compress));
 
-		error_record_engine_registers(error, engine, ee);
-		error_record_engine_execlists(engine, ee);
+		i915_active_release(&vma->active);
+		i915_vma_put(vma);
 
-		while (capture) {
-			struct capture_vma *this = capture;
-			struct i915_vma *vma = *this->slot;
+		capture = this->next;
+		kfree(this);
+	}
 
-			*this->slot =
-				i915_error_object_create(i915, vma, compress);
+	add_vma(ee,
+		i915_vma_coredump_create(engine->gt,
+					 engine->status_page.vma,
+					 "HW Status",
+					 compress));
 
-			i915_active_release(&vma->active);
-			i915_vma_put(vma);
+	add_vma(ee,
+		i915_vma_coredump_create(engine->gt,
+					 engine->wa_ctx.vma,
+					 "WA context",
+					 compress));
 
-			capture = this->next;
-			kfree(this);
-		}
+	add_vma(ee,
+		capture_object(engine->gt,
+			       engine->default_state,
			       "NULL context",
+			       compress));
 
-		ee->hws_page =
-			i915_error_object_create(i915,
-						 engine->status_page.vma,
-						 compress);
+}
+
+static struct intel_engine_coredump *
+capture_engine(struct intel_engine_cs *engine,
+	       struct i915_vma_compress *compress)
+{
+	struct intel_engine_capture_vma *capture;
+	struct intel_engine_coredump *ee;
+	struct i915_request *rq;
+	unsigned long flags;
 
-		ee->wa_ctx =
-			i915_error_object_create(i915,
-						 engine->wa_ctx.vma,
-						 compress);
+	ee = intel_engine_coredump_alloc(engine, GFP_KERNEL);
+	if (!ee)
+		return NULL;
 
-		ee->default_state =
-			capture_object(i915, engine->default_state, compress);
+	spin_lock_irqsave(&engine->active.lock, flags);
 
-		ee->engine = engine;
+	rq = intel_engine_find_active_request(engine);
+	if (!rq) {
+		spin_unlock_irqrestore(&engine->active.lock, flags);
+		kfree(ee);
+		return NULL;
+	}
 
-		ee->next = error->engine;
-		error->engine = ee;
+	capture = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL);
+	engine_record_requests(engine, rq, ee);
 
-		ee = kzalloc(sizeof(*ee), GFP_KERNEL);
-		if (!ee)
-			return;
-	}
+	spin_unlock_irqrestore(&engine->active.lock, flags);
 
-	kfree(ee);
+	intel_engine_coredump_add_vma(ee, capture, compress);
+
+	return ee;
 }
 
 static void
-capture_uc_state(struct i915_gpu_state *error, struct compress *compress)
+gt_record_engines(struct intel_gt_coredump *gt,
+		  struct i915_vma_compress *compress)
 {
-	struct drm_i915_private *i915 = error->i915;
-	struct i915_error_uc *error_uc = &error->uc;
-	struct intel_uc *uc = &i915->gt.uc;
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
 
-	/* Capturing uC state won't be useful if there is no GuC */
-	if (!error->device_info.has_gt_uc)
-		return;
+	for_each_engine(engine, gt->_gt, id) {
+		struct intel_engine_coredump *ee;
+
+		/* Refill our page pool before entering atomic section */
+		pool_refill(&compress->pool, ALLOW_FAIL);
+
+		ee = capture_engine(engine, compress);
+		if (!ee)
+			continue;
+
+		gt->simulated |= ee->simulated;
+		if (ee->simulated) {
+			kfree(ee);
+			continue;
+		}
+
+		ee->next = gt->engine;
+		gt->engine = ee;
+	}
+}
+
+static struct intel_uc_coredump *
+gt_record_uc(struct intel_gt_coredump *gt,
+	     struct i915_vma_compress *compress)
+{
+	const struct intel_uc *uc = &gt->_gt->uc;
+	struct intel_uc_coredump *error_uc;
+
+	error_uc = kzalloc(sizeof(*error_uc), ALLOW_FAIL);
+	if (!error_uc)
+		return NULL;
 
 	memcpy(&error_uc->guc_fw, &uc->guc.fw, sizeof(uc->guc.fw));
 	memcpy(&error_uc->huc_fw, &uc->huc.fw, sizeof(uc->huc.fw));
@@ -1547,19 +1563,42 @@ capture_uc_state(struct i915_gpu_state *error, struct compress *compress)
 	 */
 	error_uc->guc_fw.path = kstrdup(uc->guc.fw.path, ALLOW_FAIL);
 	error_uc->huc_fw.path = kstrdup(uc->huc.fw.path, ALLOW_FAIL);
-	error_uc->guc_log = i915_error_object_create(i915,
-						     uc->guc.log.vma,
-						     compress);
+	error_uc->guc_log =
+		i915_vma_coredump_create(gt->_gt,
+					 uc->guc.log.vma, "GuC log buffer",
+					 compress);
+
+	return error_uc;
+}
+
+static void gt_capture_prepare(struct intel_gt_coredump *gt)
+{
+	struct i915_ggtt *ggtt = gt->_gt->ggtt;
+
+	mutex_lock(&ggtt->error_mutex);
+}
+
+static void gt_capture_finish(struct intel_gt_coredump *gt)
+{
+	struct i915_ggtt *ggtt = gt->_gt->ggtt;
+
+	if (drm_mm_node_allocated(&ggtt->error_capture))
+		ggtt->vm.clear_range(&ggtt->vm,
+				     ggtt->error_capture.start,
+				     PAGE_SIZE);
+
+	mutex_unlock(&ggtt->error_mutex);
 }
 
 /* Capture all registers which don't fit into another category. */
-static void capture_reg_state(struct i915_gpu_state *error)
+static void gt_record_regs(struct intel_gt_coredump *gt)
 {
-	struct drm_i915_private *i915 = error->i915;
-	struct intel_uncore *uncore = &i915->uncore;
+	struct intel_uncore *uncore = gt->_gt->uncore;
+	struct drm_i915_private *i915 = uncore->i915;
 	int i;
 
-	/* General organization
+	/*
+	 * General organization
 	 * 1. Registers specific to a single generation
 	 * 2. Registers which belong to multiple generations
	 * 3. Feature specific registers.
@@ -1569,138 +1608,162 @@ static void capture_reg_state(struct i915_gpu_state *error)
 
 	/* 1: Registers specific to a single generation */
 	if (IS_VALLEYVIEW(i915)) {
-		error->gtier[0] = intel_uncore_read(uncore, GTIER);
-		error->ier = intel_uncore_read(uncore, VLV_IER);
-		error->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE_VLV);
+		gt->gtier[0] = intel_uncore_read(uncore, GTIER);
+		gt->ier = intel_uncore_read(uncore, VLV_IER);
+		gt->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE_VLV);
 	}
 
 	if (IS_GEN(i915, 7))
-		error->err_int = intel_uncore_read(uncore, GEN7_ERR_INT);
+		gt->err_int = intel_uncore_read(uncore, GEN7_ERR_INT);
 
 	if (INTEL_GEN(i915) >= 12) {
-		error->fault_data0 = intel_uncore_read(uncore,
-						       GEN12_FAULT_TLB_DATA0);
-		error->fault_data1 = intel_uncore_read(uncore,
-						       GEN12_FAULT_TLB_DATA1);
+		gt->fault_data0 = intel_uncore_read(uncore,
+						    GEN12_FAULT_TLB_DATA0);
+		gt->fault_data1 = intel_uncore_read(uncore,
+						    GEN12_FAULT_TLB_DATA1);
 	} else if (INTEL_GEN(i915) >= 8) {
-		error->fault_data0 = intel_uncore_read(uncore,
-						       GEN8_FAULT_TLB_DATA0);
-		error->fault_data1 = intel_uncore_read(uncore,
-						       GEN8_FAULT_TLB_DATA1);
+		gt->fault_data0 = intel_uncore_read(uncore,
+						    GEN8_FAULT_TLB_DATA0);
+		gt->fault_data1 = intel_uncore_read(uncore,
+						    GEN8_FAULT_TLB_DATA1);
 	}
 
 	if (IS_GEN(i915, 6)) {
-		error->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE);
-		error->gab_ctl = intel_uncore_read(uncore, GAB_CTL);
-		error->gfx_mode = intel_uncore_read(uncore, GFX_MODE);
+		gt->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE);
+		gt->gab_ctl = intel_uncore_read(uncore, GAB_CTL);
+		gt->gfx_mode = intel_uncore_read(uncore, GFX_MODE);
 	}
 
 	/* 2: Registers which belong to multiple generations */
 	if (INTEL_GEN(i915) >= 7)
-		error->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE_MT);
+		gt->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE_MT);
 
 	if (INTEL_GEN(i915) >= 6) {
-		error->derrmr = intel_uncore_read(uncore, DERRMR);
+		gt->derrmr = intel_uncore_read(uncore, DERRMR);
 		if (INTEL_GEN(i915) < 12) {
-			error->error = intel_uncore_read(uncore, ERROR_GEN6);
-			error->done_reg = intel_uncore_read(uncore, DONE_REG);
+			gt->error = intel_uncore_read(uncore, ERROR_GEN6);
+			gt->done_reg = intel_uncore_read(uncore, DONE_REG);
 		}
 	}
 
-	if (INTEL_GEN(i915) >= 5)
-		error->ccid = intel_uncore_read(uncore, CCID(RENDER_RING_BASE));
-
 	/* 3: Feature specific registers */
 	if (IS_GEN_RANGE(i915, 6, 7)) {
-		error->gam_ecochk = intel_uncore_read(uncore, GAM_ECOCHK);
-		error->gac_eco = intel_uncore_read(uncore, GAC_ECO_BITS);
+		gt->gam_ecochk = intel_uncore_read(uncore, GAM_ECOCHK);
+		gt->gac_eco = intel_uncore_read(uncore, GAC_ECO_BITS);
 	}
 
 	if (IS_GEN_RANGE(i915, 8, 11))
-		error->gtt_cache = intel_uncore_read(uncore, HSW_GTT_CACHE_EN);
+		gt->gtt_cache = intel_uncore_read(uncore, HSW_GTT_CACHE_EN);
 
 	if (IS_GEN(i915, 12))
-		error->aux_err = intel_uncore_read(uncore, GEN12_AUX_ERR_DBG);
+		gt->aux_err = intel_uncore_read(uncore, GEN12_AUX_ERR_DBG);
 
 	if (INTEL_GEN(i915) >= 12) {
 		for (i = 0; i < GEN12_SFC_DONE_MAX; i++) {
-			error->sfc_done[i] =
+			gt->sfc_done[i] =
 				intel_uncore_read(uncore, GEN12_SFC_DONE(i));
 		}
 
-		error->gam_done = intel_uncore_read(uncore, GEN12_GAM_DONE);
+		gt->gam_done = intel_uncore_read(uncore, GEN12_GAM_DONE);
 	}
 
 	/* 4: Everything else */
 	if (INTEL_GEN(i915) >= 11) {
-		error->ier = intel_uncore_read(uncore, GEN8_DE_MISC_IER);
-		error->gtier[0] =
+		gt->ier = intel_uncore_read(uncore, GEN8_DE_MISC_IER);
+		gt->gtier[0] =
 			intel_uncore_read(uncore,
 					  GEN11_RENDER_COPY_INTR_ENABLE);
-		error->gtier[1] =
+		gt->gtier[1] =
 			intel_uncore_read(uncore, GEN11_VCS_VECS_INTR_ENABLE);
-		error->gtier[2] =
+		gt->gtier[2] =
 			intel_uncore_read(uncore, GEN11_GUC_SG_INTR_ENABLE);
-		error->gtier[3] =
+		gt->gtier[3] =
 			intel_uncore_read(uncore,
 					  GEN11_GPM_WGBOXPERF_INTR_ENABLE);
-		error->gtier[4] =
+		gt->gtier[4] =
 			intel_uncore_read(uncore,
 					  GEN11_CRYPTO_RSVD_INTR_ENABLE);
-		error->gtier[5] =
+		gt->gtier[5] =
 			intel_uncore_read(uncore,
 					  GEN11_GUNIT_CSME_INTR_ENABLE);
-		error->ngtier = 6;
+		gt->ngtier = 6;
 	} else if (INTEL_GEN(i915) >= 8) {
-		error->ier = intel_uncore_read(uncore, GEN8_DE_MISC_IER);
+		gt->ier = intel_uncore_read(uncore, GEN8_DE_MISC_IER);
 		for (i = 0; i < 4; i++)
-			error->gtier[i] = intel_uncore_read(uncore,
-							    GEN8_GT_IER(i));
-		error->ngtier = 4;
+			gt->gtier[i] =
+				intel_uncore_read(uncore, GEN8_GT_IER(i));
+		gt->ngtier = 4;
 	} else if (HAS_PCH_SPLIT(i915)) {
-		error->ier = intel_uncore_read(uncore, DEIER);
-		error->gtier[0] = intel_uncore_read(uncore, GTIER);
-		error->ngtier = 1;
+		gt->ier = intel_uncore_read(uncore, DEIER);
+		gt->gtier[0] = intel_uncore_read(uncore, GTIER);
+		gt->ngtier = 1;
 	} else if (IS_GEN(i915, 2)) {
-		error->ier = intel_uncore_read16(uncore, GEN2_IER);
+		gt->ier = intel_uncore_read16(uncore, GEN2_IER);
 	} else if (!IS_VALLEYVIEW(i915)) {
-		error->ier = intel_uncore_read(uncore, GEN2_IER);
+		gt->ier = intel_uncore_read(uncore, GEN2_IER);
 	}
 
-	error->eir = intel_uncore_read(uncore, EIR);
-	error->pgtbl_er = intel_uncore_read(uncore, PGTBL_ER);
+	gt->eir = intel_uncore_read(uncore, EIR);
+	gt->pgtbl_er = intel_uncore_read(uncore, PGTBL_ER);
+}
+
+/*
+ * Generate a semi-unique error code. The code is not meant to have meaning, The
+ * code's only purpose is to try to prevent false duplicated bug reports by
+ * grossly estimating a GPU error state.
+ *
+ * TODO Ideally, hashing the batchbuffer would be a very nice way to determine
+ * the hang if we could strip the GTT offset information from it.
+ *
+ * It's only a small step better than a random number in its current form.
+ */
+static u32 generate_ecode(const struct intel_engine_coredump *ee)
+{
+	/*
+	 * IPEHR would be an ideal way to detect errors, as it's the gross
+	 * measure of "the command that hung." However, has some very common
+	 * synchronization commands which almost always appear in the case
+	 * strictly a client bug. Use instdone to differentiate those some.
+	 */
+	return ee ? ee->ipehr ^ ee->instdone.instdone : 0;
+}
 
-static const char *
-error_msg(struct i915_gpu_state *error,
-	  intel_engine_mask_t engines, const char *msg)
+static const char *error_msg(struct i915_gpu_coredump *error)
 {
+	struct intel_engine_coredump *first = NULL;
+	struct intel_gt_coredump *gt;
+	intel_engine_mask_t engines;
 	int len;
 
+	engines = 0;
+	for (gt = error->gt; gt; gt = gt->next) {
+		struct intel_engine_coredump *cs;
+
+		if (gt->engine && !first)
+			first = gt->engine;
+
+		for (cs = gt->engine; cs; cs = cs->next)
+			engines |= cs->engine->mask;
+	}
+
 	len = scnprintf(error->error_msg, sizeof(error->error_msg),
-			"GPU HANG: ecode %d:%x:0x%08x",
+			"GPU HANG: ecode %d:%x:%08x",
 			INTEL_GEN(error->i915), engines,
-			i915_error_generate_code(error));
-	if (error->engine) {
+			generate_ecode(first));
+	if (first) {
 		/* Just show the first executing process, more is confusing */
 		len += scnprintf(error->error_msg + len,
 				 sizeof(error->error_msg) - len,
 				 ", in %s [%d]",
-				 error->engine->context.comm,
-				 error->engine->context.pid);
+				 first->context.comm, first->context.pid);
 	}
-	if (msg)
-		len += scnprintf(error->error_msg + len,
-				 sizeof(error->error_msg) - len,
-				 ", %s", msg);
 
 	return error->error_msg;
 }
 
-static void capture_gen_state(struct i915_gpu_state *error)
+static void capture_gen(struct i915_gpu_coredump *error)
 {
 	struct drm_i915_private *i915 = error->i915;
 
-	error->awake = i915->gt.awake;
 	error->wakelock = atomic_read(&i915->runtime_pm.wakeref_count);
 	error->suspended = i915->runtime_pm.suspended;
 
@@ -1711,6 +1774,7 @@ static void capture_gen_state(struct i915_gpu_state *error)
 	error->reset_count = i915_reset_count(&i915->gpu_error);
 	error->suspend_count = i915->suspend_count;
 
+	i915_params_copy(&error->params, &i915_modparams);
 	memcpy(&error->device_info,
 	       INTEL_INFO(i915),
 	       sizeof(error->device_info));
@@ -1720,115 +1784,138 @@ static void capture_gen_state(struct i915_gpu_state *error)
 	error->driver_caps = i915->caps;
 }
 
-static void capture_params(struct i915_gpu_state *error)
+struct i915_gpu_coredump *
+i915_gpu_coredump_alloc(struct drm_i915_private *i915, gfp_t gfp)
 {
-	i915_params_copy(&error->params, &i915_modparams);
+	struct i915_gpu_coredump *error;
+
+	if (!i915_modparams.error_capture)
+		return NULL;
+
+	error = kzalloc(sizeof(*error), gfp);
+	if (!error)
+		return NULL;
+
+	kref_init(&error->ref);
+	error->i915 = i915;
+
+	error->time = ktime_get_real();
+	error->boottime = ktime_get_boottime();
+	error->uptime = ktime_sub(ktime_get(), i915->gt.last_init_time);
+	error->capture = jiffies;
+
+	capture_gen(error);
+
+	return error;
 }
 
-static void capture_finish(struct i915_gpu_state *error)
+#define DAY_AS_SECONDS(x) (24 * 60 * 60 * (x))
+
+struct intel_gt_coredump *
+intel_gt_coredump_alloc(struct intel_gt *gt, gfp_t gfp)
 {
-	struct i915_ggtt *ggtt = &error->i915->ggtt;
+	struct intel_gt_coredump *gc;
 
-	if (drm_mm_node_allocated(&ggtt->error_capture)) {
-		const u64 slot = ggtt->error_capture.start;
+	gc = kzalloc(sizeof(*gc), gfp);
+	if (!gc)
+		return NULL;
+
+	gc->_gt = gt;
+	gc->awake = intel_gt_pm_is_awake(gt);
+
+	gt_record_regs(gc);
+	gt_record_fences(gc);
+
+	return gc;
+}
 
-		ggtt->vm.clear_range(&ggtt->vm, slot, PAGE_SIZE);
+struct i915_vma_compress *
+i915_vma_capture_prepare(struct intel_gt_coredump *gt)
+{
+	struct i915_vma_compress *compress;
+
+	compress = kmalloc(sizeof(*compress), ALLOW_FAIL);
+	if (!compress)
+		return NULL;
+
+	if (!compress_init(compress)) {
+		kfree(compress);
+		return NULL;
 	}
+
+	gt_capture_prepare(gt);
+
+	return compress;
 }
 
-#define DAY_AS_SECONDS(x) (24 * 60 * 60 * (x))
 
+void i915_vma_capture_finish(struct intel_gt_coredump *gt,
+			     struct i915_vma_compress *compress)
+{
+	if (!compress)
+		return;
 
-struct i915_gpu_state *
-i915_capture_gpu_state(struct drm_i915_private *i915)
+	gt_capture_finish(gt);
+
+	compress_fini(compress);
+	kfree(compress);
+}
+
+struct i915_gpu_coredump *i915_gpu_coredump(struct drm_i915_private *i915)
 {
-	struct i915_gpu_state *error;
-	struct compress compress;
+	struct i915_gpu_coredump *error;
 
 	/* Check if GPU capture has been disabled */
 	error = READ_ONCE(i915->gpu_error.first_error);
 	if (IS_ERR(error))
 		return error;
 
-	error = kzalloc(sizeof(*error), ALLOW_FAIL);
-	if (!error) {
-		i915_disable_error_state(i915, -ENOMEM);
+	error = i915_gpu_coredump_alloc(i915, ALLOW_FAIL);
+	if (!error)
 		return ERR_PTR(-ENOMEM);
-	}
 
-	if (!compress_init(&compress)) {
-		kfree(error);
-		i915_disable_error_state(i915, -ENOMEM);
-		return ERR_PTR(-ENOMEM);
-	}
+	error->gt = intel_gt_coredump_alloc(&i915->gt, ALLOW_FAIL);
+	if (error->gt) {
+		struct i915_vma_compress *compress;
 
-	kref_init(&error->ref);
-	error->i915 = i915;
+		compress = i915_vma_capture_prepare(error->gt);
+		if (!compress) {
+			kfree(error->gt);
+			kfree(error);
+			return ERR_PTR(-ENOMEM);
+		}
 
-	error->time = ktime_get_real();
-	error->boottime = ktime_get_boottime();
-	error->uptime = ktime_sub(ktime_get(), i915->gt.last_init_time);
-	error->capture = jiffies;
+		gt_record_engines(error->gt, compress);
+
+		if (INTEL_INFO(i915)->has_gt_uc)
+			error->gt->uc = gt_record_uc(error->gt, compress);
 
-	capture_params(error);
-	capture_gen_state(error);
-	capture_uc_state(error, &compress);
-	capture_reg_state(error);
-	gem_record_fences(error);
-	gem_record_rings(error, &compress);
+		i915_vma_capture_finish(error->gt, compress);
+
+		error->simulated |= error->gt->simulated;
+	}
 
 	error->overlay = intel_overlay_capture_error_state(i915);
 	error->display = intel_display_capture_error_state(i915);
 
-	capture_finish(error);
-	compress_fini(&compress);
-
 	return error;
 }
 
-/**
- * i915_capture_error_state - capture an error record for later analysis
- * @i915: i915 device
- * @engine_mask: the mask of engines triggering the hang
- * @msg: a message to insert into the error capture header
- *
- * Should be called when an error is detected (either a hang or an error
- * interrupt) to capture error state from the time of the error. Fills
- * out a structure which becomes available in debugfs for user level tools
- * to pick up.
- */
-void i915_capture_error_state(struct drm_i915_private *i915,
-			      intel_engine_mask_t engine_mask,
-			      const char *msg)
+void i915_error_state_store(struct i915_gpu_coredump *error)
 {
+	struct drm_i915_private *i915;
 	static bool warned;
-	struct i915_gpu_state *error;
-	unsigned long flags;
 
-	if (!i915_modparams.error_capture)
+	if (IS_ERR_OR_NULL(error))
 		return;
 
-	if (READ_ONCE(i915->gpu_error.first_error))
-		return;
+	i915 = error->i915;
	dev_info(i915->drm.dev, "%s\n", error_msg(error));
 
-	error = i915_capture_gpu_state(i915);
-	if (IS_ERR(error))
+	if (error->simulated ||
+	    cmpxchg(&i915->gpu_error.first_error, NULL, error))
 		return;
 
-	dev_info(i915->drm.dev, "%s\n", error_msg(error, engine_mask, msg));
-
-	if (!error->simulated) {
-		spin_lock_irqsave(&i915->gpu_error.lock, flags);
-		if (!i915->gpu_error.first_error) {
-			i915->gpu_error.first_error = error;
-			error = NULL;
-		}
-		spin_unlock_irqrestore(&i915->gpu_error.lock, flags);
-	}
-
-	if (error) {
-		__i915_gpu_state_free(&error->ref);
-		return;
-	}
+	i915_gpu_coredump_get(error);
 
 	if (!xchg(&warned, true) &&
 	    ktime_get_real_seconds() - DRIVER_TIMESTAMP < DAY_AS_SECONDS(180)) {
@@ -1841,15 +1928,38 @@ void i915_capture_error_state(struct drm_i915_private *i915,
 	}
 }
 
-struct i915_gpu_state *
+/**
+ * i915_capture_error_state - capture an error record for later analysis
+ * @i915: i915 device
+ *
+ * Should be called when an error is detected (either a hang or an error
+ * interrupt) to capture error state from the time of the error. Fills
+ * out a structure which becomes available in debugfs for user level tools
+ * to pick up.
+ */
+void i915_capture_error_state(struct drm_i915_private *i915)
+{
+	struct i915_gpu_coredump *error;
+
+	error = i915_gpu_coredump(i915);
+	if (IS_ERR(error)) {
+		cmpxchg(&i915->gpu_error.first_error, NULL, error);
+		return;
+	}
+
+	i915_error_state_store(error);
+	i915_gpu_coredump_put(error);
+}
+
+struct i915_gpu_coredump *
 i915_first_error_state(struct drm_i915_private *i915)
 {
-	struct i915_gpu_state *error;
+	struct i915_gpu_coredump *error;
 
 	spin_lock_irq(&i915->gpu_error.lock);
 	error = i915->gpu_error.first_error;
 	if (!IS_ERR_OR_NULL(error))
-		i915_gpu_state_get(error);
+		i915_gpu_coredump_get(error);
 	spin_unlock_irq(&i915->gpu_error.lock);
 
 	return error;
@@ -1857,7 +1967,7 @@ i915_first_error_state(struct drm_i915_private *i915)
 
 void i915_reset_error_state(struct drm_i915_private *i915)
 {
-	struct i915_gpu_state *error;
+	struct i915_gpu_coredump *error;
 
 	spin_lock_irq(&i915->gpu_error.lock);
 	error = i915->gpu_error.first_error;
@@ -1866,7 +1976,7 @@ void i915_reset_error_state(struct drm_i915_private *i915)
 	spin_unlock_irq(&i915->gpu_error.lock);
 
 	if (!IS_ERR_OR_NULL(error))
-		i915_gpu_state_put(error);
+		i915_gpu_coredump_put(error);
 }
 
 void i915_disable_error_state(struct drm_i915_private *i915, int err)
-- 
cgit v1.2.3
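
Editor's note (not part of the commit): the patch deliberately leaves the new entry points loosely coupled so that a caller can compose an error state for just one engine. The sketch below is hypothetical -- capture_single_engine() does not exist in this commit -- and simply mirrors the flow of i915_gpu_coredump() and capture_engine() above, assuming the caller has already found the hung request and eliding the engine->active.lock and the simulated-context bookkeeping. Every function it calls (i915_gpu_coredump_alloc(), intel_gt_coredump_alloc(), intel_engine_coredump_alloc(), intel_engine_coredump_add_request(), i915_vma_capture_prepare(), intel_engine_coredump_add_vma(), i915_vma_capture_finish(), i915_error_state_store(), i915_gpu_coredump_put()) is introduced or renamed by this patch.

static void capture_single_engine(struct intel_engine_cs *engine,
				  struct i915_request *rq)
{
	struct intel_engine_capture_vma *vma;
	struct i915_vma_compress *compress;
	struct intel_engine_coredump *ee;
	struct i915_gpu_coredump *error;

	/* One top-level error state, holding a single GT capture... */
	error = i915_gpu_coredump_alloc(engine->i915, ALLOW_FAIL);
	if (!error)
		return;

	error->gt = intel_gt_coredump_alloc(engine->gt, ALLOW_FAIL);
	if (!error->gt)
		goto out;

	/* ...composed from one engine's registers, request and vma. */
	ee = intel_engine_coredump_alloc(engine, ALLOW_FAIL);
	if (!ee)
		goto out;

	/* Grab references to the request's vma before any lock is dropped */
	vma = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL);

	/* Copy out and compress the HW contents; NULL compress is tolerated */
	compress = i915_vma_capture_prepare(error->gt);
	intel_engine_coredump_add_vma(ee, vma, compress);
	i915_vma_capture_finish(error->gt, compress);

	/* Link the engine capture into the GT, then publish to debugfs */
	ee->next = error->gt->engine;
	error->gt->engine = ee;

	i915_error_state_store(error);
out:
	i915_gpu_coredump_put(error);
}

Note that the forced-preemption caller the commit message anticipates would have to use ATOMIC_MAYFAIL throughout, and that i915_vma_capture_prepare() as introduced here still takes ggtt->error_mutex, so a fully softirq-safe capture path remains future work.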