From 060bb115c2d664f04db9c7613a104dfaef3fdd98 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 15 Sep 2020 10:14:15 +0100 Subject: drm/i915/gem: Avoid implicit vmap for highmem on x86-32 On 32b, highmem uses a finite set of indirect PTEs (i.e. vmap) to provide virtual mappings of the high pages. As these are finite, map_new_virtual() must wait for some other kmap() to finish when it runs out. If we map a large number of objects, there is no method for it to tell us to release the mappings, and we deadlock. However, if we make an explicit vmap of the page, that uses a larger vmalloc arena, and also has the ability to tell us to release unwanted mappings. Most importantly, it will fail and propagate an error instead of waiting forever. Fixes: fb8621d3bee8 ("drm/i915: Avoid allocating a vmap arena for a single page") #x86-32 References: e87666b52f00 ("drm/i915/shrinker: Hook up vmap allocation failure notifier") Signed-off-by: Chris Wilson Reviewed-by: Matthew Auld Cc: # v4.7+ Link: https://patchwork.freedesktop.org/patch/msgid/20200915091417.4086-1-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/gem/i915_gem_pages.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_pages.c b/drivers/gpu/drm/i915/gem/i915_gem_pages.c index 7050519c87a4..43e8378b9a0e 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_pages.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_pages.c @@ -255,8 +255,30 @@ static void *i915_gem_object_map(struct drm_i915_gem_object *obj, return NULL; /* A single page can always be kmapped */ - if (n_pte == 1 && type == I915_MAP_WB) - return kmap(sg_page(sgt->sgl)); + if (n_pte == 1 && type == I915_MAP_WB) { + struct page *page = sg_page(sgt->sgl); + + /* + * On 32b, highmem uses a finite set of indirect PTEs (i.e. + * vmap) to provide virtual mappings of the high pages. + * As these are finite, map_new_virtual() must wait for some + * other kmap() to finish when it runs out. If we map a large + * number of objects, there is no method for it to tell us + * to release the mappings, and we deadlock. + * + * However, if we make an explicit vmap of the page, that + * uses a larger vmalloc arena, and also has the ability + * to tell us to release unwanted mappings. Most importantly, + * it will fail and propagate an error instead of waiting + * forever. + * + * So if the page is beyond the 32b boundary, make an explicit + * vmap. On 64b, this check will be optimised away as we can + * directly kmap any page on the system. + */ + if (!PageHighMem(page)) + return kmap(page); + } mem = stack; if (n_pte > ARRAY_SIZE(stack)) { -- cgit v1.2.3 From 121ba69ffddc60df11da56f6d5b29bdb45c8eb80 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 15 Sep 2020 10:14:16 +0100 Subject: drm/i915/gem: Prevent using pgprot_writecombine() if PAT is not supported Let's not try to use PAT attributes for I915_MAP_WC if the CPU doesn't support PAT.
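As an editor's illustration (not part of the patch): without PAT there is no way to construct a write-combining pgprot, so a mapper must refuse the request up front rather than silently hand back a mapping with the wrong memory type. A minimal kernel-side sketch, assuming a hypothetical map_pages() helper; the real guard is the GEM_WARN_ON added in the hunk below:

/* sketch only: vmap() from <linux/vmalloc.h>, static_cpu_has() from <asm/cpufeature.h> */
static void *map_pages(struct page **pages, unsigned int n_pages, bool want_wc)
{
	pgprot_t prot = PAGE_KERNEL;

	if (want_wc) {
		/* No PAT means no WC: fail rather than silently map cacheable */
		if (!static_cpu_has(X86_FEATURE_PAT))
			return NULL;
		prot = pgprot_writecombine(prot);
	}

	return vmap(pages, n_pages, VM_MAP, prot);
}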
Fixes: 6056e50033d9 ("drm/i915/gem: Support discontiguous lmem object maps") Signed-off-by: Chris Wilson Reviewed-by: Matthew Auld Cc: # v5.6+ Link: https://patchwork.freedesktop.org/patch/msgid/20200915091417.4086-2-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/gem/i915_gem_pages.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_pages.c b/drivers/gpu/drm/i915/gem/i915_gem_pages.c index 43e8378b9a0e..ed31bb2b82de 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_pages.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_pages.c @@ -254,6 +254,10 @@ static void *i915_gem_object_map(struct drm_i915_gem_object *obj, if (!i915_gem_object_has_struct_page(obj) && type != I915_MAP_WC) return NULL; + if (GEM_WARN_ON(type == I915_MAP_WC && + !static_cpu_has(X86_FEATURE_PAT))) + return NULL; + /* A single page can always be kmapped */ if (n_pte == 1 && type == I915_MAP_WB) { struct page *page = sg_page(sgt->sgl); -- cgit v1.2.3 From 9bb34ff25c458a2a48fb61409df42f465ede37f8 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 15 Sep 2020 10:14:17 +0100 Subject: drm/i915/gt: Clear the buffer pool age before use If we create a new node, it is possible for the slab allocator to return us a recently freed node. If that node was just retired, it will retain the current jiffy as its node->age. There is then a minuscule window where, as that node is retired, it will appear on the free list with an incorrect age and be eligible for reuse by one thread, and then by a second thread as the correct node->age is written. Fixes: 06b73c2d0b65 ("drm/i915/gt: Delay taking the spinlock for grabbing from the buffer pool") Signed-off-by: Chris Wilson Reviewed-by: Matthew Auld Link: https://patchwork.freedesktop.org/patch/msgid/20200915091417.4086-3-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/gt/intel_gt_buffer_pool.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/i915/gt/intel_gt_buffer_pool.c b/drivers/gpu/drm/i915/gt/intel_gt_buffer_pool.c index 4b7671ac5dca..104cb30e8c13 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_buffer_pool.c +++ b/drivers/gpu/drm/i915/gt/intel_gt_buffer_pool.c @@ -134,6 +134,7 @@ static void pool_retire(struct i915_active *ref) /* Return this object to the shrinker pool */ i915_gem_object_make_purgeable(node->obj); + GEM_BUG_ON(node->age); spin_lock_irqsave(&pool->lock, flags); list_add_rcu(&node->link, list); WRITE_ONCE(node->age, jiffies ?: 1); /* 0 reserved for active nodes */ @@ -155,6 +156,7 @@ node_create(struct intel_gt_buffer_pool *pool, size_t sz) if (!node) return ERR_PTR(-ENOMEM); + node->age = 0; node->pool = pool; i915_active_init(&node->active, pool_active, pool_retire); -- cgit v1.2.3 From 6cb304b31293844461437fe76dc308aaffe31dc8 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 15 Sep 2020 11:51:13 +0100 Subject: drm/i915/gt: Check for a registered driver with IPS If the ips module calls into the driver during an unbind/bind cycle, we may see the driver after it has unregistered itself from ips and try to dereference a NULL ips_mchdev pointer. <1> [211.928844] BUG: kernel NULL pointer dereference, address: 0000000000000014 <1> [211.928861] #PF: supervisor read access in kernel mode <1> [211.928871] #PF: error_code(0x0000) - not-present page <6> [211.928881] PGD 0 P4D 0 <4> [211.928890] Oops: 0000 [#1] PREEMPT SMP PTI <4> [211.928900] CPU: 3 PID: 327 Comm: ips-monitor Not tainted 5.9.0-rc5-CI-CI_DRM_9008+ #1 <4> [211.928914] Hardware name: Hewlett-Packard HP EliteBook 8440p/172A, BIOS 68CCU Ver.
F.24 09/13/2013 <4> [211.929056] RIP: 0010:mchdev_get+0x5a/0x180 [i915] <4> [211.929067] Code: c0 5a 74 0d 80 3d f1 53 29 00 00 0f 84 ab 00 00 00 48 8b 1d c8 a8 29 00 e8 d3 18 89 e1 85 c0 74 09 80 3d d1 53 29 00 00 74 65 <8b> 4b 14 48 8d 7b 14 85 c9 0f 84 09 01 00 00 8d 51 01 89 c8 f0 0f <4> [211.929095] RSP: 0018:ffffc900002efe60 EFLAGS: 00010202 <4> [211.929105] RAX: 0000000000000001 RBX: 0000000000000000 RCX: ffff8881297acf40 <4> [211.929118] RDX: 0000000000000000 RSI: ffffffff8264e2c0 RDI: ffff8881297ad820 <4> [211.929130] RBP: ffffc900002efe68 R08: ffff8881297ad820 R09: 00000000fffffffe <4> [211.929143] R10: ffff8881297acf40 R11: 00000000fff74c96 R12: ffff8881294dfa18 <4> [211.929155] R13: 0000000000000067 R14: ffff888126eff640 R15: ffff888126efe840 <4> [211.929168] FS: 0000000000000000(0000) GS:ffff888133d80000(0000) knlGS:0000000000000000 <4> [211.929182] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 <4> [211.929194] CR2: 0000000000000014 CR3: 0000000002610000 CR4: 00000000000006e0 <4> [211.929206] Call Trace: <4> [211.929294] i915_read_mch_val+0x15/0x380 [i915] <4> [211.929309] ? ips_monitor+0x3fb/0x630 [intel_ips] <4> [211.929321] ips_monitor+0x53c/0x630 [intel_ips] <4> [211.929334] ? ips_gpu_lower+0x30/0x30 [intel_ips] <4> [211.929348] kthread+0x14d/0x170 <4> [211.929358] ? kthread_park+0x80/0x80 <4> [211.929369] ret_from_fork+0x22/0x30 <4> [211.929382] Modules linked in: vgem snd_hda_codec_hdmi snd_hda_codec_generic ledtrig_audio i915 coretemp crct10dif_pclmul crc32_pclmul ghash_clmulni_intel snd_hda_intel snd_intel_dspcfg snd_hda_codec snd_hwdep snd_hda_core e1000e snd_pcm mei_me mei intel_ips lpc_ich ptp prime_numbers pps_core <4> [211.929437] CR2: 0000000000000014 Signed-off-by: Chris Wilson Reviewed-by: Mika Kuoppala Link: https://patchwork.freedesktop.org/patch/msgid/20200915105113.26564-1-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/gt/intel_rps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gt/intel_rps.c b/drivers/gpu/drm/i915/gt/intel_rps.c index e6a00eea0631..a53928363b86 100644 --- a/drivers/gpu/drm/i915/gt/intel_rps.c +++ b/drivers/gpu/drm/i915/gt/intel_rps.c @@ -1949,7 +1949,7 @@ static struct drm_i915_private *mchdev_get(void) rcu_read_lock(); i915 = rcu_dereference(ips_mchdev); - if (!kref_get_unless_zero(&i915->drm.ref)) + if (i915 && !kref_get_unless_zero(&i915->drm.ref)) i915 = NULL; rcu_read_unlock(); -- cgit v1.2.3 From f24a44e52fbc9881fc5f3bcef536831a15a439f3 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 15 Sep 2020 14:49:20 +0100 Subject: drm/i915/gt: Widen CSB pointer to u64 for the parsers A CSB entry is 64b, and it is simpler for us to treat it as an array of 64b entries than as an array of pairs of 32b entries. Signed-off-by: Chris Wilson Cc: Mika Kuoppala Reviewed-by: Mika Kuoppala Link: https://patchwork.freedesktop.org/patch/msgid/20200915134923.30088-1-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/gt/intel_engine_types.h | 2 +- drivers/gpu/drm/i915/gt/intel_lrc.c | 33 ++++++++++++++-------------- 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h index c400aaa2287b..ee6312601c56 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h @@ -278,7 +278,7 @@ struct intel_engine_execlists { * * Note these register may be either mmio or HWSP shadow. 
*/ - u32 *csb_status; + u64 *csb_status; /** * @csb_size: context status buffer FIFO size diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c index 0412a44f25f2..d6e0f62337b4 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.c +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -2464,7 +2464,7 @@ cancel_port_requests(struct intel_engine_execlists * const execlists) } static inline void -invalidate_csb_entries(const u32 *first, const u32 *last) +invalidate_csb_entries(const u64 *first, const u64 *last) { clflush((void *)first); clflush((void *)last); @@ -2496,14 +2496,12 @@ invalidate_csb_entries(const u32 *first, const u32 *last) * bits 47-57: sw context id of the lrc the GT switched away from * bits 58-63: sw counter of the lrc the GT switched away from */ -static inline bool -gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb) +static inline bool gen12_csb_parse(const u64 *csb) { - u32 lower_dw = csb[0]; - u32 upper_dw = csb[1]; - bool ctx_to_valid = GEN12_CSB_CTX_VALID(lower_dw); - bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_dw); - bool new_queue = lower_dw & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE; + u64 entry = READ_ONCE(*csb); + bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_32_bits(entry)); + bool new_queue = + lower_32_bits(entry) & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE; /* * The context switch detail is not guaranteed to be 5 when a preemption @@ -2513,7 +2511,7 @@ gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb) * would require some extra handling, but we don't support that. */ if (!ctx_away_valid || new_queue) { - GEM_BUG_ON(!ctx_to_valid); + GEM_BUG_ON(!GEN12_CSB_CTX_VALID(lower_32_bits(entry))); return true; } @@ -2522,12 +2520,11 @@ gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb) * context switch on an unsuccessful wait instruction since we always * use polling mode. 
*/ - GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_dw)); + GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_32_bits(entry))); return false; } -static inline bool -gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb) +static inline bool gen8_csb_parse(const u64 *csb) { return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED); } @@ -2535,7 +2532,7 @@ gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb) static void process_csb(struct intel_engine_cs *engine) { struct intel_engine_execlists * const execlists = &engine->execlists; - const u32 * const buf = execlists->csb_status; + const u64 * const buf = execlists->csb_status; const u8 num_entries = execlists->csb_size; u8 head, tail; @@ -2616,12 +2613,14 @@ static void process_csb(struct intel_engine_cs *engine) */ ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n", - head, buf[2 * head + 0], buf[2 * head + 1]); + head, + upper_32_bits(buf[head]), + lower_32_bits(buf[head])); if (INTEL_GEN(engine->i915) >= 12) - promote = gen12_csb_parse(execlists, buf + 2 * head); + promote = gen12_csb_parse(buf + head); else - promote = gen8_csb_parse(execlists, buf + 2 * head); + promote = gen8_csb_parse(buf + head); if (promote) { struct i915_request * const *old = execlists->active; @@ -5157,7 +5156,7 @@ int intel_execlists_submission_setup(struct intel_engine_cs *engine) } execlists->csb_status = - &engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX]; + (u64 *)&engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX]; execlists->csb_write = &engine->status_page.addr[intel_hws_csb_write_index(i915)]; -- cgit v1.2.3 From 233c1ae3c83f21046c6c4083da904163ece8f110 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 15 Sep 2020 14:49:21 +0100 Subject: drm/i915/gt: Wait for CSB entries on Tigerlake On Tigerlake, we are seeing a repeat of commit d8f505311717 ("drm/i915/icl: Forcibly evict stale csb entries") where, presumably, due to a missing Global Observation Point synchronisation, the write pointer of the CSB ringbuffer is updated _prior_ to the contents of the ringbuffer. That is we see the GPU report more context-switch entries for us to parse, but those entries have not been written, leading us to process stale events, and eventually report a hung GPU. However, this effect appears to be much more severe than we previously saw on Icelake (though it might be best if we try the same approach there as well and measure), and Bruce suggested the good idea of resetting the CSB entry after use so that we can detect when it has been updated by the GPU. 
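The mechanism in miniature, as a hedged sketch (CSB_POISON and csb_wait_valid are invented names; the real implementation is in the hunk below): poison each entry with an all-ones value that the GPU will never write, and if a read observes the poison after the tail pointer has moved, spin briefly until the GPU's write becomes visible.

#define CSB_POISON (~0ull)

static u64 csb_wait_valid(u64 *slot)
{
	u64 entry = READ_ONCE(*slot);
	unsigned int spins = 0;

	/* The tail pointer moved, but the payload may not be visible yet */
	while (entry == CSB_POISON && spins++ < 1000) {
		cpu_relax();
		entry = READ_ONCE(*slot);
	}

	WRITE_ONCE(*slot, CSB_POISON); /* re-arm detection for the next pass */
	return entry;
}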
By instrumenting how long that may be, we can set a reliable upper bound on how long we should wait: 513 late, avg of 61 retries (590 ns), max of 1061 retries (10099 ns) Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/2045 References: d8f505311717 ("drm/i915/icl: Forcibly evict stale csb entries") References: HSDES#22011327657, HSDES#1508287568 Suggested-by: Bruce Chang Signed-off-by: Chris Wilson Cc: Bruce Chang Cc: Mika Kuoppala Cc: stable@vger.kernel.org # v5.4 Reviewed-by: Mika Kuoppala Link: https://patchwork.freedesktop.org/patch/msgid/20200915134923.30088-2-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/gt/intel_lrc.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c index d6e0f62337b4..d75712a503b7 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.c +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -2498,9 +2498,22 @@ invalidate_csb_entries(const u64 *first, const u64 *last) */ static inline bool gen12_csb_parse(const u64 *csb) { - u64 entry = READ_ONCE(*csb); - bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_32_bits(entry)); - bool new_queue = + bool ctx_away_valid; + bool new_queue; + u64 entry; + + /* HSD#22011248461 */ + entry = READ_ONCE(*csb); + if (unlikely(entry == -1)) { + preempt_disable(); + if (wait_for_atomic_us((entry = READ_ONCE(*csb)) != -1, 50)) + GEM_WARN_ON("50us CSB timeout"); + preempt_enable(); + } + WRITE_ONCE(*(u64 *)csb, -1); + + ctx_away_valid = GEN12_CSB_CTX_VALID(upper_32_bits(entry)); + new_queue = lower_32_bits(entry) & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE; /* @@ -4004,6 +4017,8 @@ static void reset_csb_pointers(struct intel_engine_cs *engine) WRITE_ONCE(*execlists->csb_write, reset_value); wmb(); /* Make sure this is visible to HW (paranoia?) */ + /* Check that the GPU does indeed update the CSB entries! */ + memset(execlists->csb_status, -1, (reset_value + 1) * sizeof(u64)); invalidate_csb_entries(&execlists->csb_status[0], &execlists->csb_status[reset_value]); -- cgit v1.2.3 From 884c40741234c12533caf344997c895ee1c37d60 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 15 Sep 2020 14:49:22 +0100 Subject: drm/i915/gt: Apply the CSB w/a for all Since we expect to inline the csb_parse() routines, the w/a for the stale CSB data on Tigerlake will be pulled into process_csb(), so we might as well simply reuse the logic for all generations, hopefully avoiding any strange behaviour on Icelake that was not covered by our previous w/a.
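The resulting structure, sketched in simplified form (process_one_entry() is an invented name; the real flow lives in process_csb() in the diff below): a single csb_read() helper owns the racy HWSP read plus the workaround, and the per-gen parsers become pure functions of a 64b value snapshotted once, so the compiler cannot re-read the mutable ringbuffer entry behind our backs.

static bool process_one_entry(const struct intel_engine_cs *engine, u64 *slot)
{
	const u64 csb = csb_read(slot); /* read + stale-entry w/a in one place */

	if (INTEL_GEN(engine->i915) >= 12)
		return gen12_csb_parse(csb);

	return gen8_csb_parse(csb);
}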
References: d8f505311717 ("drm/i915/icl: Forcibly evict stale csb entries") Signed-off-by: Chris Wilson Cc: Mika Kuoppala Cc: Bruce Chang Reviewed-by: Mika Kuoppala Link: https://patchwork.freedesktop.org/patch/msgid/20200915134923.30088-3-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/gt/intel_lrc.c | 79 ++++++++++++++++++++++++------------- 1 file changed, 51 insertions(+), 28 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c index d75712a503b7..fcb6ec3d55f4 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.c +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -2496,25 +2496,11 @@ invalidate_csb_entries(const u64 *first, const u64 *last) * bits 47-57: sw context id of the lrc the GT switched away from * bits 58-63: sw counter of the lrc the GT switched away from */ -static inline bool gen12_csb_parse(const u64 *csb) +static inline bool gen12_csb_parse(const u64 csb) { - bool ctx_away_valid; - bool new_queue; - u64 entry; - - /* HSD#22011248461 */ - entry = READ_ONCE(*csb); - if (unlikely(entry == -1)) { - preempt_disable(); - if (wait_for_atomic_us((entry = READ_ONCE(*csb)) != -1, 50)) - GEM_WARN_ON("50us CSB timeout"); - preempt_enable(); - } - WRITE_ONCE(*(u64 *)csb, -1); - - ctx_away_valid = GEN12_CSB_CTX_VALID(upper_32_bits(entry)); - new_queue = - lower_32_bits(entry) & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE; + bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_32_bits(csb)); + bool new_queue = + lower_32_bits(csb) & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE; /* * The context switch detail is not guaranteed to be 5 when a preemption @@ -2524,7 +2510,7 @@ static inline bool gen12_csb_parse(const u64 *csb) * would require some extra handling, but we don't support that. */ if (!ctx_away_valid || new_queue) { - GEM_BUG_ON(!GEN12_CSB_CTX_VALID(lower_32_bits(entry))); + GEM_BUG_ON(!GEN12_CSB_CTX_VALID(lower_32_bits(csb))); return true; } @@ -2533,19 +2519,56 @@ static inline bool gen12_csb_parse(const u64 *csb) * context switch on an unsuccessful wait instruction since we always * use polling mode. */ - GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_32_bits(entry))); + GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_32_bits(csb))); return false; } -static inline bool gen8_csb_parse(const u64 *csb) +static inline bool gen8_csb_parse(const u64 csb) +{ + return csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED); +} + +static noinline u64 wa_csb_read(u64 * const csb) { - return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED); + u64 entry; + + preempt_disable(); + if (wait_for_atomic_us((entry = READ_ONCE(*csb)) != -1, 50)) + GEM_WARN_ON("50us CSB timeout"); + preempt_enable(); + + return entry; +} + +static inline u64 csb_read(u64 * const csb) +{ + u64 entry = READ_ONCE(*csb); + + /* + * Unfortunately, the GPU does not always serialise its write + * of the CSB entries before its write of the CSB pointer, at least + * from the perspective of the CPU, using what is known as a Global + * Observation Point. We may read a new CSB tail pointer, but then + * read the stale CSB entries, causing us to misinterpret the + * context-switch events, and eventually declare the GPU hung. + * + * icl:HSDES#1806554093 + * tgl:HSDES#22011248461 + */ + if (unlikely(entry == -1)) + entry = wa_csb_read(csb); + + /* Consume this entry so that we can spot its future reuse. 
*/ + WRITE_ONCE(*csb, -1); + + /* ELSP is an implicit wmb() before the GPU wraps and overwrites csb */ + return entry; } static void process_csb(struct intel_engine_cs *engine) { struct intel_engine_execlists * const execlists = &engine->execlists; - const u64 * const buf = execlists->csb_status; + u64 * const buf = execlists->csb_status; const u8 num_entries = execlists->csb_size; u8 head, tail; @@ -2603,6 +2626,7 @@ static void process_csb(struct intel_engine_cs *engine) rmb(); do { bool promote; + u64 csb; if (++head == num_entries) head = 0; @@ -2625,15 +2649,14 @@ static void process_csb(struct intel_engine_cs *engine) * status notifier. */ + csb = csb_read(buf + head); ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n", - head, - upper_32_bits(buf[head]), - lower_32_bits(buf[head])); + head, upper_32_bits(csb), lower_32_bits(csb)); if (INTEL_GEN(engine->i915) >= 12) - promote = gen12_csb_parse(buf + head); + promote = gen12_csb_parse(csb); else - promote = gen8_csb_parse(buf + head); + promote = gen8_csb_parse(csb); if (promote) { struct i915_request * const *old = execlists->active; -- cgit v1.2.3 From 4ff64bcfe2b11be52a3fceb6b5c23d7be7916362 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 15 Sep 2020 14:49:23 +0100 Subject: drm/i915/gt: Use a mmio read of the CSB in case of failure If we find the GPU didn't update the CSB within 50us, we currently fail and eventually reset the GPU. Lets report the value from the mmio space as a last resort, it may just stave off an unnecessary GPU reset. References: HSDES#22011327657 Suggested-by: Mika Kuoppala Signed-off-by: Chris Wilson Cc: Mika Kuoppala Reviewed-by: Mika Kuoppala Link: https://patchwork.freedesktop.org/patch/msgid/20200915134923.30088-4-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/gt/intel_lrc.c | 35 +++++++++++++++++++++++++++------ drivers/gpu/drm/i915/gt/intel_lrc_reg.h | 3 +++ 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c index fcb6ec3d55f4..0d57b54e5a69 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.c +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -2528,19 +2528,42 @@ static inline bool gen8_csb_parse(const u64 csb) return csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED); } -static noinline u64 wa_csb_read(u64 * const csb) +static noinline u64 +wa_csb_read(const struct intel_engine_cs *engine, u64 * const csb) { u64 entry; + /* + * Reading from the HWSP has one particular advantage: we can detect + * a stale entry. Since the write into HWSP is broken, we have no reason + * to trust the HW at all, the mmio entry may equally be unordered, so + * we prefer the path that is self-checking and as a last resort, + * return the mmio value. 
+ * + * tgl,dg1:HSDES#22011327657 + */ preempt_disable(); - if (wait_for_atomic_us((entry = READ_ONCE(*csb)) != -1, 50)) - GEM_WARN_ON("50us CSB timeout"); + if (wait_for_atomic_us((entry = READ_ONCE(*csb)) != -1, 10)) { + int idx = csb - engine->execlists.csb_status; + int status; + + status = GEN8_EXECLISTS_STATUS_BUF; + if (idx >= 6) { + status = GEN11_EXECLISTS_STATUS_BUF2; + idx -= 6; + } + status += sizeof(u64) * idx; + + entry = intel_uncore_read64(engine->uncore, + _MMIO(engine->mmio_base + status)); + } preempt_enable(); return entry; } -static inline u64 csb_read(u64 * const csb) +static inline u64 +csb_read(const struct intel_engine_cs *engine, u64 * const csb) { u64 entry = READ_ONCE(*csb); @@ -2556,7 +2579,7 @@ * tgl:HSDES#22011248461 */ if (unlikely(entry == -1)) - entry = wa_csb_read(csb); + entry = wa_csb_read(engine, csb); /* Consume this entry so that we can spot its future reuse. */ WRITE_ONCE(*csb, -1); @@ -2649,7 +2672,7 @@ * status notifier. */ - csb = csb_read(buf + head); + csb = csb_read(engine, buf + head); ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n", head, upper_32_bits(csb), lower_32_bits(csb)); diff --git a/drivers/gpu/drm/i915/gt/intel_lrc_reg.h b/drivers/gpu/drm/i915/gt/intel_lrc_reg.h index 93cb6c460508..1b51f7b9a5c3 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc_reg.h +++ b/drivers/gpu/drm/i915/gt/intel_lrc_reg.h @@ -49,4 +49,7 @@ #define GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT 0x1A #define GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT 0xD +#define GEN8_EXECLISTS_STATUS_BUF 0x370 +#define GEN11_EXECLISTS_STATUS_BUF2 0x3c0 + #endif /* _INTEL_LRC_REG_H_ */ -- cgit v1.2.3 From 68ba71e3ae6dd86a23486655e33c5f8c9bd90777 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 11 Sep 2020 10:52:43 +0300 Subject: drm/i915: Fix an error code in i915_gem_object_copy_blt() This code should use "vma[1]" instead of "vma". The "vma" variable is a valid pointer. Fixes: 6b05030496f7 ("drm/i915: Convert i915_gem_object/client_blt.c to use ww locking as well, v2.") Signed-off-by: Dan Carpenter Reviewed-by: Mika Kuoppala Signed-off-by: Maarten Lankhorst Link: https://patchwork.freedesktop.org/patch/msgid/20200911075243.GG12635@kadam --- drivers/gpu/drm/i915/gem/i915_gem_object_blt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object_blt.c b/drivers/gpu/drm/i915/gem/i915_gem_object_blt.c index d93eb36160c9..aee7ad3cc3c6 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_object_blt.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_object_blt.c @@ -364,7 +364,7 @@ int i915_gem_object_copy_blt(struct drm_i915_gem_object *src, vma[1] = i915_vma_instance(dst, vm, NULL); if (IS_ERR(vma[1])) - return PTR_ERR(vma); + return PTR_ERR(vma[1]); i915_gem_ww_ctx_init(&ww, true); intel_engine_pm_get(ce->engine); -- cgit v1.2.3 From 0bda4b80d949d871146f85bb478f5c30c368009e Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Wed, 16 Sep 2020 10:00:57 +0100 Subject: drm/i915/gt: Show engine properties in the pretty printer When debugging the engine state, include the user properties that may, or may not, have been adjusted by the user/test. For example, vecs0 ...
Properties: heartbeat_interval_ms: 2500 [default 2500] max_busywait_duration_ns: 8000 [default 8000] preempt_timeout_ms: 640 [default 640] stop_timeout_ms: 100 [default 100] timeslice_duration_ms: 1 [default 1] Suggested-by: Joonas Lahtinen Signed-off-by: Chris Wilson Cc: Joonas Lahtinen Cc: Mika Kuoppala Reviewed-by: Mika Kuoppala Link: https://patchwork.freedesktop.org/patch/msgid/20200916090059.3189-1-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/gt/intel_engine_cs.c | 36 +++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c index f231edd3fa3a..1579a80bc8cb 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c @@ -1599,6 +1599,41 @@ static unsigned long list_count(struct list_head *list) return count; } +static unsigned long read_ul(void *p, size_t x) +{ + return *(unsigned long *)(p + x); +} + +static void print_properties(struct intel_engine_cs *engine, + struct drm_printer *m) +{ + static const struct pmap { + size_t offset; + const char *name; + } props[] = { +#define P(x) { \ + .offset = offsetof(typeof(engine->props), x), \ + .name = #x \ +} + P(heartbeat_interval_ms), + P(max_busywait_duration_ns), + P(preempt_timeout_ms), + P(stop_timeout_ms), + P(timeslice_duration_ms), + + {}, +#undef P + }; + const struct pmap *p; + + drm_printf(m, "\tProperties:\n"); + for (p = props; p->name; p++) + drm_printf(m, "\t\t%s: %lu [default %lu]\n", + p->name, + read_ul(&engine->props, p->offset), + read_ul(&engine->defaults, p->offset)); +} + void intel_engine_dump(struct intel_engine_cs *engine, struct drm_printer *m, const char *header, ...) @@ -1641,6 +1676,7 @@ void intel_engine_dump(struct intel_engine_cs *engine, drm_printf(m, "\tReset count: %d (global %d)\n", i915_reset_engine_count(error, engine), i915_reset_count(error)); + print_properties(engine, m); drm_printf(m, "\tRequests:\n"); -- cgit v1.2.3 From 293f43c80c0027ff9299036c24218ac705ce584e Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Wed, 16 Sep 2020 10:00:58 +0100 Subject: drm/i915: Break up error capture compression loops with cond_resched() As the error capture will compress user buffers as directed to by the user, it can take an arbitrary amount of time and space. Break up the compression loops with a call to cond_resched(), that will allow other processes to schedule (avoiding the soft lockups) and also serve as a warning should we try to make this loop atomic in the future. Testcase: igt/gem_exec_capture/many-* Signed-off-by: Chris Wilson Cc: Mika Kuoppala Cc: stable@vger.kernel.org Reviewed-by: Mika Kuoppala Link: https://patchwork.freedesktop.org/patch/msgid/20200916090059.3189-2-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_gpu_error.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 6a3a2ce0b394..6551ff04d5a6 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -311,6 +311,8 @@ static int compress_page(struct i915_vma_compress *c, if (zlib_deflate(zstream, Z_NO_FLUSH) != Z_OK) return -EIO; + + cond_resched(); } while (zstream->avail_in); /* Fallback to uncompressed if we increase size? 
*/ @@ -397,6 +399,7 @@ static int compress_page(struct i915_vma_compress *c, if (!(wc && i915_memcpy_from_wc(ptr, src, PAGE_SIZE))) memcpy(ptr, src, PAGE_SIZE); dst->pages[dst->page_count++] = ptr; + cond_resched(); return 0; } -- cgit v1.2.3 From f2acf74068b0ac56289ecdd9739e9aaee1a46d21 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Wed, 16 Sep 2020 10:00:59 +0100 Subject: drm/i915: Reduce GPU error capture mutex hold time Shrink the hold time for the error capture mutex to just around the acquire/release of the PTE used for reading back the object via the Global GTT. For platforms that do not need the GGTT read back, we can skip the mutex entirely and allow concurrent error capture. Where we do use the GGTT, by restricting the hold time around the slow readback and compression, we are more resilient against soft lockups (khungtaskd) as the heartbeat may well also trigger an error while the first is ongoing, and this allows the heartbeat reset to skip past the capture and not be stalled. Testcase: igt/gem_exec_capture/many-* Signed-off-by: Chris Wilson Cc: Mika Kuoppala Reviewed-by: Matthew Auld Link: https://patchwork.freedesktop.org/patch/msgid/20200916090059.3189-3-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_gpu_error.c | 28 +++++----------------------- 1 file changed, 5 insertions(+), 23 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 6551ff04d5a6..41a82855d046 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -1026,6 +1026,7 @@ i915_vma_coredump_create(const struct intel_gt *gt, dma_addr_t dma; for_each_sgt_daddr(dma, iter, vma->pages) { + mutex_lock(&ggtt->error_mutex); ggtt->vm.insert_page(&ggtt->vm, dma, slot, I915_CACHE_NONE, 0); mb(); @@ -1035,6 +1036,10 @@ i915_vma_coredump_create(const struct intel_gt *gt, (void __force *)s, dst, true); io_mapping_unmap(s); + + mb(); + ggtt->vm.clear_range(&ggtt->vm, slot, PAGE_SIZE); + mutex_unlock(&ggtt->error_mutex); if (ret) break; } @@ -1506,25 +1511,6 @@ gt_record_uc(struct intel_gt_coredump *gt, return error_uc; } -static void gt_capture_prepare(struct intel_gt_coredump *gt) -{ - struct i915_ggtt *ggtt = gt->_gt->ggtt; - - mutex_lock(&ggtt->error_mutex); -} - -static void gt_capture_finish(struct intel_gt_coredump *gt) -{ - struct i915_ggtt *ggtt = gt->_gt->ggtt; - - if (drm_mm_node_allocated(&ggtt->error_capture)) - ggtt->vm.clear_range(&ggtt->vm, - ggtt->error_capture.start, - PAGE_SIZE); - - mutex_unlock(&ggtt->error_mutex); -} - /* Capture all registers which don't fit into another category. */ static void gt_record_regs(struct intel_gt_coredump *gt) { @@ -1783,8 +1769,6 @@ i915_vma_capture_prepare(struct intel_gt_coredump *gt) return NULL; } - gt_capture_prepare(gt); - return compress; } @@ -1794,8 +1778,6 @@ void i915_vma_capture_finish(struct intel_gt_coredump *gt, if (!compress) return; - gt_capture_finish(gt); - compress_fini(compress); kfree(compress); } -- cgit v1.2.3 From 29545e5cd27d71640249a995ee84a8376c78a338 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Wed, 26 Aug 2020 14:27:48 +0100 Subject: drm/i915/gt: Remove defunct intel_virtual_engine_get_sibling() As the last user was eliminated in commit e21fecdcde40 ("drm/i915/gt: Distinguish the virtual breadcrumbs from the irq breadcrumbs"), we can remove the function. One less implementation detail creeping beyond its scope.
Signed-off-by: Chris Wilson Cc: Tvrtko Ursulin Reviewed-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20200826132811.17577-16-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/gt/intel_lrc.c | 12 ------------ drivers/gpu/drm/i915/gt/intel_lrc.h | 4 ---- 2 files changed, 16 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c index 0d57b54e5a69..1a1c3b077b7b 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.c +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -5953,18 +5953,6 @@ int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine, return 0; } -struct intel_engine_cs * -intel_virtual_engine_get_sibling(struct intel_engine_cs *engine, - unsigned int sibling) -{ - struct virtual_engine *ve = to_virtual_engine(engine); - - if (sibling >= ve->num_siblings) - return NULL; - - return ve->siblings[sibling]; -} - void intel_execlists_show_requests(struct intel_engine_cs *engine, struct drm_printer *m, void (*show_request)(struct drm_printer *m, diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.h b/drivers/gpu/drm/i915/gt/intel_lrc.h index 91fd8e452d9b..c2d287f25497 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.h +++ b/drivers/gpu/drm/i915/gt/intel_lrc.h @@ -121,10 +121,6 @@ int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine, const struct intel_engine_cs *master, const struct intel_engine_cs *sibling); -struct intel_engine_cs * -intel_virtual_engine_get_sibling(struct intel_engine_cs *engine, - unsigned int sibling); - bool intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine); -- cgit v1.2.3 From 4316b19dee27cc5cd34a95fdbc0a3a5237507701 Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Fri, 18 Sep 2020 13:12:08 +0200 Subject: drm/i915: Fix uninitialised variable in intel_context_create_request. In case backoff fails with an error, we return an undefined rq; assign err to rq correctly. Fixes: 8a929c9eb1c2 ("drm/i915: Use ww pinning for intel_context_create_request()") Signed-off-by: Maarten Lankhorst Link: https://patchwork.freedesktop.org/patch/msgid/20200918111208.1392128-1-maarten.lankhorst@linux.intel.com Reviewed-by: Rodrigo Vivi --- drivers/gpu/drm/i915/gt/intel_context.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c index d301dda1b261..92a3f25c4006 100644 --- a/drivers/gpu/drm/i915/gt/intel_context.c +++ b/drivers/gpu/drm/i915/gt/intel_context.c @@ -472,6 +472,7 @@ retry: err = i915_gem_ww_ctx_backoff(&ww); if (!err) goto retry; + rq = ERR_PTR(err); } else { rq = ERR_PTR(err); } -- cgit v1.2.3 From 1604cb2aa7fafd83e11f9257f765a5f5dd7c19d3 Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Mon, 21 Sep 2020 17:08:44 +0100 Subject: drm/i915: check i915_vm_alloc_pt_stash for errors If we are really unlucky and encounter an error during i915_vm_alloc_pt_stash, we end up passing an empty pt/pd stash all the way down into the low-level ppgtt alloc code, leading to explosions, since it expects at least the required number of pt/pd for the va range.
[ 211.981418] BUG: kernel NULL pointer dereference, address: 0000000000000000 [ 211.981421] #PF: supervisor read access in kernel mode [ 211.981422] #PF: error_code(0x0000) - not-present page [ 211.981424] PGD 80000008439cb067 P4D 80000008439cb067 PUD 84a37f067 PMD 0 [ 211.981427] Oops: 0000 [#1] SMP PTI [ 211.981428] CPU: 1 PID: 1301 Comm: i915_selftest Tainted: G U I 5.9.0-rc5+ #3 [ 211.981430] Hardware name: /NUC6i7KYB, BIOS KYSKLi70.86A.0050.2017.0831.1924 08/31/2017 [ 211.981521] RIP: 0010:__gen8_ppgtt_alloc+0x1ed/0x3c0 [i915] [ 211.981523] Code: c1 48 c7 c7 5d 5d fe c0 65 ff 0d ee 1d 03 3f e8 d9 91 1f e2 8b 55 c4 31 c0 48 8b 75 b8 85 d2 0f 95 c0 48 8b 1c c6 48 89 45 98 <48> 8b 03 48 8b 90 58 02 00 00 48 85 d2 0f 84 07 ea 15 00 48 81 fa [ 211.981526] RSP: 0018:ffffba2cc0eb3970 EFLAGS: 00010202 [ 211.981527] RAX: 0000000000000001 RBX: 0000000000000000 RCX: 0000000000000004 [ 211.981529] RDX: 0000000000000002 RSI: ffff9be998bdb8c0 RDI: ffff9be99c844300 [ 211.981530] RBP: ffffba2cc0eb39d8 R08: 0000000000000640 R09: ffff9be97cdfd000 [ 211.981531] R10: ffff9be97cdfd614 R11: 0000000000000000 R12: 0000000000000000 [ 211.981532] R13: ffff9be98607ba20 R14: ffff9be995a0b400 R15: ffffba2cc0eb39e8 [ 211.981534] FS: 00007f0f10b31000(0000) GS:ffff9be99fc40000(0000) knlGS:0000000000000000 [ 211.981536] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 211.981538] CR2: 0000000000000000 CR3: 000000084d74e006 CR4: 00000000003706e0 [ 211.981539] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 211.981541] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 211.981542] Call Trace: [ 211.981609] gen8_ppgtt_alloc+0x79/0x90 [i915] [ 211.981678] ppgtt_bind_vma+0x36/0x80 [i915] [ 211.981756] __vma_bind+0x39/0x40 [i915] [ 211.981818] fence_work+0x21/0x98 [i915] [ 211.981879] fence_notify+0x8d/0x128 [i915] [ 211.981939] __i915_sw_fence_complete+0x62/0x240 [i915] [ 211.982018] i915_vma_pin_ww+0x1ee/0x9c0 [i915] Fixes: cd0452aa2a0d ("drm/i915: Preallocate stashes for vma page-directories") Signed-off-by: Matthew Auld Cc: Chris Wilson Reviewed-by: Chris Wilson Signed-off-by: Chris Wilson Link: https://patchwork.freedesktop.org/patch/msgid/20200921160844.73186-1-matthew.auld@intel.com --- drivers/gpu/drm/i915/i915_vma.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c index 495d28f6d160..ffb5287e055a 100644 --- a/drivers/gpu/drm/i915/i915_vma.c +++ b/drivers/gpu/drm/i915/i915_vma.c @@ -892,9 +892,11 @@ int i915_vma_pin_ww(struct i915_vma *vma, struct i915_gem_ww_ctx *ww, /* Allocate enough page directories to used PTE */ if (vma->vm->allocate_va_range) { - i915_vm_alloc_pt_stash(vma->vm, - &work->stash, - vma->size); + err = i915_vm_alloc_pt_stash(vma->vm, + &work->stash, + vma->size); + if (err) + goto err_fence; err = i915_vm_pin_pt_stash(vma->vm, &work->stash); -- cgit v1.2.3 From 5ae26012a159febcd2ad7a920b5f9b33ef87c333 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Thu, 17 Sep 2020 19:50:56 +0300 Subject: drm/i915/uc: tune down GuC communication enabled/disabled messages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The GuC communication enabled/disabled messages are too noisy in info level. Convert them from info to debug level, and switch to device based logging while at it. 
Reported-by: Karol Herbst Cc: Karol Herbst Reviewed-by: Ville Syrjälä Signed-off-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/20200917165056.29766-1-jani.nikula@intel.com --- drivers/gpu/drm/i915/gt/uc/intel_uc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c b/drivers/gpu/drm/i915/gt/uc/intel_uc.c index d6f55f70889d..4e6070e95fe9 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c @@ -231,13 +231,15 @@ static int guc_enable_communication(struct intel_guc *guc) intel_guc_ct_event_handler(&guc->ct); spin_unlock_irq(&i915->irq_lock); - DRM_INFO("GuC communication enabled\n"); + drm_dbg(&i915->drm, "GuC communication enabled\n"); return 0; } static void guc_disable_communication(struct intel_guc *guc) { + struct drm_i915_private *i915 = guc_to_gt(guc)->i915; + /* * Events generated during or after CT disable are logged by guc in * via mmio. Make sure the register is clear before disabling CT since @@ -257,7 +259,7 @@ static void guc_disable_communication(struct intel_guc *guc) */ guc_get_mmio_msg(guc); - DRM_INFO("GuC communication disabled\n"); + drm_dbg(&i915->drm, "GuC communication disabled\n"); } static void __uc_fetch_firmwares(struct intel_uc *uc) -- cgit v1.2.3 From 102f5aa491f262c818e607fc4fee08a724a76c69 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 23 Jul 2020 18:21:19 +0100 Subject: drm/i915/gem: Serialise debugfs i915_gem_objects with ctx->mutex Since the debugfs may peek into the GEM contexts as the corresponding client/fd is being closed, we may try and follow a dangling pointer. However, the context closure itself is serialised with the ctx->mutex, so if we hold that mutex as we inspect the state coupled in the context, we know the pointers within the context are stable and will remain valid as we inspect their tables. Signed-off-by: Chris Wilson Cc: CQ Tang Cc: Daniel Vetter Cc: stable@vger.kernel.org Reviewed-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20200723172119.17649-3-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_debugfs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index 784219962193..ea469168cd44 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -326,6 +326,7 @@ static void print_context_stats(struct seq_file *m, } i915_gem_context_unlock_engines(ctx); + mutex_lock(&ctx->mutex); if (!IS_ERR_OR_NULL(ctx->file_priv)) { struct file_stats stats = { .vm = rcu_access_pointer(ctx->vm), @@ -346,6 +347,7 @@ static void print_context_stats(struct seq_file *m, print_file_stats(m, name, stats); } + mutex_unlock(&ctx->mutex); spin_lock(&i915->gem.contexts.lock); list_safe_reset_next(ctx, cn, link); -- cgit v1.2.3 From 35faeb7de9ef83da510a048f2016061f1e31d5fc Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 25 Sep 2020 11:11:06 +0100 Subject: drm/i915: Redo "Remove i915_request.lock requirement for execution callbacks" The reordering and rebasing of commit 2e4c6c1a9db5 ("drm/i915: Remove i915_request.lock requirement for execution callbacks") caused it to revert an earlier correction. 
Let us restore commit 99f0a640d464 ("drm/i915: Remove requirement for holding i915_request.lock for breadcrumbs") Fixes: 2e4c6c1a9db5 ("drm/i915: Remove i915_request.lock requirement for execution callbacks") Signed-off-by: Chris Wilson Cc: Tvrtko Ursulin Cc: Rodrigo Vivi Cc: Joonas Lahtinen Reviewed-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20200925101107.27869-1-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_request.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index 11e272422fb7..436ce368ddaa 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -593,16 +593,8 @@ xfer: __notify_execute_cb_irq(request); /* We may be recursing from the signal callback of another i915 fence */ - if (!i915_request_signaled(request)) { - spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING); - - if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, - &request->fence.flags) && - !i915_request_enable_breadcrumb(request)) - intel_engine_signal_breadcrumbs(engine); - - spin_unlock(&request->lock); - } + if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags)) + i915_request_enable_breadcrumb(request); return result; } -- cgit v1.2.3 From badef44deff1fae8d21c5c1cfc4dde95fb5bf993 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 25 Sep 2020 11:11:07 +0100 Subject: drm/i915/gem: Hold request reference for canceling an active context We have to be very careful while walking the timeline->requests list under the RCU guard, as the requests (and so rq->link) use SLAB_TYPESAFE_BY_RCU and so the requests may be reallocated within an rcu grace period. As the requests are reallocated, they are removed from one list and placed on another, and if we are iterating over that request at that moment, the list iteration jumps from one list to the next and promptly gets confused. Verify we hold the request reference to ensure that the request is not added to a new list behind our backs. <4> [582.745252] general protection fault, probably for non-canonical address 0xcccccccccccccd5c: 0000 [#1] PREEMPT SMP PTI <4> [582.745297] CPU: 0 PID: 1475 Comm: gem_ctx_persist Not tainted 5.9.0-rc1-CI-CI_DRM_8908+ #1 <4> [582.745304] Hardware name: Intel Corporation NUC7CJYH/NUC7JYB, BIOS JYGLKCPX.86A.0027.2018.0125.1347 01/25/2018 <4> [582.745317] RIP: 0010:__lock_acquire+0x2c3/0x1f40 <4> [582.745323] Code: 00 65 8b 05 c7 8a ef 7e 85 c0 0f 85 b4 07 00 00 44 8b 9d c4 08 00 00 45 85 db 0f 84 0f 01 00 00 ba 05 00 00 00 e9 c8 06 00 00 <48> 81 3f c0 89 c7 82 b8 00 00 00 00 41 0f 45 c0 83 fe 01 41 89 c3 <4> [582.745334] RSP: 0018:ffffc9000461bc40 EFLAGS: 00010002 <4> [582.745340] RAX: 0000000000000000 RBX: 0000000000000001 RCX: 0000000000000000 <4> [582.745345] RDX: 0000000000000000 RSI: 0000000000000000 RDI: cccccccccccccd5c <4> [582.745350] RBP: ffff8881ec4a2880 R08: 0000000000000001 R09: 0000000000000001 <4> [582.745356] R10: 0000000000000001 R11: 0000000000000001 R12: 0000000000000000 <4> [582.745361] R13: 0000000000000000 R14: 0000000000000000 R15: cccccccccccccd5c <4> [582.745367] FS: 00007fb44da78e40(0000) GS:ffff888278000000(0000) knlGS:0000000000000000 <4> [582.745373] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 <4> [582.745378] CR2: 00007fb44daad040 CR3: 0000000268428000 CR4: 0000000000350ef0 <4> [582.745383] Call Trace: <4> [582.745390] ? __lock_acquire+0x913/0x1f40 <4> [582.745397] lock_acquire+0xb5/0x3c0 <4> [582.745526] ? 
kill_engines+0x19a/0x4b0 [i915] <4> [582.745533] ? find_held_lock+0x2d/0x90 <4> [582.745541] _raw_spin_lock_irq+0x30/0x40 <4> [582.745635] ? kill_engines+0x19a/0x4b0 [i915] <4> [582.745727] kill_engines+0x19a/0x4b0 [i915] <4> [582.745820] context_close+0x195/0x410 [i915] <4> [582.745912] i915_gem_context_close+0x5b/0x160 [i915] <4> [582.745994] i915_driver_postclose+0x14/0x40 [i915] <4> [582.746003] drm_file_free.part.13+0x240/0x290 <4> [582.746009] drm_release_noglobal+0x16/0x50 <4> [582.746016] __fput+0xa5/0x250 <4> [582.746021] task_work_run+0x6e/0xb0 <4> [582.746028] exit_to_user_mode_prepare+0x178/0x180 <4> [582.746034] syscall_exit_to_user_mode+0x36/0x220 <4> [582.746040] entry_SYSCALL_64_after_hwframe+0x44/0xa9 <4> [582.746045] RIP: 0033:0x7fb44d1dc421 <4> [582.746050] Code: f7 d8 64 89 02 48 c7 c0 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 8b 05 ea cf 20 00 85 c0 75 16 b8 03 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 3f f3 c3 0f 1f 44 00 00 53 89 fb 48 83 ec 10 <4> [582.746062] RSP: 002b:00007ffed2e83818 EFLAGS: 00000246 ORIG_RAX: 0000000000000003 <4> [582.746069] RAX: 0000000000000000 RBX: 0000556410bfe840 RCX: 00007fb44d1dc421 <4> [582.746075] RDX: 000000000000000a RSI: 00000000c0406469 RDI: 0000000000000008 <4> [582.746080] RBP: 0000000000000008 R08: 00007fb44d1c51cc R09: 00007fb44d1c5240 <4> [582.746086] R10: 0000000000000001 R11: 0000000000000246 R12: 00000000fffffffb <4> [582.746091] R13: 0000000000000006 R14: 0000000000000000 R15: 000000000000000a <4> [582.746099] Modules linked in: vgem mei_hdcp snd_hda_codec_hdmi snd_hda_codec_realtek snd_hda_codec_generic ledtrig_audio btusb btrtl btbcm btintel x86_pkg_temp_thermal coretemp crct10dif_pclmul crc32_pclmul bluetooth ghash_clmulni_intel ecdh_generic ecc i915 r8169 realtek mei_me mei snd_hda_intel i2c_hid snd_intel_dspcfg snd_hda_codec snd_hwdep snd_hda_core snd_pcm pinctrl_geminilake pinctrl_intel prime_numbers [last unloaded: test_drm_mm] Fixes: 736e785f9b28 ("drm/i915/gem: Reduce context termination list iteration guard to RCU") Signed-off-by: Chris Wilson Reviewed-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20200925101107.27869-2-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/gem/i915_gem_context.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c index cf5ecbde9e06..a548626fa8bc 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c @@ -460,8 +460,8 @@ __active_engine(struct i915_request *rq, struct intel_engine_cs **active) spin_lock(&locked->active.lock); } - if (!i915_request_completed(rq)) { - if (i915_request_is_active(rq) && rq->fence.error != -EIO) + if (i915_request_is_active(rq)) { + if (!i915_request_completed(rq)) *active = locked; ret = true; } @@ -479,13 +479,26 @@ static struct intel_engine_cs *active_engine(struct intel_context *ce) if (!ce->timeline) return NULL; + /* + * rq->link is only SLAB_TYPESAFE_BY_RCU, we need to hold a reference + * to the request to prevent it being transferred to a new timeline + * (and onto a new timeline->requests list). + */ rcu_read_lock(); - list_for_each_entry_rcu(rq, &ce->timeline->requests, link) { - if (i915_request_is_active(rq) && i915_request_completed(rq)) - continue; + list_for_each_entry_reverse(rq, &ce->timeline->requests, link) { + bool found; + + /* timeline is already completed upto this point? 
*/ + if (!i915_request_get_rcu(rq)) + break; /* Check with the backend if the request is inflight */ - if (__active_engine(rq, &engine)) + found = true; + if (likely(rcu_access_pointer(rq->timeline) == ce->timeline)) + found = __active_engine(rq, &engine); + + i915_request_put(rq); + if (found) break; } rcu_read_unlock(); -- cgit v1.2.3 From 7a991cd3e3da9a56d5616b62d425db000a3242f2 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 28 Sep 2020 23:15:08 +0100 Subject: drm/i915: Cancel outstanding work after disabling heartbeats on an engine We only allow persistent requests to remain on the GPU past the closure of their containing context (and process) so long as they are continuously checked for hangs or allow other requests to preempt them, as we need to ensure forward progress of the system. If we allow persistent contexts to remain on the system after the hangcheck mechanism is disabled, the system may grind to a halt. On disabling the mechanism, we sent a pulse along the engine to remove all executing contexts from the engine which would check for hung contexts -- but we did not prevent those contexts from being resubmitted if they survived the final hangcheck. Fixes: 9a40bddd47ca ("drm/i915/gt: Expose heartbeat interval via sysfs") Testcase: igt/gem_ctx_persistence/heartbeat-stop Signed-off-by: Chris Wilson Cc: Joonas Lahtinen Cc: # v5.7+ Reviewed-by: Tvrtko Ursulin Acked-by: Joonas Lahtinen Link: https://patchwork.freedesktop.org/patch/msgid/20200928221510.26044-1-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/gt/intel_engine.h | 9 +++++++++ drivers/gpu/drm/i915/i915_request.c | 5 +++++ 2 files changed, 14 insertions(+) diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h b/drivers/gpu/drm/i915/gt/intel_engine.h index 08e2c000dcc3..7c3a1012e702 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine.h +++ b/drivers/gpu/drm/i915/gt/intel_engine.h @@ -337,4 +337,13 @@ intel_engine_has_preempt_reset(const struct intel_engine_cs *engine) return intel_engine_has_preemption(engine); } +static inline bool +intel_engine_has_heartbeat(const struct intel_engine_cs *engine) +{ + if (!IS_ACTIVE(CONFIG_DRM_I915_HEARTBEAT_INTERVAL)) + return false; + + return READ_ONCE(engine->props.heartbeat_interval_ms); +} + #endif /* _INTEL_RINGBUFFER_H_ */ diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index 436ce368ddaa..0e813819b041 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -542,8 +542,13 @@ bool __i915_request_submit(struct i915_request *request) if (i915_request_completed(request)) goto xfer; + if (unlikely(intel_context_is_closed(request->context) && + !intel_engine_has_heartbeat(engine))) + intel_context_set_banned(request->context); + if (unlikely(intel_context_is_banned(request->context))) i915_request_set_error_once(request, -EIO); + if (unlikely(fatal_error(request->fence.error))) __i915_request_skip(request); -- cgit v1.2.3 From 3dd66a94de59d7792e7917eb3075342e70f06f44 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 28 Sep 2020 23:15:09 +0100 Subject: drm/i915/gt: Always send a pulse down the engine after disabling heartbeat Currently, we check that we can send a pulse prior to disabling the heartbeat, to verify that we can change the heartbeat; but since we may re-evaluate execution upon changing the heartbeat interval, we need another pulse afterwards to refresh execution.
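In outline, a simplified sketch of the new ordering (the wrapper name is invented; set_heartbeat() and __intel_engine_pulse() are introduced in the diff below): apply the new interval first, then send a pulse so the change is re-evaluated against whatever is executing, and unwind the property if the pulse cannot be submitted.

/* assumes engine PM is held and ce->timeline->mutex is locked */
static int __set_heartbeat_and_pulse(struct intel_engine_cs *engine,
				     unsigned long delay)
{
	unsigned long saved = set_heartbeat(engine, delay);
	int err = __intel_engine_pulse(engine);

	if (err) /* could not refresh execution: restore the old interval */
		set_heartbeat(engine, saved);

	return err;
}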
v2: Tvrtko asked if we could reduce the double pulse to a single, which opened up a discussion of how we should handle the pulse-error after attempting to change the property, and the desire to serialise adjustment of the property with its validating pulse, and unwind upon failure. Fixes: 9a40bddd47ca ("drm/i915/gt: Expose heartbeat interval via sysfs") Signed-off-by: Chris Wilson Cc: Joonas Lahtinen Cc: Tvrtko Ursulin Cc: # v5.7+ Reviewed-by: Tvrtko Ursulin Acked-by: Joonas Lahtinen Link: https://patchwork.freedesktop.org/patch/msgid/20200928221510.26044-2-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c | 106 ++++++++++++++--------- 1 file changed, 67 insertions(+), 39 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c index 8ffdf676c0a0..5067d0524d4b 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c @@ -177,36 +177,82 @@ void intel_engine_init_heartbeat(struct intel_engine_cs *engine) INIT_DELAYED_WORK(&engine->heartbeat.work, heartbeat); } +static int __intel_engine_pulse(struct intel_engine_cs *engine) +{ + struct i915_sched_attr attr = { .priority = I915_PRIORITY_BARRIER }; + struct intel_context *ce = engine->kernel_context; + struct i915_request *rq; + + lockdep_assert_held(&ce->timeline->mutex); + GEM_BUG_ON(!intel_engine_has_preemption(engine)); + GEM_BUG_ON(!intel_engine_pm_is_awake(engine)); + + intel_context_enter(ce); + rq = __i915_request_create(ce, GFP_NOWAIT | __GFP_NOWARN); + intel_context_exit(ce); + if (IS_ERR(rq)) + return PTR_ERR(rq); + + __set_bit(I915_FENCE_FLAG_SENTINEL, &rq->fence.flags); + idle_pulse(engine, rq); + + __i915_request_commit(rq); + __i915_request_queue(rq, &attr); + GEM_BUG_ON(rq->sched.attr.priority < I915_PRIORITY_BARRIER); + + return 0; +} + +static unsigned long set_heartbeat(struct intel_engine_cs *engine, + unsigned long delay) +{ + unsigned long old; + + old = xchg(&engine->props.heartbeat_interval_ms, delay); + if (delay) + intel_engine_unpark_heartbeat(engine); + else + intel_engine_park_heartbeat(engine); + + return old; +} + int intel_engine_set_heartbeat(struct intel_engine_cs *engine, unsigned long delay) { - int err; + struct intel_context *ce = engine->kernel_context; + int err = 0; - /* Send one last pulse before to cleanup persistent hogs */ - if (!delay && IS_ACTIVE(CONFIG_DRM_I915_PREEMPT_TIMEOUT)) { - err = intel_engine_pulse(engine); - if (err) - return err; - } + if (!delay && !intel_engine_has_preempt_reset(engine)) + return -ENODEV; + + intel_engine_pm_get(engine); + + err = mutex_lock_interruptible(&ce->timeline->mutex); + if (err) + goto out_rpm; - WRITE_ONCE(engine->props.heartbeat_interval_ms, delay); + if (delay != engine->props.heartbeat_interval_ms) { + unsigned long saved = set_heartbeat(engine, delay); - if (intel_engine_pm_get_if_awake(engine)) { - if (delay) - intel_engine_unpark_heartbeat(engine); - else - intel_engine_park_heartbeat(engine); - intel_engine_pm_put(engine); + /* recheck current execution */ + if (intel_engine_has_preemption(engine)) { + err = __intel_engine_pulse(engine); + if (err) + set_heartbeat(engine, saved); + } } - return 0; + mutex_unlock(&ce->timeline->mutex); + +out_rpm: + intel_engine_pm_put(engine); + return err; } int intel_engine_pulse(struct intel_engine_cs *engine) { - struct i915_sched_attr attr = { .priority = I915_PRIORITY_BARRIER }; struct intel_context *ce = engine->kernel_context; - struct i915_request *rq; 
int err; if (!intel_engine_has_preemption(engine) @@ -215,30 +261,12 @@ int intel_engine_pulse(struct intel_engine_cs *engine) if (!intel_engine_pm_get_if_awake(engine)) return 0; - if (mutex_lock_interruptible(&ce->timeline->mutex)) { - err = -EINTR; - goto out_rpm; - } - - intel_context_enter(ce); - rq = __i915_request_create(ce, GFP_NOWAIT | __GFP_NOWARN); - intel_context_exit(ce); - if (IS_ERR(rq)) { - err = PTR_ERR(rq); - goto out_unlock; + err = -EINTR; + if (!mutex_lock_interruptible(&ce->timeline->mutex)) { + err = __intel_engine_pulse(engine); + mutex_unlock(&ce->timeline->mutex); } - __set_bit(I915_FENCE_FLAG_SENTINEL, &rq->fence.flags); - idle_pulse(engine, rq); - - __i915_request_commit(rq); - __i915_request_queue(rq, &attr); - GEM_BUG_ON(rq->sched.attr.priority < I915_PRIORITY_BARRIER); - err = 0; - -out_unlock: - mutex_unlock(&ce->timeline->mutex); -out_rpm: intel_engine_pm_put(engine); return err; } -- cgit v1.2.3 From d3bb2f9b5ee66d5e000293edd6b6575e59d11db9 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 28 Sep 2020 23:15:10 +0100 Subject: drm/i915/gem: Always test execution status on closing the context Verify that if a context is active at the time it is closed, it is either persistent and preemptible (with hangcheck running) or it shall be removed from execution. Fixes: 9a40bddd47ca ("drm/i915/gt: Expose heartbeat interval via sysfs") Testcase: igt/gem_ctx_persistence/heartbeat-close Signed-off-by: Chris Wilson Cc: Joonas Lahtinen Cc: # v5.7+ Reviewed-by: Tvrtko Ursulin Acked-by: Joonas Lahtinen Link: https://patchwork.freedesktop.org/patch/msgid/20200928221510.26044-3-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/gem/i915_gem_context.c | 48 ++++++----------------------- 1 file changed, 10 insertions(+), 38 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c index a548626fa8bc..4fd38101bb56 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c @@ -390,24 +390,6 @@ __context_engines_static(const struct i915_gem_context *ctx) return rcu_dereference_protected(ctx->engines, true); } -static bool __reset_engine(struct intel_engine_cs *engine) -{ - struct intel_gt *gt = engine->gt; - bool success = false; - - if (!intel_has_reset_engine(gt)) - return false; - - if (!test_and_set_bit(I915_RESET_ENGINE + engine->id, - &gt->reset.flags)) { - success = intel_engine_reset(engine, NULL) == 0; - clear_and_wake_up_bit(I915_RESET_ENGINE + engine->id, - &gt->reset.flags); - } - - return success; -} - static void __reset_context(struct i915_gem_context *ctx, struct intel_engine_cs *engine) { @@ -431,12 +413,7 @@ static bool __cancel_engine(struct intel_engine_cs *engine) * kill the banned context, we fallback to doing a local reset * instead. */ - if (IS_ACTIVE(CONFIG_DRM_I915_PREEMPT_TIMEOUT) && - !intel_engine_pulse(engine)) - return true; - - /* If we are unable to send a pulse, try resetting this engine.
*/ - return __reset_engine(engine); + return intel_engine_pulse(engine) == 0; } static bool @@ -506,7 +483,7 @@ static struct intel_engine_cs *active_engine(struct intel_context *ce) return engine; } -static void kill_engines(struct i915_gem_engines *engines) +static void kill_engines(struct i915_gem_engines *engines, bool ban) { struct i915_gem_engines_iter it; struct intel_context *ce; @@ -521,7 +498,7 @@ static void kill_engines(struct i915_gem_engines *engines) for_each_gem_engine(ce, engines, it) { struct intel_engine_cs *engine; - if (intel_context_set_banned(ce)) + if (ban && intel_context_set_banned(ce)) continue; /* @@ -534,7 +511,7 @@ static void kill_engines(struct i915_gem_engines *engines) engine = active_engine(ce); /* First attempt to gracefully cancel the context */ - if (engine && !__cancel_engine(engine)) + if (engine && !__cancel_engine(engine) && ban) /* * If we are unable to send a preemptive pulse to bump * the context from the GPU, we have to resort to a full @@ -544,8 +521,10 @@ static void kill_engines(struct i915_gem_engines *engines) } } -static void kill_stale_engines(struct i915_gem_context *ctx) +static void kill_context(struct i915_gem_context *ctx) { + bool ban = (!i915_gem_context_is_persistent(ctx) || + !ctx->i915->params.enable_hangcheck); struct i915_gem_engines *pos, *next; spin_lock_irq(&ctx->stale.lock); @@ -558,7 +537,7 @@ static void kill_stale_engines(struct i915_gem_context *ctx) spin_unlock_irq(&ctx->stale.lock); - kill_engines(pos); + kill_engines(pos, ban); spin_lock_irq(&ctx->stale.lock); GEM_BUG_ON(i915_sw_fence_signaled(&pos->fence)); @@ -570,11 +549,6 @@ static void kill_stale_engines(struct i915_gem_context *ctx) spin_unlock_irq(&ctx->stale.lock); } -static void kill_context(struct i915_gem_context *ctx) -{ - kill_stale_engines(ctx); -} - static void engines_idle_release(struct i915_gem_context *ctx, struct i915_gem_engines *engines) { @@ -609,7 +583,7 @@ static void engines_idle_release(struct i915_gem_context *ctx, kill: if (list_empty(&engines->link)) /* raced, already closed */ - kill_engines(engines); + kill_engines(engines, true); i915_sw_fence_commit(&engines->fence); } @@ -667,9 +641,7 @@ static void context_close(struct i915_gem_context *ctx) * case we opt to forcibly kill off all remaining requests on * context close. */ - if (!i915_gem_context_is_persistent(ctx) || - !ctx->i915->params.enable_hangcheck) - kill_context(ctx); + kill_context(ctx); i915_gem_context_put(ctx); } -- cgit v1.2.3 From b7eeb2b4132ccf1a7d38f434cde7043913d1ed3c Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 28 Sep 2020 22:59:42 +0100 Subject: drm/i915: Avoid mixing integer types during batch copies Be consistent and use unsigned long throughout the chunk copies to avoid the inherent clumsiness of mixing integer types of different widths and signs. Failing to take account of a wider unsigned type when using min_t can lead to treating it as a negative, only for it to flip back to a large unsigned value after passing a boundary check.
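As a minimal standalone sketch of that failure mode (illustrative values only; this min_t is a simplification of the kernel macro, which likewise casts both arguments to the requested type):

    #include <stdio.h>

    #define PAGE_SIZE 4096UL
    #define min_t(type, a, b) \
            ((type)(a) < (type)(b) ? (type)(a) : (type)(b))

    int main(void)
    {
            unsigned long length = 0x100000040UL; /* > UINT_MAX */
            unsigned long x = 0;

            /* (int)length truncates to 0x40: the chunked copy quietly
             * continues with 64 bytes instead of a whole page.
             */
            int len = min_t(int, length, PAGE_SIZE - x);
            printf("len = %d\n", len); /* 64 */

            /* 0x80000000 becomes negative as an int, sails past a signed
             * boundary check, then flips back to a huge value once it is
             * converted to a size_t for the actual copy.
             */
            len = min_t(int, 0x80000000UL, PAGE_SIZE - x);
            printf("len = %d\n", len); /* INT_MIN */
            return 0;
    }

Keeping offset and length as unsigned long end to end, as this patch does, avoids both the truncation and the sign flip.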
Fixes: ed13033f0287 ("drm/i915/cmdparser: Only cache the dst vmap") Testcase: igt/gen9_exec_parse/bb-large Reported-by: "Candelaria, Jared" Signed-off-by: Chris Wilson Cc: Mika Kuoppala Cc: Joonas Lahtinen Cc: "Candelaria, Jared" Cc: "Bloomfield, Jon" Cc: # v4.9+ Reviewed-by: Mika Kuoppala Link: https://patchwork.freedesktop.org/patch/msgid/20200928215942.31917-1-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c | 7 +++++-- drivers/gpu/drm/i915/i915_cmd_parser.c | 10 +++++----- drivers/gpu/drm/i915/i915_drv.h | 4 ++-- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c index ae63748e90bd..b7c4cacc3638 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c @@ -2267,8 +2267,8 @@ struct eb_parse_work { struct i915_vma *batch; struct i915_vma *shadow; struct i915_vma *trampoline; - unsigned int batch_offset; - unsigned int batch_length; + unsigned long batch_offset; + unsigned long batch_length; }; static int __eb_parse(struct dma_fence_work *work) @@ -2338,6 +2338,9 @@ static int eb_parse_pipeline(struct i915_execbuffer *eb, struct eb_parse_work *pw; int err; + GEM_BUG_ON(overflows_type(eb->batch_start_offset, pw->batch_offset)); + GEM_BUG_ON(overflows_type(eb->batch_len, pw->batch_length)); + pw = kzalloc(sizeof(*pw), GFP_KERNEL); if (!pw) return -ENOMEM; diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c index 5ac4a999f05a..e88970256e8e 100644 --- a/drivers/gpu/drm/i915/i915_cmd_parser.c +++ b/drivers/gpu/drm/i915/i915_cmd_parser.c @@ -1136,7 +1136,7 @@ find_reg(const struct intel_engine_cs *engine, u32 addr) /* Returns a vmap'd pointer to dst_obj, which the caller must unmap */ static u32 *copy_batch(struct drm_i915_gem_object *dst_obj, struct drm_i915_gem_object *src_obj, - u32 offset, u32 length) + unsigned long offset, unsigned long length) { bool needs_clflush; void *dst, *src; @@ -1166,8 +1166,8 @@ static u32 *copy_batch(struct drm_i915_gem_object *dst_obj, } } if (IS_ERR(src)) { + unsigned long x, n; void *ptr; - int x, n; /* * We can avoid clflushing partial cachelines before the write @@ -1184,7 +1184,7 @@ static u32 *copy_batch(struct drm_i915_gem_object *dst_obj, ptr = dst; x = offset_in_page(offset); for (n = offset >> PAGE_SHIFT; length; n++) { - int len = min_t(int, length, PAGE_SIZE - x); + int len = min(length, PAGE_SIZE - x); src = kmap_atomic(i915_gem_object_get_page(src_obj, n)); if (needs_clflush) @@ -1414,8 +1414,8 @@ static bool shadow_needs_clflush(struct drm_i915_gem_object *obj) */ int intel_engine_cmd_parser(struct intel_engine_cs *engine, struct i915_vma *batch, - u32 batch_offset, - u32 batch_length, + unsigned long batch_offset, + unsigned long batch_length, struct i915_vma *shadow, bool trampoline) { diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index ab17084af0ff..9cfc0eba4b12 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1931,8 +1931,8 @@ void intel_engine_init_cmd_parser(struct intel_engine_cs *engine); void intel_engine_cleanup_cmd_parser(struct intel_engine_cs *engine); int intel_engine_cmd_parser(struct intel_engine_cs *engine, struct i915_vma *batch, - u32 batch_offset, - u32 batch_length, + unsigned long batch_offset, + unsigned long batch_length, struct i915_vma *shadow, bool trampoline); #define I915_CMD_PARSER_TRAMPOLINE_SIZE 8 -- cgit v1.2.3 From 
5e39b4d94cad76eb6c1b0b7fd0f064a2155d0d0a Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Wed, 30 Sep 2020 17:32:51 +0100 Subject: drm/i915/gt: Signal cancelled requests After marking the requests on an engine as cancelled upon wedging, send any signals for their completions. Signed-off-by: Chris Wilson Reviewed-by: Matthew Auld Link: https://patchwork.freedesktop.org/patch/msgid/20200930163253.2789-1-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/gt/intel_lrc.c | 1 + drivers/gpu/drm/i915/gt/intel_ring_submission.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c index 1a1c3b077b7b..287537089c77 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.c +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -4407,6 +4407,7 @@ static void execlists_reset_cancel(struct intel_engine_cs *engine) /* Mark all executing requests as skipped. */ list_for_each_entry(rq, &engine->active.requests, sched.link) mark_eio(rq); + intel_engine_signal_breadcrumbs(engine); /* Flush the queued requests to the timeline list (for retiring). */ while ((rb = rb_first_cached(&execlists->queue))) { diff --git a/drivers/gpu/drm/i915/gt/intel_ring_submission.c b/drivers/gpu/drm/i915/gt/intel_ring_submission.c index 1ca1bac81cf6..9ecf9520fa46 100644 --- a/drivers/gpu/drm/i915/gt/intel_ring_submission.c +++ b/drivers/gpu/drm/i915/gt/intel_ring_submission.c @@ -444,6 +444,7 @@ static void reset_cancel(struct intel_engine_cs *engine) i915_request_set_error_once(request, -EIO); i915_request_mark_complete(request); } + intel_engine_signal_breadcrumbs(engine); /* Remaining _unready_ requests will be nop'ed when submitted */ -- cgit v1.2.3 From eb3afbe18eb193a6cbf6541b3876c3fe14ad2a51 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Wed, 30 Sep 2020 17:32:52 +0100 Subject: drm/i915/selftests: Finish pending mock requests on cancellation. Flush all the pending requests from the mock engine when they are cancelled. Signed-off-by: Chris Wilson Reviewed-by: Matthew Auld Link: https://patchwork.freedesktop.org/patch/msgid/20200930163253.2789-2-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/gt/mock_engine.c | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/mock_engine.c b/drivers/gpu/drm/i915/gt/mock_engine.c index dfd1cfb8a7ec..2f830017c51d 100644 --- a/drivers/gpu/drm/i915/gt/mock_engine.c +++ b/drivers/gpu/drm/i915/gt/mock_engine.c @@ -245,18 +245,39 @@ static void mock_reset_rewind(struct intel_engine_cs *engine, bool stalled) GEM_BUG_ON(stalled); } +static void mark_eio(struct i915_request *rq) +{ + if (i915_request_completed(rq)) + return; + + GEM_BUG_ON(i915_request_signaled(rq)); + + i915_request_set_error_once(rq, -EIO); + i915_request_mark_complete(rq); +} + static void mock_reset_cancel(struct intel_engine_cs *engine) { - struct i915_request *request; + struct mock_engine *mock = + container_of(engine, typeof(*mock), base); + struct i915_request *rq; unsigned long flags; + del_timer_sync(&mock->hw_delay); + spin_lock_irqsave(&engine->active.lock, flags); /* Mark all submitted requests as skipped. */ - list_for_each_entry(request, &engine->active.requests, sched.link) { - i915_request_set_error_once(request, -EIO); - i915_request_mark_complete(request); + list_for_each_entry(rq, &engine->active.requests, sched.link) + mark_eio(rq); + intel_engine_signal_breadcrumbs(engine); + + /* Cancel and submit all pending requests. 
*/ + list_for_each_entry(rq, &mock->hw_queue, mock.link) { + mark_eio(rq); + __i915_request_submit(rq); } + INIT_LIST_HEAD(&mock->hw_queue); spin_unlock_irqrestore(&engine->active.lock, flags); } -- cgit v1.2.3 From b05734720de982a40676a14ddfdca5f87b4aab5e Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Wed, 30 Sep 2020 17:32:53 +0100 Subject: drm/i915/gt: Retire cancelled requests on unload If we manage to hit the intel_gt_set_wedged_on_fini() while active, i.e. module unload during a stress test, we may cancel the requests but not clean up. This leads to a very slow module unload as we wait for something or other to trigger the retirement flushing, or timeout and unload with a bunch of warnings. Instead, if we explicitly cancel and then clean up on an active unload, it should be instant and quiet. Signed-off-by: Chris Wilson Reviewed-by: Matthew Auld Link: https://patchwork.freedesktop.org/patch/msgid/20200930163253.2789-3-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/gt/intel_reset.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c index ac36b67fb46b..4e5e13dc95da 100644 --- a/drivers/gpu/drm/i915/gt/intel_reset.c +++ b/drivers/gpu/drm/i915/gt/intel_reset.c @@ -19,6 +19,7 @@ #include "intel_engine_pm.h" #include "intel_gt.h" #include "intel_gt_pm.h" +#include "intel_gt_requests.h" #include "intel_reset.h" #include "uc/intel_guc.h" @@ -1370,6 +1371,7 @@ void intel_gt_set_wedged_on_fini(struct intel_gt *gt) { intel_gt_set_wedged(gt); set_bit(I915_WEDGED_ON_FINI, &gt->reset.flags); + intel_gt_retire_requests(gt); /* cleanup any wedged requests */ } void intel_gt_init_reset(struct intel_gt *gt) -- cgit v1.2.3 From a6c5e2aea70451f6f1012734a606721c75c57276 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 1 Oct 2020 11:26:32 +0100 Subject: drm/i915: Skip over MI_NOOP when parsing Though less likely in practice, igt uses MI_NOOP frequently to pad out its batch buffers. The lookup and validation of so many MI_NOOP command descriptions is noticeable, though the side-effect of poisoning the last-validated-command cache is more likely to impact upon real CS. Testcase: igt/gen9_exec_parse/bb-large Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Link: https://patchwork.freedesktop.org/patch/msgid/20201001102632.18789-1-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_cmd_parser.c | 67 +++++++++++++++++----------------- 1 file changed, 33 insertions(+), 34 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c index e88970256e8e..93265951fdbb 100644 --- a/drivers/gpu/drm/i915/i915_cmd_parser.c +++ b/drivers/gpu/drm/i915/i915_cmd_parser.c @@ -1452,43 +1452,42 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine, * space. Parsing should be faster in some cases this way.
*/ batch_end = cmd + batch_length / sizeof(*batch_end); - do { - u32 length; - - if (*cmd == MI_BATCH_BUFFER_END) - break; - - desc = find_cmd(engine, *cmd, desc, &default_desc); - if (!desc) { - DRM_DEBUG("CMD: Unrecognized command: 0x%08X\n", *cmd); - ret = -EINVAL; - break; - } + while (*cmd != MI_BATCH_BUFFER_END) { + u32 length = 1; + + if (*cmd != MI_NOOP) { /* MI_NOOP == 0 */ + desc = find_cmd(engine, *cmd, desc, &default_desc); + if (!desc) { + DRM_DEBUG("CMD: Unrecognized command: 0x%08X\n", *cmd); + ret = -EINVAL; + break; + } - if (desc->flags & CMD_DESC_FIXED) - length = desc->length.fixed; - else - length = (*cmd & desc->length.mask) + LENGTH_BIAS; + if (desc->flags & CMD_DESC_FIXED) + length = desc->length.fixed; + else + length = (*cmd & desc->length.mask) + LENGTH_BIAS; - if ((batch_end - cmd) < length) { - DRM_DEBUG("CMD: Command length exceeds batch length: 0x%08X length=%u batchlen=%td\n", - *cmd, - length, - batch_end - cmd); - ret = -EINVAL; - break; - } + if ((batch_end - cmd) < length) { + DRM_DEBUG("CMD: Command length exceeds batch length: 0x%08X length=%u batchlen=%td\n", + *cmd, + length, + batch_end - cmd); + ret = -EINVAL; + break; + } - if (!check_cmd(engine, desc, cmd, length)) { - ret = -EACCES; - break; - } + if (!check_cmd(engine, desc, cmd, length)) { + ret = -EACCES; + break; + } - if (cmd_desc_is(desc, MI_BATCH_BUFFER_START)) { - ret = check_bbstart(cmd, offset, length, batch_length, - batch_addr, shadow_addr, - jump_whitelist); - break; + if (cmd_desc_is(desc, MI_BATCH_BUFFER_START)) { + ret = check_bbstart(cmd, offset, length, batch_length, + batch_addr, shadow_addr, + jump_whitelist); + break; + } } if (!IS_ERR_OR_NULL(jump_whitelist)) @@ -1501,7 +1500,7 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine, ret = -EINVAL; break; } - } while (1); + } if (trampoline) { /* -- cgit v1.2.3 From 25dc89d5270f8bc96e792207de72300d840b7076 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 29 Sep 2020 12:26:39 +0100 Subject: drm/i915/gt: Scrub HW state on remove Currently we do a final scrub of the HW state upon release. However, when rebinding the device, this is too late as the device may either have been partially rebound or the device is no longer accessible. If the device has been removed before release, the reset goes astray leaving the device in an inconsistent state, unlikely to work without a full PCI reset. Furthermore, if the device is partially rebound before the HW scrubbing, there may be leftover HW state that should have been scrubbed. Either way, we need to push the scrubbing earlier before the removal, so into unregister. The danger is that on older machines, resetting the GPU also impacts the display engine and so the reset should be after modesetting is disabled (and before reuse we need to recover modesetting).
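The resulting ordering, reduced to a hedged sketch (every identifier below is a hypothetical stand-in, not the driver's API):

    /* unregister: device still bound, MMIO still usable */
    static void driver_unregister_sketch(struct gt_sketch *gt)
    {
            disable_display(gt);   /* the reset must not disturb scanout */
            wedge(gt);             /* refuse any new work */
            scrub_hw_state(gt);    /* reset engines while access is safe */
    }

    /* release: may run after removal or a partial rebind */
    static void driver_release_sketch(struct gt_sketch *gt)
    {
            free_address_spaces(gt); /* software cleanup only, no MMIO */
    }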
Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/2508 Testcase: igt/core_hotunplug Signed-off-by: Chris Wilson Reviewed-by: Mika Kuoppala Link: https://patchwork.freedesktop.org/patch/msgid/20200929112639.24223-1-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/gt/intel_gt.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c index 39b428c5049c..44f1d51e5ae5 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt.c +++ b/drivers/gpu/drm/i915/gt/intel_gt.c @@ -614,6 +614,8 @@ void intel_gt_driver_remove(struct intel_gt *gt) void intel_gt_driver_unregister(struct intel_gt *gt) { + intel_wakeref_t wakeref; + intel_rps_driver_unregister(&gt->rps); /* @@ -622,16 +624,15 @@ void intel_gt_driver_unregister(struct intel_gt *gt) * resources. */ intel_gt_set_wedged(gt); + + /* Scrub all HW state upon release */ + with_intel_runtime_pm(gt->uncore->rpm, wakeref) + __intel_gt_reset(gt, ALL_ENGINES); } void intel_gt_driver_release(struct intel_gt *gt) { struct i915_address_space *vm; - intel_wakeref_t wakeref; - - /* Scrub all HW state upon release */ - with_intel_runtime_pm(gt->uncore->rpm, wakeref) - __intel_gt_reset(gt, ALL_ENGINES); vm = fetch_and_zero(&gt->vm); if (vm) /* FIXME being called twice on error paths :( */ -- cgit v1.2.3 From 8a473dbadccfc6206150de3db3223c40785da348 Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Tue, 6 Oct 2020 10:25:07 +0100 Subject: drm/i915: Fix DMA mapped scatterlist walks When walking DMA mapped scatterlists, sg_dma_len has to be used since it can be different (coalesced) from the backing store entry. This also means we have to end the walk when encountering a zero length DMA entry and cannot rely on the normal sg list end marker. Both issues were there in theory for some time but were hidden by the fact that the Intel IOMMU driver was never coalescing entries. As there are ongoing efforts to change this, we need to start handling it. v2: * Use unsigned int for local storing sg_dma_len.
(Logan) Signed-off-by: Tvrtko Ursulin References: 85d1225ec066 ("drm/i915: Introduce & use new lightweight SGL iterators") References: b31144c0daa8 ("drm/i915: Micro-optimise gen6_ppgtt_insert_entries()") Reported-by: Tom Murphy Suggested-by: Tom Murphy # __sgt_iter Suggested-by: Logan Gunthorpe # __sgt_iter Cc: Joonas Lahtinen Cc: Chris Wilson Cc: Matthew Auld Cc: Lu Baolu Reviewed-by: Logan Gunthorpe Link: https://patchwork.freedesktop.org/patch/msgid/20201006092508.1064287-1-tvrtko.ursulin@linux.intel.com --- drivers/gpu/drm/i915/gt/gen6_ppgtt.c | 6 +++--- drivers/gpu/drm/i915/gt/gen8_ppgtt.c | 17 ++++++++++------- drivers/gpu/drm/i915/gt/intel_gtt.h | 2 +- drivers/gpu/drm/i915/i915_scatterlist.h | 12 ++++++++---- 4 files changed, 22 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/gen6_ppgtt.c b/drivers/gpu/drm/i915/gt/gen6_ppgtt.c index fd0d24d28763..c0d17f87b00f 100644 --- a/drivers/gpu/drm/i915/gt/gen6_ppgtt.c +++ b/drivers/gpu/drm/i915/gt/gen6_ppgtt.c @@ -131,17 +131,17 @@ static void gen6_ppgtt_insert_entries(struct i915_address_space *vm, vaddr = kmap_atomic_px(i915_pt_entry(pd, act_pt)); do { - GEM_BUG_ON(iter.sg->length < I915_GTT_PAGE_SIZE); + GEM_BUG_ON(sg_dma_len(iter.sg) < I915_GTT_PAGE_SIZE); vaddr[act_pte] = pte_encode | GEN6_PTE_ADDR_ENCODE(iter.dma); iter.dma += I915_GTT_PAGE_SIZE; if (iter.dma == iter.max) { iter.sg = __sg_next(iter.sg); - if (!iter.sg) + if (!iter.sg || sg_dma_len(iter.sg) == 0) break; iter.dma = sg_dma_address(iter.sg); - iter.max = iter.dma + iter.sg->length; + iter.max = iter.dma + sg_dma_len(iter.sg); } if (++act_pte == GEN6_PTES) { diff --git a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c index eb64f474a78c..b236aa046f91 100644 --- a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c +++ b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c @@ -372,19 +372,19 @@ gen8_ppgtt_insert_pte(struct i915_ppgtt *ppgtt, pd = i915_pd_entry(pdp, gen8_pd_index(idx, 2)); vaddr = kmap_atomic_px(i915_pt_entry(pd, gen8_pd_index(idx, 1))); do { - GEM_BUG_ON(iter->sg->length < I915_GTT_PAGE_SIZE); + GEM_BUG_ON(sg_dma_len(iter->sg) < I915_GTT_PAGE_SIZE); vaddr[gen8_pd_index(idx, 0)] = pte_encode | iter->dma; iter->dma += I915_GTT_PAGE_SIZE; if (iter->dma >= iter->max) { iter->sg = __sg_next(iter->sg); - if (!iter->sg) { + if (!iter->sg || sg_dma_len(iter->sg) == 0) { idx = 0; break; } iter->dma = sg_dma_address(iter->sg); - iter->max = iter->dma + iter->sg->length; + iter->max = iter->dma + sg_dma_len(iter->sg); } if (gen8_pd_index(++idx, 0) == 0) { @@ -413,8 +413,8 @@ static void gen8_ppgtt_insert_huge(struct i915_vma *vma, u32 flags) { const gen8_pte_t pte_encode = gen8_pte_encode(0, cache_level, flags); + unsigned int rem = sg_dma_len(iter->sg); u64 start = vma->node.start; - dma_addr_t rem = iter->sg->length; GEM_BUG_ON(!i915_vm_is_4lvl(vma->vm)); @@ -456,7 +456,7 @@ static void gen8_ppgtt_insert_huge(struct i915_vma *vma, } do { - GEM_BUG_ON(iter->sg->length < page_size); + GEM_BUG_ON(sg_dma_len(iter->sg) < page_size); vaddr[index++] = encode | iter->dma; start += page_size; @@ -467,7 +467,10 @@ static void gen8_ppgtt_insert_huge(struct i915_vma *vma, if (!iter->sg) break; - rem = iter->sg->length; + rem = sg_dma_len(iter->sg); + if (!rem) + break; + iter->dma = sg_dma_address(iter->sg); iter->max = iter->dma + rem; @@ -525,7 +528,7 @@ static void gen8_ppgtt_insert_huge(struct i915_vma *vma, } vma->page_sizes.gtt |= page_size; - } while (iter->sg); + } while (iter->sg && sg_dma_len(iter->sg)); } static void gen8_ppgtt_insert(struct 
i915_address_space *vm, diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.h b/drivers/gpu/drm/i915/gt/intel_gtt.h index c13c650ced22..8a33940a71f3 100644 --- a/drivers/gpu/drm/i915/gt/intel_gtt.h +++ b/drivers/gpu/drm/i915/gt/intel_gtt.h @@ -580,7 +580,7 @@ static inline struct sgt_dma { struct scatterlist *sg = vma->pages->sgl; dma_addr_t addr = sg_dma_address(sg); - return (struct sgt_dma){ sg, addr, addr + sg->length }; + return (struct sgt_dma){ sg, addr, addr + sg_dma_len(sg) }; } #endif diff --git a/drivers/gpu/drm/i915/i915_scatterlist.h b/drivers/gpu/drm/i915/i915_scatterlist.h index b7b59328cb76..510856887628 100644 --- a/drivers/gpu/drm/i915/i915_scatterlist.h +++ b/drivers/gpu/drm/i915/i915_scatterlist.h @@ -27,13 +27,17 @@ static __always_inline struct sgt_iter { } __sgt_iter(struct scatterlist *sgl, bool dma) { struct sgt_iter s = { .sgp = sgl }; - if (s.sgp) { + if (dma && s.sgp && sg_dma_len(s.sgp) == 0) { + s.sgp = NULL; + } else if (s.sgp) { s.max = s.curr = s.sgp->offset; - s.max += s.sgp->length; - if (dma) + if (dma) { s.dma = sg_dma_address(s.sgp); - else + s.max += sg_dma_len(s.sgp); + } else { s.pfn = page_to_pfn(sg_page(s.sgp)); + s.max += s.sgp->length; + } } return s; -- cgit v1.2.3 From 934941ed5a3070a7833c688c9b1d71484fc01a68 Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Tue, 6 Oct 2020 10:25:08 +0100 Subject: drm/i915: Fix DMA mapped scatterlist lookup As the previous patch fixed the places where we walk the whole scatterlist for DMA addresses, this patch fixes the random lookup functionality. To achieve this we have to add a second lookup iterator and add an i915_gem_object_get_sg_dma helper, to be used analogously to the existing i915_gem_object_get_sg. Therefore two lookup caches are maintained per object and they are flushed at the same point for simplicity. (Strictly speaking the DMA cache should be flushed from i915_gem_gtt_finish_pages, but today this coincides with unsetting of the pages in general.) The partial VMA view is then fixed to use the new DMA lookup and properly query sg length. v2: * Checkpatch.
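To make the CPU/DMA split concrete, a hedged sketch of a DMA-side walk (count_dma_pages is illustrative, not a driver function): after mapping, the DMA view may be coalesced into fewer, longer entries, with trailing entries carrying sg_dma_len() == 0.

    /* Count pages of the DMA mapping of @sgl, stopping at the first
     * zero-length entry rather than at the CPU-side end marker.
     */
    static unsigned int count_dma_pages(struct scatterlist *sgl)
    {
            struct scatterlist *sg;
            unsigned int npages = 0;

            for (sg = sgl; sg; sg = sg_next(sg)) {
                    unsigned int len = sg_dma_len(sg);

                    if (!len) /* coalesced DMA mapping ends here */
                            break;

                    npages += len >> PAGE_SHIFT;
            }

            return npages;
    }

A lookup by DMA page index, as added here, must count with sg_dma_len() in exactly this way, while the existing lookup by backing-store page index keeps using sg->length.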
Signed-off-by: Tvrtko Ursulin Cc: Joonas Lahtinen Cc: Chris Wilson Cc: Matthew Auld Cc: Lu Baolu Cc: Tom Murphy Cc: Logan Gunthorpe Reviewed-by: Chris Wilson Link: https://patchwork.freedesktop.org/patch/msgid/20201006092508.1064287-2-tvrtko.ursulin@linux.intel.com --- drivers/gpu/drm/i915/gem/i915_gem_object.c | 2 ++ drivers/gpu/drm/i915/gem/i915_gem_object.h | 20 +++++++++++++++++++- drivers/gpu/drm/i915/gem/i915_gem_object_types.h | 17 ++++++++++------- drivers/gpu/drm/i915/gem/i915_gem_pages.c | 21 +++++++++++++-------- drivers/gpu/drm/i915/gt/intel_ggtt.c | 4 ++-- drivers/gpu/drm/i915/i915_scatterlist.h | 5 +++++ 6 files changed, 51 insertions(+), 18 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.c b/drivers/gpu/drm/i915/gem/i915_gem_object.c index c8421fd9d2dc..ffeaf1b9b1bb 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_object.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_object.c @@ -73,6 +73,8 @@ void i915_gem_object_init(struct drm_i915_gem_object *obj, obj->mm.madv = I915_MADV_WILLNEED; INIT_RADIX_TREE(&obj->mm.get_page.radix, GFP_KERNEL | __GFP_NOWARN); mutex_init(&obj->mm.get_page.lock); + INIT_RADIX_TREE(&obj->mm.get_dma_page.radix, GFP_KERNEL | __GFP_NOWARN); + mutex_init(&obj->mm.get_dma_page.lock); if (IS_ENABLED(CONFIG_LOCKDEP) && i915_gem_object_is_shrinkable(obj)) i915_gem_shrinker_taints_mutex(to_i915(obj->base.dev), diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.h b/drivers/gpu/drm/i915/gem/i915_gem_object.h index afde1952c119..3cad6a07d0a6 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_object.h +++ b/drivers/gpu/drm/i915/gem/i915_gem_object.h @@ -275,8 +275,26 @@ int i915_gem_object_set_tiling(struct drm_i915_gem_object *obj, unsigned int tiling, unsigned int stride); struct scatterlist * +__i915_gem_object_get_sg(struct drm_i915_gem_object *obj, + struct i915_gem_object_page_iter *iter, + unsigned int n, + unsigned int *offset); + +static inline struct scatterlist * i915_gem_object_get_sg(struct drm_i915_gem_object *obj, - unsigned int n, unsigned int *offset); + unsigned int n, + unsigned int *offset) +{ + return __i915_gem_object_get_sg(obj, &obj->mm.get_page, n, offset); +} + +static inline struct scatterlist * +i915_gem_object_get_sg_dma(struct drm_i915_gem_object *obj, + unsigned int n, + unsigned int *offset) +{ + return __i915_gem_object_get_sg(obj, &obj->mm.get_dma_page, n, offset); +} struct page * i915_gem_object_get_page(struct drm_i915_gem_object *obj, diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h index b5c15557cc87..fedfebf13344 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h +++ b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h @@ -80,6 +80,14 @@ struct i915_mmap_offset { struct rb_node offset; }; +struct i915_gem_object_page_iter { + struct scatterlist *sg_pos; + unsigned int sg_idx; /* in pages, but 32bit eek! */ + + struct radix_tree_root radix; + struct mutex lock; /* protects this cache */ +}; + struct drm_i915_gem_object { struct drm_gem_object base; @@ -246,13 +254,8 @@ struct drm_i915_gem_object { I915_SELFTEST_DECLARE(unsigned int page_mask); - struct i915_gem_object_page_iter { - struct scatterlist *sg_pos; - unsigned int sg_idx; /* in pages, but 32bit eek! 
*/ - - struct radix_tree_root radix; - struct mutex lock; /* protects this cache */ - } get_page; + struct i915_gem_object_page_iter get_page; + struct i915_gem_object_page_iter get_dma_page; /** * Element within i915->mm.unbound_list or i915->mm.bound_list, diff --git a/drivers/gpu/drm/i915/gem/i915_gem_pages.c b/drivers/gpu/drm/i915/gem/i915_gem_pages.c index ed31bb2b82de..138020b5edf7 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_pages.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_pages.c @@ -33,6 +33,8 @@ void __i915_gem_object_set_pages(struct drm_i915_gem_object *obj, obj->mm.get_page.sg_pos = pages->sgl; obj->mm.get_page.sg_idx = 0; + obj->mm.get_dma_page.sg_pos = pages->sgl; + obj->mm.get_dma_page.sg_idx = 0; obj->mm.pages = pages; @@ -155,6 +157,8 @@ static void __i915_gem_object_reset_page_iter(struct drm_i915_gem_object *obj) rcu_read_lock(); radix_tree_for_each_slot(slot, &obj->mm.get_page.radix, &iter, 0) radix_tree_delete(&obj->mm.get_page.radix, iter.index); + radix_tree_for_each_slot(slot, &obj->mm.get_dma_page.radix, &iter, 0) + radix_tree_delete(&obj->mm.get_dma_page.radix, iter.index); rcu_read_unlock(); } @@ -450,11 +454,12 @@ void __i915_gem_object_release_map(struct drm_i915_gem_object *obj) } struct scatterlist * -i915_gem_object_get_sg(struct drm_i915_gem_object *obj, - unsigned int n, - unsigned int *offset) +__i915_gem_object_get_sg(struct drm_i915_gem_object *obj, + struct i915_gem_object_page_iter *iter, + unsigned int n, + unsigned int *offset) { - struct i915_gem_object_page_iter *iter = &obj->mm.get_page; + const bool dma = iter == &obj->mm.get_dma_page; struct scatterlist *sg; unsigned int idx, count; @@ -483,7 +488,7 @@ i915_gem_object_get_sg(struct drm_i915_gem_object *obj, sg = iter->sg_pos; idx = iter->sg_idx; - count = __sg_page_count(sg); + count = dma ? __sg_dma_page_count(sg) : __sg_page_count(sg); while (idx + count <= n) { void *entry; @@ -511,7 +516,7 @@ i915_gem_object_get_sg(struct drm_i915_gem_object *obj, idx += count; sg = ____sg_next(sg); - count = __sg_page_count(sg); + count = dma ? __sg_dma_page_count(sg) : __sg_page_count(sg); } scan: @@ -529,7 +534,7 @@ scan: while (idx + count <= n) { idx += count; sg = ____sg_next(sg); - count = __sg_page_count(sg); + count = dma ? 
__sg_dma_page_count(sg) : __sg_page_count(sg); } *offset = n - idx; @@ -582,7 +587,7 @@ i915_gem_object_get_dma_address_len(struct drm_i915_gem_object *obj, struct scatterlist *sg; unsigned int offset; - sg = i915_gem_object_get_sg(obj, n, &offset); + sg = i915_gem_object_get_sg_dma(obj, n, &offset); if (len) *len = sg_dma_len(sg) - (offset << PAGE_SHIFT); diff --git a/drivers/gpu/drm/i915/gt/intel_ggtt.c b/drivers/gpu/drm/i915/gt/intel_ggtt.c index 33a3f627ddb1..5c8aab1f9bef 100644 --- a/drivers/gpu/drm/i915/gt/intel_ggtt.c +++ b/drivers/gpu/drm/i915/gt/intel_ggtt.c @@ -1383,7 +1383,7 @@ intel_partial_pages(const struct i915_ggtt_view *view, if (ret) goto err_sg_alloc; - iter = i915_gem_object_get_sg(obj, view->partial.offset, &offset); + iter = i915_gem_object_get_sg_dma(obj, view->partial.offset, &offset); GEM_BUG_ON(!iter); sg = st->sgl; @@ -1391,7 +1391,7 @@ intel_partial_pages(const struct i915_ggtt_view *view, do { unsigned int len; - len = min(iter->length - (offset << PAGE_SHIFT), + len = min(sg_dma_len(iter) - (offset << PAGE_SHIFT), count << PAGE_SHIFT); sg_set_page(sg, NULL, len, 0); sg_dma_address(sg) = diff --git a/drivers/gpu/drm/i915/i915_scatterlist.h b/drivers/gpu/drm/i915/i915_scatterlist.h index 510856887628..102d8d7007b6 100644 --- a/drivers/gpu/drm/i915/i915_scatterlist.h +++ b/drivers/gpu/drm/i915/i915_scatterlist.h @@ -48,6 +48,11 @@ static inline int __sg_page_count(const struct scatterlist *sg) return sg->length >> PAGE_SHIFT; } +static inline int __sg_dma_page_count(const struct scatterlist *sg) +{ + return sg_dma_len(sg) >> PAGE_SHIFT; +} + static inline struct scatterlist *____sg_next(struct scatterlist *sg) { ++sg; -- cgit v1.2.3 From bf9bd6a5128a2a366b5a13e916c1f76393ff7ba8 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 6 Oct 2020 10:46:53 +0100 Subject: drm/i915/gt: Track the most recent pulse for the heartbeat Since we track the idle_pulse for flushing the barriers and avoid re-emitting the pulse upon idling if no further action is required, this also impacts the heartbeat. Before emitting a fresh heartbeat, we look at the engine idle status and assume that if the pulse was the last request emitted along the heartbeat, the engine is idling and a heartbeat pulse is not required. This assumption fails, but we can reuse the idle pulse as the heartbeat if we are yet to emit one, and so track the status of that pulse for our engine health check. This impacts tgl/rcs0 as we rely on the heartbeat for our healthcheck, as the normal preemption detection mechanism is disabled by default.
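The bookkeeping amounts to the following hedged sketch (simplified names, not the driver's structures): remember whichever pulse was emitted last, so the health check always has a request to inspect.

    /* If the idle path emits the pulse before the heartbeat worker
     * does, adopt it as the current systole instead of assuming the
     * engine is idle.
     */
    static void idle_pulse_sketch(struct engine_sketch *e, struct rq_sketch *rq)
    {
            if (!e->systole && heartbeat_enabled(e))
                    e->systole = rq_get(rq); /* reuse the idle pulse */
    }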
Testcase: igt/gem_exec_schedule/preempt-hang/rcs0 #tgl Signed-off-by: Chris Wilson Cc: Tvrtko Ursulin Reviewed-by: Mika Kuoppala Link: https://patchwork.freedesktop.org/patch/msgid/20201006094653.7558-1-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c index 5067d0524d4b..9060385cd69e 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c @@ -41,6 +41,8 @@ static void idle_pulse(struct intel_engine_cs *engine, struct i915_request *rq) { engine->wakeref_serial = READ_ONCE(engine->serial) + 1; i915_request_add_active_barriers(rq); + if (!engine->heartbeat.systole && intel_engine_has_heartbeat(engine)) + engine->heartbeat.systole = i915_request_get(rq); } static void show_heartbeat(const struct i915_request *rq, @@ -144,8 +146,6 @@ static void heartbeat(struct work_struct *wrk) goto unlock; idle_pulse(engine, rq); - if (engine->i915->params.enable_hangcheck) - engine->heartbeat.systole = i915_request_get(rq); __i915_request_commit(rq); __i915_request_queue(rq, &attr); @@ -153,7 +153,7 @@ static void heartbeat(struct work_struct *wrk) unlock: mutex_unlock(&ce->timeline->mutex); out: - if (!next_heartbeat(engine)) + if (!engine->i915->params.enable_hangcheck || !next_heartbeat(engine)) i915_request_put(fetch_and_zero(&engine->heartbeat.systole)); intel_engine_pm_put(engine); } -- cgit v1.2.3 From 4d8a5cfe3b131f60903949f998c5961cc922e0b0 Mon Sep 17 00:00:00 2001 From: Ayaz A Siddiqui Date: Wed, 29 Jul 2020 15:55:39 +0530 Subject: drm/i915/gt: Initialize reserved and unspecified MOCS indices In order to avoid functional breakage of mis-programmed applications that have grown to depend on unused MOCS entries, we are programming those entries to be equal to the fully cached ("L3 + LLC") entry. These reserved and unspecified entries should not be used as they may be changed to less performant variants with better coherency in the future if more entries are needed. v2: As suggested by Lucas De Marchi, utilise __init_mocs_table for programming the default value, setting the I915_MOCS_PTE index of tgl_mocs_table to the desired value. Cc: Chris Wilson Cc: Lucas De Marchi Cc: Tomasz Lis Cc: Matt Roper Cc: Joonas Lahtinen Cc: Francisco Jerez Cc: Mathew Alwin Cc: Mcguire Russell W Cc: Spruit Neil R Cc: Zhou Cheng Cc: Benemelis Mike G Signed-off-by: Ayaz A Siddiqui Reviewed-by: Lucas De Marchi Acked-by: Joonas Lahtinen Signed-off-by: Chris Wilson Link: https://patchwork.freedesktop.org/patch/msgid/20200729102539.134731-2-ayaz.siddiqui@intel.com Cc: stable@vger.kernel.org --- drivers/gpu/drm/i915/gt/intel_mocs.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_mocs.c b/drivers/gpu/drm/i915/gt/intel_mocs.c index 632e08a4592b..b8f56e62158e 100644 --- a/drivers/gpu/drm/i915/gt/intel_mocs.c +++ b/drivers/gpu/drm/i915/gt/intel_mocs.c @@ -234,11 +234,17 @@ static const struct drm_i915_mocs_entry broxton_mocs_table[] = { L3_1_UC) static const struct drm_i915_mocs_entry tgl_mocs_table[] = { - /* Base - Error (Reserved for Non-Use) */ - MOCS_ENTRY(0, 0x0, 0x0), - /* Base - Reserved */ - MOCS_ENTRY(1, 0x0, 0x0), - + /* + * NOTE:
+ * These reserved entries should never be used, they may be changed + * to low performant variants with better coherency in the future if + * more entries are needed. We are programming index I915_MOCS_PTE(1) + * only, __init_mocs_table() take care to program unused index with + * this entry. + */ + MOCS_ENTRY(1, LE_3_WB | LE_TC_1_LLC | LE_LRUM(3), + L3_3_WB), GEN11_MOCS_ENTRIES, /* Implicitly enable L1 - HDC:L1 + L3 + LLC */ -- cgit v1.2.3 From d46b60a2e8d246f1f0faa38e52f4f5a73858c338 Mon Sep 17 00:00:00 2001 From: Ville Syrjälä Date: Thu, 15 Oct 2020 13:21:35 +0100 Subject: drm/i915: Mark ininitial fb obj as WT on eLLC machines to avoid rcu lockup during fbdev init MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently we leave the cache_level of the initial fb obj set to NONE. This means on eLLC machines the first pin_to_display() will try to switch it to WT which requires a vma unbind+bind. If that happens during the fbdev initialization rcu does not seem operational which causes the unbind to get stuck. To most appearances this looks like a dead machine on boot. Avoid the unbind by already marking the object cache_level as WT when creating it. We still do an excplicit ggtt pin which will rewrite the PTEs anyway, so they will match whatever cache level we set. Cc: # v5.7+ Suggested-by: Chris Wilson Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/2381 Signed-off-by: Ville Syrjälä Reviewed-by: Chris Wilson Signed-off-by: Chris Wilson Link: https://patchwork.freedesktop.org/patch/msgid/20201007120329.17076-1-ville.syrjala@linux.intel.com Link: https://patchwork.freedesktop.org/patch/msgid/20201015122138.30161-1-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/display/intel_display.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/gpu/drm/i915/display/intel_display.c b/drivers/gpu/drm/i915/display/intel_display.c index c8b1dd1a9e46..4d58a4710f2a 100644 --- a/drivers/gpu/drm/i915/display/intel_display.c +++ b/drivers/gpu/drm/i915/display/intel_display.c @@ -3433,6 +3433,14 @@ initial_plane_vma(struct drm_i915_private *i915, if (IS_ERR(obj)) return NULL; + /* + * Mark it WT ahead of time to avoid changing the + * cache_level during fbdev initialization. The + * unbind there would get stuck waiting for rcu. + */ + i915_gem_object_set_cache_coherency(obj, HAS_WT(i915) ? + I915_CACHE_WT : I915_CACHE_NONE); + switch (plane_config->tiling) { case I915_TILING_NONE: break; -- cgit v1.2.3 From 36b6b6816989cf6f468eea82694e83211a066fa4 Mon Sep 17 00:00:00 2001 From: Ville Syrjälä Date: Thu, 15 Oct 2020 13:21:36 +0100 Subject: drm/i915: Fix MOCS PTE setting for gen9+ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix up the MOCS PTE setting to really get the LLC cacheability from the PTE rather than hardocoding it to LLC or LLC+eLLC. 
Signed-off-by: Ville Syrjälä Reviewed-by: Chris Wilson Signed-off-by: Chris Wilson Link: https://patchwork.freedesktop.org/patch/msgid/20201007120329.17076-2-ville.syrjala@linux.intel.com Link: https://patchwork.freedesktop.org/patch/msgid/20201015122138.30161-2-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/gt/intel_mocs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_mocs.c b/drivers/gpu/drm/i915/gt/intel_mocs.c index b8f56e62158e..93970eb36406 100644 --- a/drivers/gpu/drm/i915/gt/intel_mocs.c +++ b/drivers/gpu/drm/i915/gt/intel_mocs.c @@ -124,7 +124,7 @@ struct drm_i915_mocs_table { LE_1_UC | LE_TC_2_LLC_ELLC, \ L3_1_UC), \ MOCS_ENTRY(I915_MOCS_PTE, \ - LE_0_PAGETABLE | LE_TC_2_LLC_ELLC | LE_LRUM(3), \ + LE_0_PAGETABLE | LE_TC_0_PAGETABLE | LE_LRUM(3), \ L3_3_WB) static const struct drm_i915_mocs_entry skl_mocs_table[] = { @@ -280,7 +280,7 @@ static const struct drm_i915_mocs_entry icl_mocs_table[] = { L3_1_UC), /* Base - L3 + LeCC:PAT (Deprecated) */ MOCS_ENTRY(I915_MOCS_PTE, - LE_0_PAGETABLE | LE_TC_1_LLC, + LE_0_PAGETABLE | LE_TC_0_PAGETABLE, L3_3_WB), GEN11_MOCS_ENTRIES -- cgit v1.2.3 From c0888e9e22623972a8468cd18f4b5b7c6db040ee Mon Sep 17 00:00:00 2001 From: Ville Syrjälä Date: Thu, 15 Oct 2020 13:21:37 +0100 Subject: drm/i915: Enable eLLC caching of display buffers for SKL+ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since SKL the eLLC has been sitting on the far side of the system agent, meaning the display engine can utilize it. Let's enable that. I chose WB for the caching mode, because my numbers are indicating that WT might actually be WB and WC might actually be UC. I'm not 100% sure that is indeed the case but at least my simple rendercopy based benchmark didn't see any difference in performance. Also, if I configure things to do LLCeLLC+WT I still get cache dirt on my screen, suggesting that is in fact operating in WB mode anyway. This is also the reason I had to fix the MOCS target cache to really say PTE rather than LLC+eLLC. Eero's earlier benchmark numbers: "* Results in GfxBench and Unigine (Valley/Heaven) tests were within daily variation on the tested SKL machines * SKL GT4e (128MB eLLC) / Wayland / Weston: +15-20% SynMark TexMem512 (512MB of textures) +4-6% SynMark TerrainFly*, CSCloth, ShMapVsm -5-10% SynMark TexMem128 (128MB of textures) * SKL GT3e (64MB eLLC) / Xorg / Unity: +4-8% GpuTest Triangle fullscreen (FullHD) -5-10% GpuTest Triangle windowed (1/2 screen) * SKL GT2 (no eLLC) / Xorg / Unity: * Some of the higher FPS SynMark pixel and vertex shader tests are a few percent higher, more than daily variance => Do you see any reason why this machine would be impacted although it doesn't have eLLC?"
Caveats: - Still haven't tested with a prime setup - Still not entirely sure this is a good idea, but I've been using it on my cfl anyway :) v2: Split the MOCS PTE change out Cc: Eero Tamminen Reviewed-by: Chris Wilson Signed-off-by: Ville Syrjälä Signed-off-by: Chris Wilson Link: https://patchwork.freedesktop.org/patch/msgid/20201007120329.17076-3-ville.syrjala@linux.intel.com Link: https://patchwork.freedesktop.org/patch/msgid/20201015122138.30161-3-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/gt/intel_gtt.c | 10 ++++++++-- drivers/gpu/drm/i915/i915_drv.h | 3 +-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.c b/drivers/gpu/drm/i915/gt/intel_gtt.c index 3f1114b58b01..7bfe9072be9a 100644 --- a/drivers/gpu/drm/i915/gt/intel_gtt.c +++ b/drivers/gpu/drm/i915/gt/intel_gtt.c @@ -324,7 +324,7 @@ static void cnl_setup_private_ppat(struct intel_uncore *uncore) GEN8_PPAT_WC | GEN8_PPAT_LLCELLC); intel_uncore_write(uncore, GEN10_PAT_INDEX(2), - GEN8_PPAT_WT | GEN8_PPAT_LLCELLC); + GEN8_PPAT_WB | GEN8_PPAT_ELLC_OVERRIDE); intel_uncore_write(uncore, GEN10_PAT_INDEX(3), GEN8_PPAT_UC); @@ -349,17 +349,23 @@ static void cnl_setup_private_ppat(struct intel_uncore *uncore) */ static void bdw_setup_private_ppat(struct intel_uncore *uncore) { + struct drm_i915_private *i915 = uncore->i915; u64 pat; pat = GEN8_PPAT(0, GEN8_PPAT_WB | GEN8_PPAT_LLC) | /* for normal objects, no eLLC */ GEN8_PPAT(1, GEN8_PPAT_WC | GEN8_PPAT_LLCELLC) | /* for something pointing to ptes? */ - GEN8_PPAT(2, GEN8_PPAT_WT | GEN8_PPAT_LLCELLC) | /* for scanout with eLLC */ GEN8_PPAT(3, GEN8_PPAT_UC) | /* Uncached objects, mostly for scanout */ GEN8_PPAT(4, GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(0)) | GEN8_PPAT(5, GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(1)) | GEN8_PPAT(6, GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(2)) | GEN8_PPAT(7, GEN8_PPAT_WB | GEN8_PPAT_LLCELLC | GEN8_PPAT_AGE(3)); + /* for scanout with eLLC */ + if (INTEL_GEN(i915) >= 9) + pat |= GEN8_PPAT(2, GEN8_PPAT_WB | GEN8_PPAT_ELLC_OVERRIDE); + else + pat |= GEN8_PPAT(2, GEN8_PPAT_WT | GEN8_PPAT_LLCELLC); + intel_uncore_write(uncore, GEN8_PRIVATE_PAT_LO, lower_32_bits(pat)); intel_uncore_write(uncore, GEN8_PRIVATE_PAT_HI, upper_32_bits(pat)); } diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 9cfc0eba4b12..539df8fbc84b 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1620,8 +1620,7 @@ extern const struct i915_rev_steppings kbl_revids[]; #define HAS_SNOOP(dev_priv) (INTEL_INFO(dev_priv)->has_snoop) #define HAS_EDRAM(dev_priv) ((dev_priv)->edram_size_mb) #define HAS_SECURE_BATCHES(dev_priv) (INTEL_GEN(dev_priv) < 6) -#define HAS_WT(dev_priv) ((IS_HASWELL(dev_priv) || \ - IS_BROADWELL(dev_priv)) && HAS_EDRAM(dev_priv)) +#define HAS_WT(dev_priv) HAS_EDRAM(dev_priv) #define HWS_NEEDS_PHYSICAL(dev_priv) (INTEL_INFO(dev_priv)->hws_needs_physical) -- cgit v1.2.3 From a04ac827366594c7244f60e9be79fcb404af69f0 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 15 Oct 2020 13:21:38 +0100 Subject: drm/i915/gt: Fixup tgl mocs for PTE tracking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Forcing mocs:1 [used for our winsys follows-pte mode] to be cached caused display glitches. Though it is documented as deprecated (and so likely behaves as uncached), use the follow-pte bit and force it out of L3 cache.
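Schematically, the entry this series converges on reads as follows (a restatement of the MOCS_ENTRY() from the patch below, with the field meanings spelled out as comments; the interpretation is hedged from the surrounding commit messages):

    MOCS_ENTRY(I915_MOCS_PTE,
               LE_0_PAGETABLE |    /* cacheability: follow the PTE */
               LE_TC_0_PAGETABLE,  /* target cache: follow the PTE */
               L3_1_UC);           /* and keep it out of L3 */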
Fixes: 4d8a5cfe3b13 ("drm/i915/gt: Initialize reserved and unspecified MOCS indices") Testcase: igt/kms_frontbuffer_tracking Testcase: igt/kms_big_fb Signed-off-by: Chris Wilson Cc: Ayaz A Siddiqui Cc: Lucas De Marchi Cc: Matt Roper Cc: Ville Syrjälä Cc: Joonas Lahtinen Reviewed-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20201015122138.30161-4-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/gt/intel_mocs.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_mocs.c b/drivers/gpu/drm/i915/gt/intel_mocs.c index 93970eb36406..1ade9583c3c1 100644 --- a/drivers/gpu/drm/i915/gt/intel_mocs.c +++ b/drivers/gpu/drm/i915/gt/intel_mocs.c @@ -243,8 +243,9 @@ static const struct drm_i915_mocs_entry tgl_mocs_table[] = { * only, __init_mocs_table() take care to program unused index with * this entry. */ - MOCS_ENTRY(1, LE_3_WB | LE_TC_1_LLC | LE_LRUM(3), - L3_3_WB), + MOCS_ENTRY(I915_MOCS_PTE, + LE_0_PAGETABLE | LE_TC_0_PAGETABLE, + L3_1_UC), GEN11_MOCS_ENTRIES, /* Implicitly enable L1 - HDC:L1 + L3 + LLC */ -- cgit v1.2.3 From 57b2d834bf235daab388c3ba12d035c820ae09c6 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 15 Oct 2020 12:59:54 +0100 Subject: drm/i915/gem: Support parsing of oversize batches Matthew Auld noted that on more recent systems (such as the parser for gen9) we may have objects that are larger than expected by the GEM uAPI (i.e. greater than u32). These objects would have incorrect implicit batch lengths, causing the parser to reject them for being incomplete, or worse. Based on a patch by Matthew Auld. Reported-by: Matthew Auld Fixes: 435e8fc059db ("drm/i915: Allow parsing of unsized batches") Testcase: igt/gem_exec_params/larger-than-life-batch Signed-off-by: Chris Wilson Cc: Matthew Auld Cc: Mika Kuoppala Cc: Jon Bloomfield Reviewed-by: Matthew Auld Cc: stable@vger.kernel.org Link: https://patchwork.freedesktop.org/patch/msgid/20201015115954.871-1-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c index b7c4cacc3638..6bd6fc988694 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c @@ -287,8 +287,8 @@ struct i915_execbuffer { u64 invalid_flags; /** Set of execobj.flags that are invalid */ u32 context_flags; /** Set of execobj.flags to insert from the ctx */ + u64 batch_len; /** Length of batch within object */ u32 batch_start_offset; /** Location within object of batch */ - u32 batch_len; /** Length of batch within object */ u32 batch_flags; /** Flags composed for emit_bb_start() */ struct intel_gt_buffer_pool_node *batch_pool; /** pool node for batch buffer */ @@ -871,6 +871,10 @@ static int eb_lookup_vmas(struct i915_execbuffer *eb) if (eb->batch_len == 0) eb->batch_len = eb->batch->vma->size - eb->batch_start_offset; + if (unlikely(eb->batch_len == 0)) { /* impossible! 
*/ drm_dbg(&i915->drm, "Invalid batch length\n"); + return -EINVAL; + } return 0; @@ -2424,7 +2428,7 @@ static int eb_parse(struct i915_execbuffer *eb) struct drm_i915_private *i915 = eb->i915; struct intel_gt_buffer_pool_node *pool = eb->batch_pool; struct i915_vma *shadow, *trampoline, *batch; - unsigned int len; + unsigned long len; int err; if (!eb_use_cmdparser(eb)) { @@ -2449,6 +2453,8 @@ static int eb_parse(struct i915_execbuffer *eb) } else { len += I915_CMD_PARSER_TRAMPOLINE_SIZE; } + if (unlikely(len < eb->batch_len)) /* last paranoid check of overflow */ + return -EINVAL; if (!pool) { pool = intel_gt_get_buffer_pool(eb->engine->gt, len); -- cgit v1.2.3 From 6ca7217dffaf1abba91558e67a2efb655ac91405 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 15 Oct 2020 20:50:23 +0100 Subject: drm/i915/gt: Delay execlist processing for tgl When running gem_exec_nop, it floods the system with many requests (with the goal of userspace submitting faster than the HW can process a single empty batch). This causes the driver to continually resubmit new requests onto the end of an active context, a flood of lite-restore preemptions. If we time this just right, Tigerlake hangs. Inserting a small delay between the processing of CS events and submitting the next context prevents the hang. Naturally it does not occur with debugging enabled. The suspicion then is that this is related to the issues with the CS event buffer, and inserting an mmio read of the CS pointer status appears to be very successful in preventing the hang. Other registers, or uncached reads, or plain mb, do not prevent the hang, suggesting that register is key -- but that the hang can be prevented by a simple udelay suggests it is just a timing issue like that encountered by commit 233c1ae3c83f ("drm/i915/gt: Wait for CSB entries on Tigerlake"). Also note that the hang is not prevented by applying CTX_DESC_FORCE_RESTORE, or by inserting a delay on the GPU between requests. Signed-off-by: Chris Wilson Cc: Mika Kuoppala Cc: Bruce Chang Cc: Joonas Lahtinen Cc: stable@vger.kernel.org Acked-by: Mika Kuoppala Link: https://patchwork.freedesktop.org/patch/msgid/20201015195023.32346-1-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/gt/intel_lrc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c index 287537089c77..0a5b8ab742c7 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.c +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -2707,6 +2707,9 @@ static void process_csb(struct intel_engine_cs *engine) smp_wmb(); /* complete the seqlock */ WRITE_ONCE(execlists->active, execlists->inflight); + /* XXX Magic delay for tgl */ + ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR); + WRITE_ONCE(execlists->pending[0], NULL); } else { if (GEM_WARN_ON(!*execlists->active)) { -- cgit v1.2.3 From bb65548e3c6e299175a9e8c3e24b2b9577656a5d Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 2 Oct 2020 09:34:25 +0100 Subject: drm/i915/gt: Undo forced context restores after trivial preemptions We may try to preempt the currently executing request, only to find that, after unravelling all the dependencies, the original executing context is still the earliest in the topological sort and is re-submitted back to HW (if we do detect some change in the ELSP that requires re-submission). However, due to the way we check for wrap-around during the unravelling, we mark any context that has been submitted just once (i.e.
with the rq->wa_tail set, but the ring->tail earlier) as potentially wrapping and requiring a forced restore on resubmission. This was expected not to be a problem, as it was anticipated that most unwinding for preemption would result in a context switch and the few that did not would be lost in the noise. It did not take long for someone to find one particular workload where the cost of those extra context restores was measurable. However, since we know the wa_tail is of fixed size, and we know that a request must be larger than the wa_tail itself, we can safely maintain the check for request wrapping and check against a slightly future point in the ring that includes an expected wa_tail. (That is, if the ring->tail is already set to rq->wa_tail, including another 8 bytes in the check does not invalidate the incremental wrap detection.) Fixes: 8ab3a3812aa9 ("drm/i915/gt: Incrementally check for rewinding") Signed-off-by: Chris Wilson Cc: Mika Kuoppala Cc: Bruce Chang Cc: Ramalingam C Cc: Joonas Lahtinen Cc: # v5.4+ Reviewed-by: Mika Kuoppala Link: https://patchwork.freedesktop.org/patch/msgid/20201002083425.4605-1-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/gt/intel_lrc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c index 0a5b8ab742c7..5f5553c7107b 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.c +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -1140,9 +1140,8 @@ __unwind_incomplete_requests(struct intel_engine_cs *engine) /* Check in case we rollback so far we wrap [size/2] */ if (intel_ring_direction(rq->ring, - intel_ring_wrap(rq->ring, - rq->tail), - rq->ring->tail) > 0) + rq->tail, + rq->ring->tail + 8) > 0) rq->context->lrc.desc |= CTX_DESC_FORCE_RESTORE; active = rq; -- cgit v1.2.3 From 6971e07b6b0cf7dd74fb98d97011dbe4546dd2cf Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 13 Oct 2020 12:08:45 +0100 Subject: drm/i915/gt: Cleanup kasan warning for on-stack (unsigned long) casting Kasan is giving a warning for passing a u32 parameter into find_first_bit (casting to an unsigned long *, with appropriate length restrictions): [ 44.678262] BUG: KASAN: stack-out-of-bounds in find_first_bit+0x2e/0x50 [ 44.678295] Read of size 8 at addr ffff888233f4fc30 by task core_hotunplug/474 [ 44.678326] [ 44.678358] CPU: 0 PID: 474 Comm: core_hotunplug Not tainted 5.9.0+ #608 [ 44.678465] Hardware name: BESSTAR (HK) LIMITED GN41/Default string, BIOS BLT-BI-MINIPC-F4G-EX3R110-GA65A-101-D 10/12/2018 [ 44.678500] Call Trace: [ 44.678534] dump_stack+0x84/0xba [ 44.678569] print_address_description.constprop.0+0x21/0x220 [ 44.678605] ? kmsg_dump_rewind_nolock+0x5f/0x5f [ 44.678638] ? _raw_spin_lock_irqsave+0x6d/0xb0 [ 44.678669] ? _raw_write_lock_irqsave+0xb0/0xb0 [ 44.678702] ? set_task_cpu+0x1e0/0x1e0 [ 44.678733] ? find_first_bit+0x2e/0x50 [ 44.678763] kasan_report.cold+0x20/0x42 [ 44.678794] ? find_first_bit+0x2e/0x50 [ 44.678825] __asan_load8+0x69/0x90 [ 44.678856] find_first_bit+0x2e/0x50 [ 44.679027] __caps_show.isra.0+0x9e/0x1f0 [i915] Since we are only using the shorter type for our own convenience, accommodate kasan and use unsigned long.
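The underlying rule, as a hedged sketch (first_cap is illustrative; the real code iterates the capability names): the bitmap helpers load whole unsigned long words, so give them at least a word of storage rather than the address of a narrower on-stack variable.

    /* Copy the u32 capabilities into a full word before handing its
     * address to find_first_bit(), which may read BITS_PER_LONG bits
     * in a single access.
     */
    static unsigned int first_cap(const struct intel_engine_cs *engine)
    {
            unsigned long caps = engine->uabi_capabilities;

            return find_first_bit(&caps, BITS_PER_LONG);
    }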
Signed-off-by: Chris Wilson Cc: Tvrtko Ursulin Reviewed-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20201013110845.16127-1-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/gt/sysfs_engines.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/sysfs_engines.c b/drivers/gpu/drm/i915/gt/sysfs_engines.c index 535cc1169e54..967031056202 100644 --- a/drivers/gpu/drm/i915/gt/sysfs_engines.c +++ b/drivers/gpu/drm/i915/gt/sysfs_engines.c @@ -79,14 +79,12 @@ static ssize_t repr_trim(char *buf, ssize_t len) static ssize_t __caps_show(struct intel_engine_cs *engine, - u32 caps, char *buf, bool show_unknown) + unsigned long caps, char *buf, bool show_unknown) { const char * const *repr; int count, n; ssize_t len; - BUILD_BUG_ON(!typecheck(typeof(caps), engine->uabi_capabilities)); - switch (engine->class) { case VIDEO_DECODE_CLASS: repr = vcs_caps; @@ -103,12 +101,10 @@ __caps_show(struct intel_engine_cs *engine, count = 0; break; } - GEM_BUG_ON(count > BITS_PER_TYPE(typeof(caps))); + GEM_BUG_ON(count > BITS_PER_LONG); len = 0; - for_each_set_bit(n, - (unsigned long *)&caps, - show_unknown ? BITS_PER_TYPE(typeof(caps)) : count) { + for_each_set_bit(n, &caps, show_unknown ? BITS_PER_LONG : count) { if (n >= count || !repr[n]) { if (GEM_WARN_ON(show_unknown)) len += snprintf(buf + len, PAGE_SIZE - len, -- cgit v1.2.3 From 89db95377be4cc4b895c27e0b5300a410fc076a6 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 15 Oct 2020 20:08:16 +0100 Subject: drm/i915/gt: Confirm the context survives execution Repeat our sanity checks from before execution to after execution. One expects that if we were to see these, the GPU would already be on fire, but the timing may be informative. Signed-off-by: Chris Wilson Reviewed-by: Tvrtko Ursulin Reviewed-by: Mika Kuoppala Link: https://patchwork.freedesktop.org/patch/msgid/20201015190816.31763-1-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/gt/intel_engine_pm.c | 37 +++++++++++++++++++++---------- drivers/gpu/drm/i915/gt/intel_lrc.c | 12 +++++++--- 2 files changed, 34 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c b/drivers/gpu/drm/i915/gt/intel_engine_pm.c index f7b2e07e2229..499b09cb4acf 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_pm.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c @@ -17,6 +17,25 @@ #include "intel_ring.h" #include "shmem_utils.h" +static void dbg_poison_ce(struct intel_context *ce) +{ + if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) + return; + + if (ce->state) { + struct drm_i915_gem_object *obj = ce->state->obj; + int type = i915_coherent_map_type(ce->engine->i915); + void *map; + + map = i915_gem_object_pin_map(obj, type); + if (!IS_ERR(map)) { + memset(map, CONTEXT_REDZONE, obj->base.size); + i915_gem_object_flush_map(obj); + i915_gem_object_unpin_map(obj); + } + } +} + static int __engine_unpark(struct intel_wakeref *wf) { struct intel_engine_cs *engine = @@ -32,20 +51,14 @@ static int __engine_unpark(struct intel_wakeref *wf) if (ce) { GEM_BUG_ON(test_bit(CONTEXT_VALID_BIT, &ce->flags)); + /* Flush all pending HW writes before we touch the context */ + while (unlikely(intel_context_inflight(ce))) + intel_engine_flush_submission(engine); + /* First poison the image to verify we never fully trust it */ - if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM) && ce->state) { - struct drm_i915_gem_object *obj = ce->state->obj; - int type = i915_coherent_map_type(engine->i915); - void *map; - - map = i915_gem_object_pin_map(obj, type); - 
if (!IS_ERR(map)) { - memset(map, CONTEXT_REDZONE, obj->base.size); - i915_gem_object_flush_map(obj); - i915_gem_object_unpin_map(obj); - } - } + dbg_poison_ce(ce); + /* Scrub the context image after our loss of control */ ce->ops->reset(ce); } diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c index 5f5553c7107b..c22d47cc6701 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.c +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -1215,7 +1215,8 @@ static void intel_engine_context_out(struct intel_engine_cs *engine) static void execlists_check_context(const struct intel_context *ce, - const struct intel_engine_cs *engine) + const struct intel_engine_cs *engine, + const char *when) { const struct intel_ring *ring = ce->ring; u32 *regs = ce->lrc_reg_state; @@ -1250,7 +1251,7 @@ execlists_check_context(const struct intel_context *ce, valid = false; } - WARN_ONCE(!valid, "Invalid lrc state found before submission\n"); + WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when); } static void restore_default_state(struct intel_context *ce, @@ -1346,7 +1347,7 @@ __execlists_schedule_in(struct i915_request *rq) reset_active(rq, engine); if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) - execlists_check_context(ce, engine); + execlists_check_context(ce, engine, "before"); if (ce->tag) { /* Use a fixed tag for OA and friends */ @@ -1417,6 +1418,9 @@ __execlists_schedule_out(struct i915_request *rq, * refrain from doing non-trivial work here. */ + if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) + execlists_check_context(ce, engine, "after"); + /* * If we have just completed this context, the engine may now be * idle and we want to re-enter powersaving. */ @@ -4080,6 +4084,8 @@ static void reset_csb_pointers(struct intel_engine_cs *engine) static void execlists_sanitize(struct intel_engine_cs *engine) { + GEM_BUG_ON(execlists_active(&engine->execlists)); + /* * Poison residual state on resume, in case the suspend didn't! * -- cgit v1.2.3 From 178536b8292ecd118f59d2fac4509c7e70b99854 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 16 Oct 2020 10:25:27 +0100 Subject: drm/i915: Use the active reference on the vma while capturing During error capture, we need to take a reference to the vma from before the reset in order to capture the contents of the vma later. Currently we are using both an active reference and a kref, but due to the nature of the i915_vma reference handling, that kref is on the vma->obj and not the vma itself. This means the vma may be destroyed as soon as it is idle, that is in between the i915_active_release(&vma->active) and the i915_vma_put(vma): <3> [197.866181] BUG: KASAN: use-after-free in intel_engine_coredump_add_vma+0x36c/0x4a0 [i915] <3> [197.866339] Read of size 8 at addr ffff8881258cb800 by task gem_exec_captur/1041 <3> [197.866467] <4> [197.866512] CPU: 2 PID: 1041 Comm: gem_exec_captur Not tainted 5.9.0-g5e4234f97efba-kasan_200+ #1 <4> [197.866521] Hardware name: Intel Corp. Broxton P/Apollolake RVP1A, BIOS APLKRVPA.X64.0150.B11.1608081044 08/08/2016 <4> [197.866530] Call Trace: <4> [197.866549] dump_stack+0x99/0xd0 <4> [197.866760] ? intel_engine_coredump_add_vma+0x36c/0x4a0 [i915] <4> [197.866783] print_address_description.constprop.8+0x3e/0x60 <4> [197.866797] ? kmsg_dump_rewind_nolock+0xd4/0xd4 <4> [197.866819] ? lockdep_hardirqs_off+0xd4/0x120 <4> [197.867037] ? intel_engine_coredump_add_vma+0x36c/0x4a0 [i915] <4> [197.867249] ? intel_engine_coredump_add_vma+0x36c/0x4a0 [i915] <4> [197.867270] kasan_report.cold.10+0x1f/0x37 <4> [197.867492] ?
intel_engine_coredump_add_vma+0x36c/0x4a0 [i915] <4> [197.867710] intel_engine_coredump_add_vma+0x36c/0x4a0 [i915] <4> [197.867949] i915_gpu_coredump.part.29+0x150/0x7b0 [i915] <4> [197.868186] i915_capture_error_state+0x5e/0xc0 [i915] <4> [197.868396] intel_gt_handle_error+0x6eb/0xa20 [i915] <4> [197.868624] ? intel_gt_reset_global+0x370/0x370 [i915] <4> [197.868644] ? check_flags+0x50/0x50 <4> [197.868662] ? __lock_acquire+0xd59/0x6b00 <4> [197.868678] ? register_lock_class+0x1ad0/0x1ad0 <4> [197.868944] i915_wedged_set+0xcf/0x1b0 [i915] <4> [197.869147] ? i915_wedged_get+0x90/0x90 [i915] <4> [197.869371] ? i915_wedged_get+0x90/0x90 [i915] <4> [197.869398] simple_attr_write+0x153/0x1c0 <4> [197.869428] full_proxy_write+0xee/0x180 <4> [197.869442] ? __sb_start_write+0x1f3/0x310 <4> [197.869465] vfs_write+0x1a3/0x640 <4> [197.869492] ksys_write+0xec/0x1c0 <4> [197.869507] ? __ia32_sys_read+0xa0/0xa0 <4> [197.869525] ? lockdep_hardirqs_on_prepare+0x32b/0x4e0 <4> [197.869541] ? syscall_enter_from_user_mode+0x1c/0x50 <4> [197.869566] do_syscall_64+0x33/0x80 <4> [197.869579] entry_SYSCALL_64_after_hwframe+0x44/0xa9 <4> [197.869590] RIP: 0033:0x7fd8b7aee281 <4> [197.869604] Code: c3 0f 1f 84 00 00 00 00 00 48 8b 05 59 8d 20 00 c3 0f 1f 84 00 00 00 00 00 8b 05 8a d1 20 00 85 c0 75 16 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 57 f3 c3 0f 1f 44 00 00 41 54 55 49 89 d4 53 <4> [197.869613] RSP: 002b:00007ffea3b72008 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 <4> [197.869625] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fd8b7aee281 <4> [197.869633] RDX: 0000000000000002 RSI: 00007fd8b81a82e7 RDI: 000000000000000d <4> [197.869641] RBP: 0000000000000002 R08: 0000000000000000 R09: 0000000000000034 <4> [197.869650] R10: 0000000000000000 R11: 0000000000000246 R12: 00007fd8b81a82e7 <4> [197.869658] R13: 000000000000000d R14: 0000000000000000 R15: 0000000000000000 <3> [197.869707] <3> [197.869757] Allocated by task 1041: <4> [197.869833] kasan_save_stack+0x19/0x40 <4> [197.869843] __kasan_kmalloc.constprop.5+0xc1/0xd0 <4> [197.869853] kmem_cache_alloc+0x106/0x8e0 <4> [197.870059] i915_vma_instance+0x212/0x1930 [i915] <4> [197.870270] eb_lookup_vmas+0xe06/0x1d10 [i915] <4> [197.870475] i915_gem_do_execbuffer+0x131d/0x4080 [i915] <4> [197.870682] i915_gem_execbuffer2_ioctl+0x103/0x5d0 [i915] <4> [197.870701] drm_ioctl_kernel+0x1d2/0x270 <4> [197.870710] drm_ioctl+0x40d/0x85c <4> [197.870721] __x64_sys_ioctl+0x10d/0x170 <4> [197.870731] do_syscall_64+0x33/0x80 <4> [197.870740] entry_SYSCALL_64_after_hwframe+0x44/0xa9 <3> [197.870748] <3> [197.870798] Freed by task 22: <4> [197.870865] kasan_save_stack+0x19/0x40 <4> [197.870875] kasan_set_track+0x1c/0x30 <4> [197.870884] kasan_set_free_info+0x1b/0x30 <4> [197.870894] __kasan_slab_free+0x111/0x160 <4> [197.870903] kmem_cache_free+0xcd/0x710 <4> [197.871109] i915_vma_parked+0x618/0x800 [i915] <4> [197.871307] __gt_park+0xdb/0x1e0 [i915] <4> [197.871501] ____intel_wakeref_put_last+0xb1/0x190 [i915] <4> [197.871516] process_one_work+0x8dc/0x15d0 <4> [197.871525] worker_thread+0x82/0xb30 <4> [197.871535] kthread+0x36d/0x440 <4> [197.871545] ret_from_fork+0x22/0x30 <3> [197.871553] <3> [197.871602] The buggy address belongs to the object at ffff8881258cb740 which belongs to the cache i915_vma of size 968 Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/2553 Fixes: 2850748ef876 ("drm/i915: Pull i915_vma_pin under the vm->mutex") Signed-off-by: Chris Wilson Cc: Mika Kuoppala Cc: Tvrtko Ursulin Cc: Joonas Lahtinen Cc: # v5.5+ Reviewed-by: Matthew 
Auld Link: https://patchwork.freedesktop.org/patch/msgid/20201016092527.29039-1-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_gpu_error.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 41a82855d046..ae370909e8dd 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -1317,7 +1317,7 @@ capture_vma(struct intel_engine_capture_vma *next, } strcpy(c->name, name); - c->vma = i915_vma_get(vma); + c->vma = vma; /* reference held while active */ c->next = next; return c; @@ -1407,7 +1407,6 @@ intel_engine_coredump_add_vma(struct intel_engine_coredump *ee, compress)); i915_active_release(&vma->active); - i915_vma_put(vma); capture = this->next; kfree(this); -- cgit v1.2.3 From fa812ce96a46efc27cae4dcad866aaee9cb25d28 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 19 Oct 2020 09:34:44 +0100 Subject: drm/i915/gt: Onion unwind for scratch page allocation failure In switching to using objects for our ppGTT scratch pages, care was not taken to avoid trying to unref NULL objects on failure. And for gen6 ppGTT, it appears we forgot entirely to unwind after a partial allocation failure. Fixes: 89351925a477 ("drm/i915/gt: Switch to object allocations for page directories") Signed-off-by: Chris Wilson Cc: Matthew Auld Cc: Joonas Lahtinen Reviewed-by: Matthew Auld Link: https://patchwork.freedesktop.org/patch/msgid/20201019083444.1286-1-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/gt/gen6_ppgtt.c | 18 ++++++++++++------ drivers/gpu/drm/i915/gt/gen8_ppgtt.c | 3 ++- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/gen6_ppgtt.c b/drivers/gpu/drm/i915/gt/gen6_ppgtt.c index c0d17f87b00f..680bd9442eb0 100644 --- a/drivers/gpu/drm/i915/gt/gen6_ppgtt.c +++ b/drivers/gpu/drm/i915/gt/gen6_ppgtt.c @@ -239,18 +239,24 @@ static int gen6_ppgtt_init_scratch(struct gen6_ppgtt *ppgtt) I915_CACHE_NONE, PTE_READ_ONLY); vm->scratch[1] = vm->alloc_pt_dma(vm, I915_GTT_PAGE_SIZE_4K); - if (IS_ERR(vm->scratch[1])) - return PTR_ERR(vm->scratch[1]); + if (IS_ERR(vm->scratch[1])) { + ret = PTR_ERR(vm->scratch[1]); + goto err_scratch0; + } ret = pin_pt_dma(vm, vm->scratch[1]); - if (ret) { - i915_gem_object_put(vm->scratch[1]); - return ret; - } + if (ret) + goto err_scratch1; fill32_px(vm->scratch[1], vm->scratch[0]->encode); return 0; + +err_scratch1: + i915_gem_object_put(vm->scratch[1]); +err_scratch0: + i915_gem_object_put(vm->scratch[0]); + return ret; } static void gen6_ppgtt_free_pd(struct gen6_ppgtt *ppgtt) diff --git a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c index b236aa046f91..a37c968ef8f7 100644 --- a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c +++ b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c @@ -607,7 +607,8 @@ static int gen8_init_scratch(struct i915_address_space *vm) return 0; free_scratch: - free_scratch(vm); + while (i--) + i915_gem_object_put(vm->scratch[i]); return -ENOMEM; } -- cgit v1.2.3 From d7085b0faac8330b67e1b10ff2dadd8af6b2564e Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 19 Oct 2020 17:50:04 +0100 Subject: drm/i915/gem: Poison stolen pages before use When allocating objects from stolen, memset() the backing store to POISON_INUSE (0x5a) to help identify any uninitialised use of a stolen object. 
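
The mechanism in miniature (a sketch only, not the patch itself; vaddr and size stand in for a CPU-visible mapping of the freshly allocated stolen range):

	#include <linux/poison.h>

	/*
	 * Scrub the backing store so that any consumer reading the object
	 * before its first write sees the telltale 0x5a (POISON_INUSE)
	 * pattern rather than plausible-looking stale data.
	 */
	memset_io(vaddr, POISON_INUSE, size);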
Signed-off-by: Chris Wilson Reviewed-by: Mika Kuoppala Link: https://patchwork.freedesktop.org/patch/msgid/20201019165005.18128-1-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/gem/i915_gem_stolen.c | 45 ++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_stolen.c b/drivers/gpu/drm/i915/gem/i915_gem_stolen.c index e0f21f12d3ce..9e0e3cd616fd 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_stolen.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_stolen.c @@ -495,6 +495,40 @@ static int i915_gem_init_stolen(struct drm_i915_private *i915) return 0; } +static void dbg_poison(struct i915_ggtt *ggtt, + dma_addr_t addr, resource_size_t size, + u8 x) +{ +#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM) + if (!drm_mm_node_allocated(&ggtt->error_capture)) + return; + + GEM_BUG_ON(!IS_ALIGNED(size, PAGE_SIZE)); + + mutex_lock(&ggtt->error_mutex); + while (size) { + void __iomem *s; + + ggtt->vm.insert_page(&ggtt->vm, addr, + ggtt->error_capture.start, + I915_CACHE_NONE, 0); + mb(); + + s = io_mapping_map_wc(&ggtt->iomap, + ggtt->error_capture.start, + PAGE_SIZE); + memset_io(s, x, PAGE_SIZE); + io_mapping_unmap(s); + + addr += PAGE_SIZE; + size -= PAGE_SIZE; + } + mb(); + ggtt->vm.clear_range(&ggtt->vm, ggtt->error_capture.start, PAGE_SIZE); + mutex_unlock(&ggtt->error_mutex); +#endif +} + static struct sg_table * i915_pages_create_for_stolen(struct drm_device *dev, resource_size_t offset, resource_size_t size) @@ -538,6 +572,11 @@ static int i915_gem_object_get_pages_stolen(struct drm_i915_gem_object *obj) if (IS_ERR(pages)) return PTR_ERR(pages); + dbg_poison(&to_i915(obj->base.dev)->ggtt, + sg_dma_address(pages->sgl), + sg_dma_len(pages->sgl), + POISON_INUSE); + __i915_gem_object_set_pages(obj, pages, obj->stolen->size); return 0; @@ -547,6 +586,12 @@ static void i915_gem_object_put_pages_stolen(struct drm_i915_gem_object *obj, struct sg_table *pages) { /* Should only be called from i915_gem_object_release_stolen() */ + + dbg_poison(&to_i915(obj->base.dev)->ggtt, + sg_dma_address(pages->sgl), + sg_dma_len(pages->sgl), + POISON_FREE); + sg_free_table(pages); kfree(pages); } -- cgit v1.2.3 From d3606757e611fbd48bb239e8c2fe9779b3f50035 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 19 Oct 2020 17:50:05 +0100 Subject: drm/i915: Exclude low pages (128KiB) of stolen from use The GPU is trashing the low pages of its reserved memory upon reset. If we are using this memory for ringbuffers, then we will dutifully resubmit the trashed rings after the reset, causing further resets and worse. We must exclude this range from our own use. The value of 128KiB was found by empirical measurement (and verified now with a selftest) on gen9.
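
Distilled from the diff below (a summary sketch, not new code): every default stolen allocation is now biased above the clobbered range.

	/* The low 128KiB of stolen is trashed by the GPU on reset */
	#define I915_GEM_STOLEN_BIAS SZ_128K

	/* Default allocations start at the bias rather than at offset 0 */
	i915_gem_stolen_insert_node_in_range(i915, node, size, alignment,
					     I915_GEM_STOLEN_BIAS, U64_MAX);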
Signed-off-by: Chris Wilson Cc: stable@vger.kernel.org Reviewed-by: Mika Kuoppala Link: https://patchwork.freedesktop.org/patch/msgid/20201019165005.18128-2-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/Kconfig.debug | 1 + drivers/gpu/drm/i915/gem/i915_gem_stolen.c | 6 +- drivers/gpu/drm/i915/gem/i915_gem_stolen.h | 2 + drivers/gpu/drm/i915/gt/selftest_reset.c | 196 +++++++++++++++++++++++++++++ 4 files changed, 203 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/Kconfig.debug b/drivers/gpu/drm/i915/Kconfig.debug index 1cb28c20807c..25cd9788a4d5 100644 --- a/drivers/gpu/drm/i915/Kconfig.debug +++ b/drivers/gpu/drm/i915/Kconfig.debug @@ -153,6 +153,7 @@ config DRM_I915_SELFTEST select DRM_EXPORT_FOR_TESTS if m select FAULT_INJECTION select PRIME_NUMBERS + select CRC32 help Choose this option to allow the driver to perform selftests upon loading; also requires the i915.selftest=1 module parameter. To diff --git a/drivers/gpu/drm/i915/gem/i915_gem_stolen.c b/drivers/gpu/drm/i915/gem/i915_gem_stolen.c index 9e0e3cd616fd..78f64a2724e2 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_stolen.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_stolen.c @@ -53,8 +53,10 @@ int i915_gem_stolen_insert_node(struct drm_i915_private *i915, struct drm_mm_node *node, u64 size, unsigned alignment) { - return i915_gem_stolen_insert_node_in_range(i915, node, size, - alignment, 0, U64_MAX); + return i915_gem_stolen_insert_node_in_range(i915, node, + size, alignment, + I915_GEM_STOLEN_BIAS, + U64_MAX); } void i915_gem_stolen_remove_node(struct drm_i915_private *i915, diff --git a/drivers/gpu/drm/i915/gem/i915_gem_stolen.h b/drivers/gpu/drm/i915/gem/i915_gem_stolen.h index e15c0adad8af..61e028063f9f 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_stolen.h +++ b/drivers/gpu/drm/i915/gem/i915_gem_stolen.h @@ -30,4 +30,6 @@ i915_gem_object_create_stolen_for_preallocated(struct drm_i915_private *dev_priv resource_size_t stolen_offset, resource_size_t size); +#define I915_GEM_STOLEN_BIAS SZ_128K + #endif /* __I915_GEM_STOLEN_H__ */ diff --git a/drivers/gpu/drm/i915/gt/selftest_reset.c b/drivers/gpu/drm/i915/gt/selftest_reset.c index 35406ecdf0b2..ef5aeebbeeb0 100644 --- a/drivers/gpu/drm/i915/gt/selftest_reset.c +++ b/drivers/gpu/drm/i915/gt/selftest_reset.c @@ -3,9 +3,203 @@ * Copyright © 2018 Intel Corporation */ +#include <linux/crc32.h> + +#include "gem/i915_gem_stolen.h" + +#include "i915_memcpy.h" #include "i915_selftest.h" #include "selftests/igt_reset.h" #include "selftests/igt_atomic.h" +#include "selftests/igt_spinner.h" + +static int +__igt_reset_stolen(struct intel_gt *gt, + intel_engine_mask_t mask, + const char *msg) +{ + struct i915_ggtt *ggtt = &gt->i915->ggtt; + const struct resource *dsm = &gt->i915->dsm; + resource_size_t num_pages, page; + struct intel_engine_cs *engine; + intel_wakeref_t wakeref; + enum intel_engine_id id; + struct igt_spinner spin; + long max, count; + void *tmp; + u32 *crc; + int err; + + if (!drm_mm_node_allocated(&ggtt->error_capture)) + return 0; + + num_pages = resource_size(dsm) >> PAGE_SHIFT; + if (!num_pages) + return 0; + + crc = kmalloc_array(num_pages, sizeof(u32), GFP_KERNEL); + if (!crc) + return -ENOMEM; + + tmp = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!tmp) { + err = -ENOMEM; + goto err_crc; + } + + igt_global_reset_lock(gt); + wakeref = intel_runtime_pm_get(gt->uncore->rpm); + + err = igt_spinner_init(&spin, gt); + if (err) + goto err_lock; + + for_each_engine(engine, gt, id) { + struct intel_context *ce; + struct i915_request *rq; + + if (!(mask & engine->mask)) + continue; + 
+ if (!intel_engine_can_store_dword(engine)) + continue; + + ce = intel_context_create(engine); + if (IS_ERR(ce)) { + err = PTR_ERR(ce); + goto err_spin; + } + rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK); + intel_context_put(ce); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto err_spin; + } + i915_request_add(rq); + } + + for (page = 0; page < num_pages; page++) { + dma_addr_t dma = (dma_addr_t)dsm->start + (page << PAGE_SHIFT); + void __iomem *s; + void *in; + + ggtt->vm.insert_page(&ggtt->vm, dma, + ggtt->error_capture.start, + I915_CACHE_NONE, 0); + mb(); + + s = io_mapping_map_wc(&ggtt->iomap, + ggtt->error_capture.start, + PAGE_SIZE); + + if (!__drm_mm_interval_first(&gt->i915->mm.stolen, + page << PAGE_SHIFT, + ((page + 1) << PAGE_SHIFT) - 1)) + memset32(s, STACK_MAGIC, PAGE_SIZE / sizeof(u32)); + + in = s; + if (i915_memcpy_from_wc(tmp, s, PAGE_SIZE)) + in = tmp; + crc[page] = crc32_le(0, in, PAGE_SIZE); + + io_mapping_unmap(s); + } + mb(); + ggtt->vm.clear_range(&ggtt->vm, ggtt->error_capture.start, PAGE_SIZE); + + if (mask == ALL_ENGINES) { + intel_gt_reset(gt, mask, NULL); + } else { + for_each_engine(engine, gt, id) { + if (mask & engine->mask) + intel_engine_reset(engine, NULL); + } + } + + max = -1; + count = 0; + for (page = 0; page < num_pages; page++) { + dma_addr_t dma = (dma_addr_t)dsm->start + (page << PAGE_SHIFT); + void __iomem *s; + void *in; + u32 x; + + ggtt->vm.insert_page(&ggtt->vm, dma, + ggtt->error_capture.start, + I915_CACHE_NONE, 0); + mb(); + + s = io_mapping_map_wc(&ggtt->iomap, + ggtt->error_capture.start, + PAGE_SIZE); + + in = s; + if (i915_memcpy_from_wc(tmp, s, PAGE_SIZE)) + in = tmp; + x = crc32_le(0, in, PAGE_SIZE); + + if (x != crc[page] && + !__drm_mm_interval_first(&gt->i915->mm.stolen, + page << PAGE_SHIFT, + ((page + 1) << PAGE_SHIFT) - 1)) { + pr_debug("unused stolen page %pa modified by GPU reset\n", + &page); + if (count++ == 0) + igt_hexdump(in, PAGE_SIZE); + max = page; + } + + io_mapping_unmap(s); + } + mb(); + ggtt->vm.clear_range(&ggtt->vm, ggtt->error_capture.start, PAGE_SIZE); + + if (count > 0) { + pr_info("%s reset clobbered %ld pages of stolen, last clobber at page %ld\n", + msg, count, max); + } + if (max >= I915_GEM_STOLEN_BIAS >> PAGE_SHIFT) { + pr_err("%s reset clobbered unreserved area [above %x] of stolen; may cause severe faults\n", + msg, I915_GEM_STOLEN_BIAS); + err = -EINVAL; + } + +err_spin: + igt_spinner_fini(&spin); + +err_lock: + intel_runtime_pm_put(gt->uncore->rpm, wakeref); + igt_global_reset_unlock(gt); + + kfree(tmp); +err_crc: + kfree(crc); + return err; +} + +static int igt_reset_device_stolen(void *arg) +{ + return __igt_reset_stolen(arg, ALL_ENGINES, "device"); +} + +static int igt_reset_engines_stolen(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + enum intel_engine_id id; + int err; + + if (!intel_has_reset_engine(gt)) + return 0; + + for_each_engine(engine, gt, id) { + err = __igt_reset_stolen(gt, engine->mask, engine->name); + if (err) + return err; + } + + return 0; +} static int igt_global_reset(void *arg) { @@ -164,6 +358,8 @@ int intel_reset_live_selftests(struct drm_i915_private *i915) { static const struct i915_subtest tests[] = { SUBTEST(igt_global_reset), /* attempt to recover GPU first */ + SUBTEST(igt_reset_device_stolen), + SUBTEST(igt_reset_engines_stolen), SUBTEST(igt_wedged_reset), SUBTEST(igt_atomic_reset), SUBTEST(igt_atomic_engine_reset), -- cgit v1.2.3 From 8f2b4b684ae530cf7623cc1df22f05460b84cc5e Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: 
Mon, 19 Oct 2020 15:28:41 +0100 Subject: drm/i915/selftests: Flush the old heartbeat more gently In order to test how fast the heartbeat can respond, we measure with the interval set to its minimum. Before we measure though, we want to be sure we start with a fresh pulse, and so wait until any old one is complete. During that wait, however, we were continually flushing the work, and so continually re-evaluating to see if the pulse was complete, and each attempt would count as an unresponsive system. If the engine did not complete the request within the couple of busy-spins, it would flag an error. This is unfortunate, so let's not busy-spin waiting for the old heartbeat, but terminate it and start afresh. Signed-off-by: Chris Wilson Reviewed-by: Mika Kuoppala Link: https://patchwork.freedesktop.org/patch/msgid/20201019142841.32273-1-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/gt/selftest_engine_heartbeat.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/selftest_engine_heartbeat.c b/drivers/gpu/drm/i915/gt/selftest_engine_heartbeat.c index e73854dd2fe0..b88aa35ad75b 100644 --- a/drivers/gpu/drm/i915/gt/selftest_engine_heartbeat.c +++ b/drivers/gpu/drm/i915/gt/selftest_engine_heartbeat.c @@ -215,16 +215,17 @@ static int __live_heartbeat_fast(struct intel_engine_cs *engine) goto err_pm; for (i = 0; i < ARRAY_SIZE(times); i++) { - /* Manufacture a tick */ do { - while (READ_ONCE(engine->heartbeat.systole)) - flush_delayed_work(&engine->heartbeat.work); + /* Manufacture a tick */ + intel_engine_park_heartbeat(engine); + GEM_BUG_ON(engine->heartbeat.systole); + engine->serial++; /* pretend we are not idle! */ + intel_engine_unpark_heartbeat(engine); - engine->serial++; /* quick, pretend we are not idle! */ flush_delayed_work(&engine->heartbeat.work); if (!delayed_work_pending(&engine->heartbeat.work)) { - pr_err("%s: heartbeat did not start\n", - engine->name); + pr_err("%s: heartbeat %d did not start\n", + engine->name, i); err = -EINVAL; goto err_pm; } -- cgit v1.2.3 From 83ebef47f8ebe320d5c5673db82f9903a4f40a69 Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Wed, 21 Oct 2020 11:36:06 +0100 Subject: drm/i915/region: fix max size calculation We are incorrectly limiting the max allocation size as per the mm max_order, which is effectively the largest power-of-two that we can fit in the region size. However, it's normal to set up the region or allocator with a non-power-of-two size (for example 3G), which we should already handle correctly, except it seems for the early too-big check. v2: make sure we also exercise the I915_BO_ALLOC_CONTIGUOUS path, which is quite different, since for that we are actually limited by the largest power-of-two that we can fit within the region size.
(Chris) Fixes: b908be543e44 ("drm/i915: support creating LMEM objects") Signed-off-by: Matthew Auld Cc: Chris Wilson Cc: CQ Tang Reviewed-by: Chris Wilson Signed-off-by: Chris Wilson Link: https://patchwork.freedesktop.org/patch/msgid/20201021103606.241395-1-matthew.auld@intel.com --- drivers/gpu/drm/i915/intel_memory_region.c | 2 +- .../gpu/drm/i915/selftests/intel_memory_region.c | 77 ++++++++++++++++++++++ drivers/gpu/drm/i915/selftests/mock_region.c | 2 +- 3 files changed, 79 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_memory_region.c b/drivers/gpu/drm/i915/intel_memory_region.c index 6b5e9d88646d..180e1078ef7c 100644 --- a/drivers/gpu/drm/i915/intel_memory_region.c +++ b/drivers/gpu/drm/i915/intel_memory_region.c @@ -87,7 +87,7 @@ __intel_memory_region_get_pages_buddy(struct intel_memory_region *mem, min_order = ilog2(size) - ilog2(mem->mm.chunk_size); } - if (size > BIT(mem->mm.max_order) * mem->mm.chunk_size) + if (size > mem->mm.size) return -E2BIG; n_pages = size >> ilog2(mem->mm.chunk_size); diff --git a/drivers/gpu/drm/i915/selftests/intel_memory_region.c b/drivers/gpu/drm/i915/selftests/intel_memory_region.c index 93a38a323584..4829c47659db 100644 --- a/drivers/gpu/drm/i915/selftests/intel_memory_region.c +++ b/drivers/gpu/drm/i915/selftests/intel_memory_region.c @@ -261,6 +261,82 @@ err_close_objects: return err; } +static int igt_mock_splintered_region(void *arg) +{ + struct intel_memory_region *mem = arg; + struct drm_i915_private *i915 = mem->i915; + struct drm_i915_gem_object *obj; + unsigned int expected_order; + LIST_HEAD(objects); + u64 size; + int err = 0; + + /* + * Sanity check we can still allocate everything even if the + * mm.max_order != mm.size. i.e our starting address space size is not a + * power-of-two. + */ + + size = (SZ_4G - 1) & PAGE_MASK; + mem = mock_region_create(i915, 0, size, PAGE_SIZE, 0); + if (IS_ERR(mem)) + return PTR_ERR(mem); + + if (mem->mm.size != size) { + pr_err("%s size mismatch(%llu != %llu)\n", + __func__, mem->mm.size, size); + err = -EINVAL; + goto out_put; + } + + expected_order = get_order(rounddown_pow_of_two(size)); + if (mem->mm.max_order != expected_order) { + pr_err("%s order mismatch(%u != %u)\n", + __func__, mem->mm.max_order, expected_order); + err = -EINVAL; + goto out_put; + } + + obj = igt_object_create(mem, &objects, size, 0); + if (IS_ERR(obj)) { + err = PTR_ERR(obj); + goto out_close; + } + + close_objects(mem, &objects); + + /* + * While we should be able allocate everything without any flag + * restrictions, if we consider I915_BO_ALLOC_CONTIGUOUS then we are + * actually limited to the largest power-of-two for the region size i.e + * max_order, due to the inner workings of the buddy allocator. So make + * sure that does indeed hold true. 
+ */ + + obj = igt_object_create(mem, &objects, size, I915_BO_ALLOC_CONTIGUOUS); + if (!IS_ERR(obj)) { + pr_err("%s too large contiguous allocation was not rejected\n", + __func__); + err = -EINVAL; + goto out_close; + } + + obj = igt_object_create(mem, &objects, rounddown_pow_of_two(size), + I915_BO_ALLOC_CONTIGUOUS); + if (IS_ERR(obj)) { + pr_err("%s largest possible contiguous allocation failed\n", + __func__); + err = PTR_ERR(obj); + goto out_close; + } + +out_close: + close_objects(mem, &objects); +out_put: + intel_memory_region_put(mem); + return err; +} + static int igt_gpu_write_dw(struct intel_context *ce, struct i915_vma *vma, u32 dword, @@ -771,6 +847,7 @@ int intel_memory_region_mock_selftests(void) static const struct i915_subtest tests[] = { SUBTEST(igt_mock_fill), SUBTEST(igt_mock_contiguous), + SUBTEST(igt_mock_splintered_region), }; struct intel_memory_region *mem; struct drm_i915_private *i915; diff --git a/drivers/gpu/drm/i915/selftests/mock_region.c b/drivers/gpu/drm/i915/selftests/mock_region.c index 09660f5a0a4c..979d96f27c43 100644 --- a/drivers/gpu/drm/i915/selftests/mock_region.c +++ b/drivers/gpu/drm/i915/selftests/mock_region.c @@ -24,7 +24,7 @@ mock_object_create(struct intel_memory_region *mem, struct drm_i915_private *i915 = mem->i915; struct drm_i915_gem_object *obj; - if (size > BIT(mem->mm.max_order) * mem->mm.chunk_size) + if (size > mem->mm.size) return ERR_PTR(-E2BIG); obj = i915_gem_object_alloc(); -- cgit v1.2.3 From 44c2200afcd59f441b43f27829b4003397cc495d Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 19 Oct 2020 21:38:25 +0100 Subject: drm/i915/gem: Flush coherency domains on first set-domain-ioctl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Avoid skipping what appears to be a no-op set-domain-ioctl if the cache coherency state is inconsistent with our target domain. This also has the utility of using the population of the pages to validate the backing store. The danger in skipping the first set-domain is leaving the cache inconsistent and submitting stale data, or worse leaving the clean data in the cache and not flushing it to the GPU. The impact should be small as it requires a no-op set-domain as the very first ioctl in a particular sequence not found in typical userspace. Reported-by: Zbigniew Kempczyński Fixes: 754a25442705 ("drm/i915: Skip object locking around a no-op set-domain ioctl") Testcase: igt/gem_mmap_offset/blt-coherency Signed-off-by: Chris Wilson Cc: Joonas Lahtinen Cc: Matthew Auld Cc: Zbigniew Kempczyński Cc: # v5.2+ Reviewed-by: Matthew Auld Link: https://patchwork.freedesktop.org/patch/msgid/20201019203825.10966-1-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/gem/i915_gem_domain.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_domain.c b/drivers/gpu/drm/i915/gem/i915_gem_domain.c index 7c90a63c273d..fcce6909f201 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_domain.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_domain.c @@ -508,21 +508,6 @@ i915_gem_set_domain_ioctl(struct drm_device *dev, void *data, if (!obj) return -ENOENT; - /* - * Already in the desired write domain? Nothing for us to do! - * - * We apply a little bit of cunning here to catch a broader set of - * no-ops. If obj->write_domain is set, we must be in the same - * obj->read_domains, and only that domain. 
Therefore, if that - * obj->write_domain matches the request read_domains, we are - * already in the same read/write domain and can skip the operation, - * without having to further check the requested write_domain. - */ - if (READ_ONCE(obj->write_domain) == read_domains) { - err = 0; - goto out; - } - /* * Try to flush the object off the GPU without holding the lock. * We will repeat the flush holding the lock in the normal manner @@ -560,6 +545,19 @@ i915_gem_set_domain_ioctl(struct drm_device *dev, void *data, if (err) goto out; + /* + * Already in the desired write domain? Nothing for us to do! + * + * We apply a little bit of cunning here to catch a broader set of + * no-ops. If obj->write_domain is set, we must be in the same + * obj->read_domains, and only that domain. Therefore, if that + * obj->write_domain matches the request read_domains, we are + * already in the same read/write domain and can skip the operation, + * without having to further check the requested write_domain. + */ + if (READ_ONCE(obj->write_domain) == read_domains) + goto out_unpin; + err = i915_gem_object_lock_interruptible(obj, NULL); if (err) goto out_unpin; -- cgit v1.2.3 From b00bccb3f0bb7e99052791eeffb3e2cfcfa1ae8e Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Tue, 20 Oct 2020 11:08:21 +0100 Subject: drm/i915/pmu: Handle PCI unbind Mark the device as closed and keep references to driver data alive to allow for safe driver unbind with active PMU clients. Perf core does not otherwise handle this case so we have to do it manually like this. Signed-off-by: Tvrtko Ursulin Reviewed-by: Chris Wilson Link: https://patchwork.freedesktop.org/patch/msgid/20201020100822.543332-1-tvrtko.ursulin@linux.intel.com --- drivers/gpu/drm/i915/i915_pmu.c | 39 +++++++++++++++++++++++++++++++++++++-- drivers/gpu/drm/i915/i915_pmu.h | 4 ++++ 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c index 056994224c6b..39f584de239b 100644 --- a/drivers/gpu/drm/i915/i915_pmu.c +++ b/drivers/gpu/drm/i915/i915_pmu.c @@ -445,6 +445,8 @@ static void i915_pmu_event_destroy(struct perf_event *event) container_of(event->pmu, typeof(*i915), pmu.base); drm_WARN_ON(&i915->drm, event->parent); + + drm_dev_put(&i915->drm); } static int @@ -510,8 +512,12 @@ static int i915_pmu_event_init(struct perf_event *event) { struct drm_i915_private *i915 = container_of(event->pmu, typeof(*i915), pmu.base); + struct i915_pmu *pmu = &i915->pmu; int ret; + if (pmu->closed) + return -ENODEV; + if (event->attr.type != event->pmu->type) return -ENOENT; @@ -536,8 +542,10 @@ static int i915_pmu_event_init(struct perf_event *event) if (ret) return ret; - if (!event->parent) + if (!event->parent) { + drm_dev_get(&i915->drm); event->destroy = i915_pmu_event_destroy; + } return 0; } @@ -594,9 +602,16 @@ static u64 __i915_pmu_event_read(struct perf_event *event) static void i915_pmu_event_read(struct perf_event *event) { + struct drm_i915_private *i915 = + container_of(event->pmu, typeof(*i915), pmu.base); struct hw_perf_event *hwc = &event->hw; + struct i915_pmu *pmu = &i915->pmu; u64 prev, new; + if (pmu->closed) { + event->hw.state = PERF_HES_STOPPED; + return; + } again: prev = local64_read(&hwc->prev_count); new = __i915_pmu_event_read(event); @@ -724,6 +739,13 @@ static void i915_pmu_disable(struct perf_event *event) static void i915_pmu_event_start(struct perf_event *event, int flags) { + struct drm_i915_private *i915 = + container_of(event->pmu, typeof(*i915), pmu.base); + struct 
i915_pmu *pmu = &i915->pmu; + + if (pmu->closed) + return; + i915_pmu_enable(event); event->hw.state = 0; } @@ -738,6 +760,13 @@ static void i915_pmu_event_stop(struct perf_event *event, int flags) static int i915_pmu_event_add(struct perf_event *event, int flags) { + struct drm_i915_private *i915 = + container_of(event->pmu, typeof(*i915), pmu.base); + struct i915_pmu *pmu = &i915->pmu; + + if (pmu->closed) + return -ENODEV; + if (flags & PERF_EF_START) i915_pmu_event_start(event, flags); @@ -1167,7 +1196,13 @@ void i915_pmu_unregister(struct drm_i915_private *i915) if (!pmu->base.event_init) return; - drm_WARN_ON(&i915->drm, pmu->enable); + /* + * "Disconnect" the PMU callbacks - since all are atomic synchronize_rcu + * ensures all currently executing ones will have exited before we + * proceed with unregistration. + */ + pmu->closed = true; + synchronize_rcu(); hrtimer_cancel(&pmu->timer); diff --git a/drivers/gpu/drm/i915/i915_pmu.h b/drivers/gpu/drm/i915/i915_pmu.h index 941f0c14037c..59a0d19afb67 100644 --- a/drivers/gpu/drm/i915/i915_pmu.h +++ b/drivers/gpu/drm/i915/i915_pmu.h @@ -49,6 +49,10 @@ struct i915_pmu { * @base: PMU base. */ struct pmu base; + /** + * @closed: i915 is unregistering. + */ + bool closed; /** * @name: Name as registered with perf core. */ -- cgit v1.2.3 From 537f9c84a42754f89977bc2d19f2f69503a3a02a Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Tue, 20 Oct 2020 17:11:44 +0100 Subject: drm/i915/pmu: Fix CPU hotplug with multiple GPUs Since we keep a driver-global mask of online CPUs and base the decision on whether the PMU needs to be migrated upon it, we need to make sure the migration is done for all registered PMUs (so GPUs). To do this we need to track the current CPU for each PMU and base the decision on whether to migrate on a comparison between the global and local state. At the same time, since dynamic CPU hotplug notification slots are a scarce resource and given how we already register the multi instance type state, we can and should add multiple instances of the i915 PMU to this same state and not allocate a new one for every GPU. v2: * Use pr_notice. (Chris) v3: * Handle a nasty interaction where unregistration triggers a false CPU offline event. 
(Chris) Signed-off-by: Tvrtko Ursulin Suggested-by: Daniel Vetter # dynamic slot optimisation Cc: Chris Wilson Reviewed-by: Chris Wilson Link: https://patchwork.freedesktop.org/patch/msgid/20201020161144.678668-1-tvrtko.ursulin@linux.intel.com --- drivers/gpu/drm/i915/i915_pci.c | 7 ++++- drivers/gpu/drm/i915/i915_pmu.c | 57 ++++++++++++++++++++++++++--------------- drivers/gpu/drm/i915/i915_pmu.h | 6 ++++- 3 files changed, 48 insertions(+), 22 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c index 366ddfc8df6b..629e19209da0 100644 --- a/drivers/gpu/drm/i915/i915_pci.c +++ b/drivers/gpu/drm/i915/i915_pci.c @@ -1129,9 +1129,13 @@ static int __init i915_init(void) return 0; } + i915_pmu_init(); + err = pci_register_driver(&i915_pci_driver); - if (err) + if (err) { + i915_pmu_exit(); return err; + } i915_perf_sysctl_register(); return 0; @@ -1145,6 +1149,7 @@ static void __exit i915_exit(void) i915_perf_sysctl_unregister(); pci_unregister_driver(&i915_pci_driver); i915_globals_exit(); + i915_pmu_exit(); } module_init(i915_init); diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c index 39f584de239b..5c010bc147aa 100644 --- a/drivers/gpu/drm/i915/i915_pmu.c +++ b/drivers/gpu/drm/i915/i915_pmu.c @@ -30,6 +30,7 @@ #define ENGINE_SAMPLE_BITS (1 << I915_PMU_SAMPLE_BITS) static cpumask_t i915_pmu_cpumask; +static unsigned int i915_pmu_target_cpu = -1; static u8 engine_config_sample(u64 config) { @@ -1049,25 +1050,39 @@ static int i915_pmu_cpu_online(unsigned int cpu, struct hlist_node *node) static int i915_pmu_cpu_offline(unsigned int cpu, struct hlist_node *node) { struct i915_pmu *pmu = hlist_entry_safe(node, typeof(*pmu), cpuhp.node); - unsigned int target; + unsigned int target = i915_pmu_target_cpu; GEM_BUG_ON(!pmu->base.event_init); + /* + * Unregistering an instance generates a CPU offline event which we must + * ignore to avoid incorrectly modifying the shared i915_pmu_cpumask. + */ + if (pmu->closed) + return 0; + if (cpumask_test_and_clear_cpu(cpu, &i915_pmu_cpumask)) { target = cpumask_any_but(topology_sibling_cpumask(cpu), cpu); + /* Migrate events if there is a valid target */ if (target < nr_cpu_ids) { cpumask_set_cpu(target, &i915_pmu_cpumask); - perf_pmu_migrate_context(&pmu->base, cpu, target); + i915_pmu_target_cpu = target; } } + if (target < nr_cpu_ids && target != pmu->cpuhp.cpu) { + perf_pmu_migrate_context(&pmu->base, cpu, target); + pmu->cpuhp.cpu = target; + } + return 0; } -static int i915_pmu_register_cpuhp_state(struct i915_pmu *pmu) +static enum cpuhp_state cpuhp_slot = CPUHP_INVALID; + +void i915_pmu_init(void) { - enum cpuhp_state slot; int ret; ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, @@ -1075,27 +1090,29 @@ static int i915_pmu_register_cpuhp_state(struct i915_pmu *pmu) i915_pmu_cpu_online, i915_pmu_cpu_offline); if (ret < 0) - return ret; + pr_notice("Failed to setup cpuhp state for i915 PMU! 
(%d)\n", + ret); + else + cpuhp_slot = ret; +} - slot = ret; - ret = cpuhp_state_add_instance(slot, &pmu->cpuhp.node); - if (ret) { - cpuhp_remove_multi_state(slot); - return ret; - } +void i915_pmu_exit(void) +{ + if (cpuhp_slot != CPUHP_INVALID) + cpuhp_remove_multi_state(cpuhp_slot); +} - pmu->cpuhp.slot = slot; - return 0; +static int i915_pmu_register_cpuhp_state(struct i915_pmu *pmu) +{ + if (cpuhp_slot == CPUHP_INVALID) + return -EINVAL; + + return cpuhp_state_add_instance(cpuhp_slot, &pmu->cpuhp.node); } static void i915_pmu_unregister_cpuhp_state(struct i915_pmu *pmu) { - struct drm_i915_private *i915 = container_of(pmu, typeof(*i915), pmu); - - drm_WARN_ON(&i915->drm, pmu->cpuhp.slot == CPUHP_INVALID); - drm_WARN_ON(&i915->drm, cpuhp_state_remove_instance(pmu->cpuhp.slot, &pmu->cpuhp.node)); - cpuhp_remove_multi_state(pmu->cpuhp.slot); - pmu->cpuhp.slot = CPUHP_INVALID; + cpuhp_state_remove_instance(cpuhp_slot, &pmu->cpuhp.node); } static bool is_igp(struct drm_i915_private *i915) @@ -1129,7 +1146,7 @@ void i915_pmu_register(struct drm_i915_private *i915) spin_lock_init(&pmu->lock); hrtimer_init(&pmu->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); pmu->timer.function = i915_sample; - pmu->cpuhp.slot = CPUHP_INVALID; + pmu->cpuhp.cpu = -1; if (!is_igp(i915)) { pmu->name = kasprintf(GFP_KERNEL, diff --git a/drivers/gpu/drm/i915/i915_pmu.h b/drivers/gpu/drm/i915/i915_pmu.h index 59a0d19afb67..a24885ab415c 100644 --- a/drivers/gpu/drm/i915/i915_pmu.h +++ b/drivers/gpu/drm/i915/i915_pmu.h @@ -43,7 +43,7 @@ struct i915_pmu { */ struct { struct hlist_node node; - enum cpuhp_state slot; + unsigned int cpu; } cpuhp; /** * @base: PMU base. @@ -126,11 +126,15 @@ struct i915_pmu { }; #ifdef CONFIG_PERF_EVENTS +void i915_pmu_init(void); +void i915_pmu_exit(void); void i915_pmu_register(struct drm_i915_private *i915); void i915_pmu_unregister(struct drm_i915_private *i915); void i915_pmu_gt_parked(struct drm_i915_private *i915); void i915_pmu_gt_unparked(struct drm_i915_private *i915); #else +static inline void i915_pmu_init(void) {} +static inline void i915_pmu_exit(void) {} static inline void i915_pmu_register(struct drm_i915_private *i915) {} static inline void i915_pmu_unregister(struct drm_i915_private *i915) {} static inline void i915_pmu_gt_parked(struct drm_i915_private *i915) {} -- cgit v1.2.3 From b1cff585784edf2ab03edb63376a7543a9c50644 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 22 Oct 2020 22:08:14 +0100 Subject: drm/i915/selftests: Skip RPS tests on Ironlake (only IPS) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since Ironlake uses intel_ips.ko for its dynamic frequency adjustment, we do not have direct control over the frequency management so such tests are defunct. Similarly, we can't check the gen6+ RPS registers on Ironlake. Hopefully this catches all the invalid tests now that Ironlake has rejoined the dynamic GPU frequency club. There is an opportunity for the reader to add tests to exercise MEMINTRSTS and co. 
Signed-off-by: Chris Wilson Reviewed-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20201022210814.23004-1-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/gt/selftest_rps.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/selftest_rps.c b/drivers/gpu/drm/i915/gt/selftest_rps.c index 3540ba9bd459..aa5675ecb5cc 100644 --- a/drivers/gpu/drm/i915/gt/selftest_rps.c +++ b/drivers/gpu/drm/i915/gt/selftest_rps.c @@ -219,7 +219,7 @@ int live_rps_clock_interval(void *arg) struct igt_spinner spin; int err = 0; - if (!intel_rps_is_enabled(rps)) + if (!intel_rps_is_enabled(rps) || INTEL_GEN(gt->i915) < 6) return 0; if (igt_spinner_init(&spin, gt)) @@ -1028,7 +1028,7 @@ int live_rps_interrupt(void *arg) * First, let's check whether or not we are receiving interrupts. */ - if (!intel_rps_has_interrupts(rps)) + if (!intel_rps_has_interrupts(rps) || INTEL_GEN(gt->i915) < 6) return 0; intel_gt_pm_get(gt); @@ -1133,7 +1133,7 @@ int live_rps_power(void *arg) * that theory. */ - if (!intel_rps_is_enabled(rps)) + if (!intel_rps_is_enabled(rps) || INTEL_GEN(gt->i915) < 6) return 0; if (!librapl_energy_uJ()) @@ -1237,7 +1237,7 @@ int live_rps_dynamic(void *arg) * moving parts into dynamic reclocking based on load. */ - if (!intel_rps_is_enabled(rps)) + if (!intel_rps_is_enabled(rps) || INTEL_GEN(gt->i915) < 6) return 0; if (igt_spinner_init(&spin, gt)) -- cgit v1.2.3 From c10f6019d0b2dc8a6a62b55459f3ada5bc4e5e1a Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 22 Oct 2020 07:41:27 +0100 Subject: drm/i915/gt: Use the local HWSP offset during submission We wrap the timeline on construction of the next request, but there may still be requests in flight that have not yet finalized the breadcrumb. (The breadcrumb is delayed as we need engine-local offsets, and for the virtual engine that is not known until execution.) As such, by the time we write to the timeline's HWSP offset it may have changed, and we should use the value we preserved in the request instead. Though the window is small and infrequent (at full flow we can expect a timeline's seqno to wrap once every 30 minutes), the impact of writing the old seqno into the new HWSP is severe: the old requests are never completed, and the new requests are completed before they are even submitted. 
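
The heart of the fix is the new hwsp_offset() helper in the diff below, reproduced here with extra commentary:

	static u32 hwsp_offset(const struct i915_request *rq)
	{
		const struct intel_timeline_cacheline *cl;

		/*
		 * The cacheline (and so its GGTT offset) was fixed for this
		 * request when it was pinned; the timeline itself may have
		 * since wrapped onto a fresh cacheline.
		 */
		cl = rcu_dereference_protected(rq->hwsp_cacheline, 1);
		if (cl)
			return cl->ggtt_offset;

		/* No cacheline: the timeline's offset cannot change under us */
		return rcu_dereference_protected(rq->timeline, 1)->hwsp_offset;
	}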
Fixes: ebece7539242 ("drm/i915: Keep timeline HWSP allocated until idle across the system") Signed-off-by: Chris Wilson Cc: Tvrtko Ursulin Cc: Joonas Lahtinen Cc: # v5.2+ Reviewed-by: Mika Kuoppala Link: https://patchwork.freedesktop.org/patch/msgid/20201022064127.10159-1-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/gt/intel_lrc.c | 27 ++++++++++++++++++-------- drivers/gpu/drm/i915/gt/intel_timeline.c | 18 +++++++++-------- drivers/gpu/drm/i915/gt/intel_timeline_types.h | 2 ++ 3 files changed, 31 insertions(+), 16 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c index c22d47cc6701..d0be98b67138 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.c +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -3597,6 +3597,19 @@ static const struct intel_context_ops execlists_context_ops = { .destroy = execlists_context_destroy, }; +static u32 hwsp_offset(const struct i915_request *rq) +{ + const struct intel_timeline_cacheline *cl; + + /* Before the request is executed, the timeline/cachline is fixed */ + + cl = rcu_dereference_protected(rq->hwsp_cacheline, 1); + if (cl) + return cl->ggtt_offset; + + return rcu_dereference_protected(rq->timeline, 1)->hwsp_offset; +} + static int gen8_emit_init_breadcrumb(struct i915_request *rq) { u32 *cs; @@ -3619,7 +3632,7 @@ static int gen8_emit_init_breadcrumb(struct i915_request *rq) *cs++ = MI_NOOP; *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; - *cs++ = i915_request_timeline(rq)->hwsp_offset; + *cs++ = hwsp_offset(rq); *cs++ = 0; *cs++ = rq->fence.seqno - 1; @@ -4939,11 +4952,9 @@ gen8_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs) return gen8_emit_wa_tail(request, cs); } -static u32 *emit_xcs_breadcrumb(struct i915_request *request, u32 *cs) +static u32 *emit_xcs_breadcrumb(struct i915_request *rq, u32 *cs) { - u32 addr = i915_request_active_timeline(request)->hwsp_offset; - - return gen8_emit_ggtt_write(cs, request->fence.seqno, addr, 0); + return gen8_emit_ggtt_write(cs, rq->fence.seqno, hwsp_offset(rq), 0); } static u32 *gen8_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs) @@ -4962,7 +4973,7 @@ static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) /* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */ cs = gen8_emit_ggtt_write_rcs(cs, request->fence.seqno, - i915_request_active_timeline(request)->hwsp_offset, + hwsp_offset(request), PIPE_CONTROL_FLUSH_ENABLE | PIPE_CONTROL_CS_STALL); @@ -4974,7 +4985,7 @@ gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) { cs = gen8_emit_ggtt_write_rcs(cs, request->fence.seqno, - i915_request_active_timeline(request)->hwsp_offset, + hwsp_offset(request), PIPE_CONTROL_CS_STALL | PIPE_CONTROL_TILE_CACHE_FLUSH | PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | @@ -5044,7 +5055,7 @@ gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) { cs = gen12_emit_ggtt_write_rcs(cs, request->fence.seqno, - i915_request_active_timeline(request)->hwsp_offset, + hwsp_offset(request), PIPE_CONTROL0_HDC_PIPELINE_FLUSH, PIPE_CONTROL_CS_STALL | PIPE_CONTROL_TILE_CACHE_FLUSH | diff --git a/drivers/gpu/drm/i915/gt/intel_timeline.c b/drivers/gpu/drm/i915/gt/intel_timeline.c index a2f74cefe4c3..7ea94d201fe6 100644 --- a/drivers/gpu/drm/i915/gt/intel_timeline.c +++ b/drivers/gpu/drm/i915/gt/intel_timeline.c @@ -188,10 +188,14 @@ cacheline_alloc(struct intel_timeline_hwsp *hwsp, unsigned int cacheline) return cl; } -static void cacheline_acquire(struct intel_timeline_cacheline *cl) +static void 
cacheline_acquire(struct intel_timeline_cacheline *cl, + u32 ggtt_offset) { - if (cl) - i915_active_acquire(&cl->active); + if (!cl) + return; + + cl->ggtt_offset = ggtt_offset; + i915_active_acquire(&cl->active); } static void cacheline_release(struct intel_timeline_cacheline *cl) @@ -340,7 +344,7 @@ int intel_timeline_pin(struct intel_timeline *tl, struct i915_gem_ww_ctx *ww) GT_TRACE(tl->gt, "timeline:%llx using HWSP offset:%x\n", tl->fence_context, tl->hwsp_offset); - cacheline_acquire(tl->hwsp_cacheline); + cacheline_acquire(tl->hwsp_cacheline, tl->hwsp_offset); if (atomic_fetch_inc(&tl->pin_count)) { cacheline_release(tl->hwsp_cacheline); __i915_vma_unpin(tl->hwsp_ggtt); @@ -515,7 +519,7 @@ __intel_timeline_get_seqno(struct intel_timeline *tl, GT_TRACE(tl->gt, "timeline:%llx using HWSP offset:%x\n", tl->fence_context, tl->hwsp_offset); - cacheline_acquire(cl); + cacheline_acquire(cl, tl->hwsp_offset); tl->hwsp_cacheline = cl; *seqno = timeline_advance(tl); @@ -573,9 +577,7 @@ int intel_timeline_read_hwsp(struct i915_request *from, if (err) goto out; - *hwsp = i915_ggtt_offset(cl->hwsp->vma) + - ptr_unmask_bits(cl->vaddr, CACHELINE_BITS) * CACHELINE_BYTES; - + *hwsp = cl->ggtt_offset; out: i915_active_release(&cl->active); return err; diff --git a/drivers/gpu/drm/i915/gt/intel_timeline_types.h b/drivers/gpu/drm/i915/gt/intel_timeline_types.h index 02181c5020db..4474f487f589 100644 --- a/drivers/gpu/drm/i915/gt/intel_timeline_types.h +++ b/drivers/gpu/drm/i915/gt/intel_timeline_types.h @@ -94,6 +94,8 @@ struct intel_timeline_cacheline { struct intel_timeline_hwsp *hwsp; void *vaddr; + u32 ggtt_offset; + struct rcu_head rcu; }; -- cgit v1.2.3 From 6e7a21e7ab27dd5e42e212155200c1c84bbadbc7 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Wed, 21 Oct 2020 23:04:11 +0100 Subject: drm/i915/selftests: Exercise intel_timeline_read_hwsp() intel_timeline_read_hwsp() is used to support semaphore waits between engines, which may themselves be deferred for arbitrary periods -- that is, the read of the target request's HWSP is at an indeterminate point in the future. To support this, we need to prevent overwriting a HWSP that is being watched across a seqno wrap (otherwise the next request will write its value into the old HWSP, preventing the watcher from making progress, ad infinitum.) To simulate the observer across a wrap, let's create a request that reads from the HWSP and dispatch it at different points around a wrap to see if the value is lost.
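
In outline, each iteration of the selftest below does the following (names taken from the diff):

	/*
	 * 1. watcher[0] is submitted ahead of rq and SRMs the HWSP: its
	 *    sampled value must be older than rq->fence.seqno (cmp_lt).
	 * 2. rq writes rq->fence.seqno into the HWSP, then the timeline
	 *    is forcibly wrapped via wrap_timeline().
	 * 3. watcher[1] is fenced to run only after the wrap and SRMs the
	 *    same HWSP: it must still observe a value >= rq->fence.seqno
	 *    (cmp_gte), proving the watched cacheline was not recycled.
	 */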
Signed-off-by: Chris Wilson Reviewed-by: Mika Kuoppala Link: https://patchwork.freedesktop.org/patch/msgid/20201021220411.5777-2-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/gt/selftest_timeline.c | 378 +++++++++++++++++++++++++++- 1 file changed, 376 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/selftest_timeline.c b/drivers/gpu/drm/i915/gt/selftest_timeline.c index 96d164a3841d..d5ea6a91bc9e 100644 --- a/drivers/gpu/drm/i915/gt/selftest_timeline.c +++ b/drivers/gpu/drm/i915/gt/selftest_timeline.c @@ -17,8 +17,9 @@ #include "../selftests/i915_random.h" #include "../i915_selftest.h" -#include "../selftests/igt_flush_test.h" -#include "../selftests/mock_gem_device.h" +#include "selftests/igt_flush_test.h" +#include "selftests/lib_sw_fence.h" +#include "selftests/mock_gem_device.h" #include "selftests/mock_timeline.h" static struct page *hwsp_page(struct intel_timeline *tl) @@ -755,6 +756,378 @@ out_free: return err; } +static int emit_read_hwsp(struct i915_request *rq, + u32 seqno, u32 hwsp, + u32 *addr) +{ + const u32 gpr = i915_mmio_reg_offset(GEN8_RING_CS_GPR(rq->engine->mmio_base, 0)); + u32 *cs; + + cs = intel_ring_begin(rq, 12); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; + *cs++ = *addr; + *cs++ = 0; + *cs++ = seqno; + *addr += 4; + + *cs++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_USE_GGTT; + *cs++ = gpr; + *cs++ = hwsp; + *cs++ = 0; + + *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT; + *cs++ = gpr; + *cs++ = *addr; + *cs++ = 0; + *addr += 4; + + intel_ring_advance(rq, cs); + + return 0; +} + +struct hwsp_watcher { + struct i915_vma *vma; + struct i915_request *rq; + u32 addr; + u32 *map; +}; + +static bool cmp_lt(u32 a, u32 b) +{ + return a < b; +} + +static bool cmp_gte(u32 a, u32 b) +{ + return a >= b; +} + +static int setup_watcher(struct hwsp_watcher *w, struct intel_gt *gt) +{ + struct drm_i915_gem_object *obj; + struct i915_vma *vma; + + obj = i915_gem_object_create_internal(gt->i915, SZ_2M); + if (IS_ERR(obj)) + return PTR_ERR(obj); + + w->map = i915_gem_object_pin_map(obj, I915_MAP_WB); + if (IS_ERR(w->map)) { + i915_gem_object_put(obj); + return PTR_ERR(w->map); + } + + vma = i915_gem_object_ggtt_pin_ww(obj, NULL, NULL, 0, 0, 0); + if (IS_ERR(vma)) { + i915_gem_object_put(obj); + return PTR_ERR(vma); + } + + w->vma = vma; + w->addr = i915_ggtt_offset(vma); + return 0; +} + +static int create_watcher(struct hwsp_watcher *w, + struct intel_engine_cs *engine, + int ringsz) +{ + struct intel_context *ce; + struct intel_timeline *tl; + + ce = intel_context_create(engine); + if (IS_ERR(ce)) + return PTR_ERR(ce); + + ce->ring = __intel_context_ring_size(ringsz); + w->rq = intel_context_create_request(ce); + intel_context_put(ce); + if (IS_ERR(w->rq)) + return PTR_ERR(w->rq); + + w->addr = i915_ggtt_offset(w->vma); + tl = w->rq->context->timeline; + + /* some light mutex juggling required; think co-routines */ + lockdep_unpin_lock(&tl->mutex, w->rq->cookie); + mutex_unlock(&tl->mutex); + + return 0; +} + +static int check_watcher(struct hwsp_watcher *w, const char *name, + bool (*op)(u32 hwsp, u32 seqno)) +{ + struct i915_request *rq = fetch_and_zero(&w->rq); + struct intel_timeline *tl = rq->context->timeline; + u32 offset, end; + int err; + + GEM_BUG_ON(w->addr - i915_ggtt_offset(w->vma) > w->vma->size); + + i915_request_get(rq); + mutex_lock(&tl->mutex); + rq->cookie = lockdep_pin_lock(&tl->mutex); + i915_request_add(rq); + + if (i915_request_wait(rq, 0, HZ) < 0) { + err = -ETIME; + goto out; + } + + err = 0; + 
offset = 0; + end = (w->addr - i915_ggtt_offset(w->vma)) / sizeof(*w->map); + while (offset < end) { + if (!op(w->map[offset + 1], w->map[offset])) { + pr_err("Watcher '%s' found HWSP value %x for seqno %x\n", + name, w->map[offset + 1], w->map[offset]); + err = -EINVAL; + } + + offset += 2; + } + +out: + i915_request_put(rq); + return err; +} + +static void cleanup_watcher(struct hwsp_watcher *w) +{ + if (w->rq) { + struct intel_timeline *tl = w->rq->context->timeline; + + mutex_lock(&tl->mutex); + w->rq->cookie = lockdep_pin_lock(&tl->mutex); + + i915_request_add(w->rq); + } + + i915_vma_unpin_and_release(&w->vma, I915_VMA_RELEASE_MAP); +} + +static bool retire_requests(struct intel_timeline *tl) +{ + struct i915_request *rq, *rn; + + mutex_lock(&tl->mutex); + list_for_each_entry_safe(rq, rn, &tl->requests, link) + if (!i915_request_retire(rq)) + break; + mutex_unlock(&tl->mutex); + + return !i915_active_fence_isset(&tl->last_request); +} + +static struct i915_request *wrap_timeline(struct i915_request *rq) +{ + struct intel_context *ce = rq->context; + struct intel_timeline *tl = ce->timeline; + u32 seqno = rq->fence.seqno; + + while (tl->seqno >= seqno) { /* Cause a wrap */ + i915_request_put(rq); + rq = intel_context_create_request(ce); + if (IS_ERR(rq)) + return rq; + + i915_request_get(rq); + i915_request_add(rq); + } + + i915_request_put(rq); + rq = intel_context_create_request(ce); + if (IS_ERR(rq)) + return rq; + + i915_request_get(rq); + i915_request_add(rq); + + return rq; +} + +static int live_hwsp_read(void *arg) +{ + struct intel_gt *gt = arg; + struct hwsp_watcher watcher[2] = {}; + struct intel_engine_cs *engine; + struct intel_timeline *tl; + enum intel_engine_id id; + int err = 0; + int i; + + /* + * If we take a reference to the HWSP for reading on the GPU, that + * read may be arbitrarily delayed (either by foreign fence or + * priority saturation) and a wrap can happen within 30 minutes. + * When the GPU read is finally submitted it should be correct, + * even across multiple wraps. 
+ */ + + if (INTEL_GEN(gt->i915) < 8) /* CS convenience [SRM/LRM] */ + return 0; + + tl = intel_timeline_create(gt); + if (IS_ERR(tl)) + return PTR_ERR(tl); + + if (!tl->hwsp_cacheline) + goto out_free; + + for (i = 0; i < ARRAY_SIZE(watcher); i++) { + err = setup_watcher(&watcher[i], gt); + if (err) + goto out; + } + + for_each_engine(engine, gt, id) { + struct intel_context *ce; + unsigned long count = 0; + IGT_TIMEOUT(end_time); + + /* Create a request we can use for remote reading of the HWSP */ + err = create_watcher(&watcher[1], engine, SZ_512K); + if (err) + goto out; + + do { + struct i915_sw_fence *submit; + struct i915_request *rq; + u32 hwsp; + + submit = heap_fence_create(GFP_KERNEL); + if (!submit) { + err = -ENOMEM; + goto out; + } + + err = create_watcher(&watcher[0], engine, SZ_4K); + if (err) + goto out; + + ce = intel_context_create(engine); + if (IS_ERR(ce)) { + err = PTR_ERR(ce); + goto out; + } + + /* Skip to the end, saving 30 minutes of nops */ + tl->seqno = -10u + 2 * (count & 3); + WRITE_ONCE(*(u32 *)tl->hwsp_seqno, tl->seqno); + ce->timeline = intel_timeline_get(tl); + + rq = intel_context_create_request(ce); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + intel_context_put(ce); + goto out; + } + + err = i915_sw_fence_await_dma_fence(&rq->submit, + &watcher[0].rq->fence, 0, + GFP_KERNEL); + if (err < 0) { + i915_request_add(rq); + intel_context_put(ce); + goto out; + } + + mutex_lock(&watcher[0].rq->context->timeline->mutex); + err = intel_timeline_read_hwsp(rq, watcher[0].rq, &hwsp); + if (err == 0) + err = emit_read_hwsp(watcher[0].rq, /* before */ + rq->fence.seqno, hwsp, + &watcher[0].addr); + mutex_unlock(&watcher[0].rq->context->timeline->mutex); + if (err) { + i915_request_add(rq); + intel_context_put(ce); + goto out; + } + + mutex_lock(&watcher[1].rq->context->timeline->mutex); + err = intel_timeline_read_hwsp(rq, watcher[1].rq, &hwsp); + if (err == 0) + err = emit_read_hwsp(watcher[1].rq, /* after */ + rq->fence.seqno, hwsp, + &watcher[1].addr); + mutex_unlock(&watcher[1].rq->context->timeline->mutex); + if (err) { + i915_request_add(rq); + intel_context_put(ce); + goto out; + } + + i915_request_get(rq); + i915_request_add(rq); + + rq = wrap_timeline(rq); + intel_context_put(ce); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto out; + } + + err = i915_sw_fence_await_dma_fence(&watcher[1].rq->submit, + &rq->fence, 0, + GFP_KERNEL); + if (err < 0) { + i915_request_put(rq); + goto out; + } + + err = check_watcher(&watcher[0], "before", cmp_lt); + i915_sw_fence_commit(submit); + heap_fence_put(submit); + if (err) { + i915_request_put(rq); + goto out; + } + count++; + + if (8 * watcher[1].rq->ring->emit > + 3 * watcher[1].rq->ring->size) { + i915_request_put(rq); + break; + } + + /* Flush the timeline before manually wrapping again */ + if (i915_request_wait(rq, + I915_WAIT_INTERRUPTIBLE, + HZ) < 0) { + err = -ETIME; + i915_request_put(rq); + goto out; + } + + retire_requests(tl); + i915_request_put(rq); + } while (!__igt_timeout(end_time, NULL)); + WRITE_ONCE(*(u32 *)tl->hwsp_seqno, 0xdeadbeef); + + pr_info("%s: simulated %lu wraps\n", engine->name, count); + err = check_watcher(&watcher[1], "after", cmp_gte); + if (err) + goto out; + } + +out: + for (i = 0; i < ARRAY_SIZE(watcher); i++) + cleanup_watcher(&watcher[i]); + + if (igt_flush_test(gt->i915)) + err = -EIO; + +out_free: + intel_timeline_put(tl); + return err; +} + static int live_hwsp_rollover_kernel(void *arg) { struct intel_gt *gt = arg; @@ -998,6 +1371,7 @@ int intel_timeline_live_selftests(struct 
drm_i915_private *i915) SUBTEST(live_hwsp_engine), SUBTEST(live_hwsp_alternate), SUBTEST(live_hwsp_wrap), + SUBTEST(live_hwsp_read), SUBTEST(live_hwsp_rollover_kernel), SUBTEST(live_hwsp_rollover_user), }; -- cgit v1.2.3 From c071ab8c20748873183d5f210aa105d6a3e2d56a Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 27 Oct 2020 18:47:59 +0000 Subject: drm/i915/gem: Avoid synchronous binds deep within locks On bxt, we require a VT'd w/a to serialise all GGTT updates with memory transfers, and use stop_machine() for this purpose. stop_machine() is a global serialisation barrier and so dangerous to use from within critical sections, as the stop_machine() will wait for all cpus to enter the stop_machine callback, and those cpus may be waiting for the critical section already held. Fixes: d7085b0faac8 ("drm/i915/gem: Poison stolen pages before use") Signed-off-by: Chris Wilson Cc: Mika Kuoppala Reviewed-by: Mika Kuoppala Link: https://patchwork.freedesktop.org/patch/msgid/20201027184759.29888-1-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/gem/i915_gem_stolen.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_stolen.c b/drivers/gpu/drm/i915/gem/i915_gem_stolen.c index 78f64a2724e2..69a37a1fcff2 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_stolen.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_stolen.c @@ -505,6 +505,9 @@ static void dbg_poison(struct i915_ggtt *ggtt, if (!drm_mm_node_allocated(&ggtt->error_capture)) return; + if (ggtt->vm.bind_async_flags & I915_VMA_GLOBAL_BIND) + return; /* beware stop_machine() inversion */ + GEM_BUG_ON(!IS_ALIGNED(size, PAGE_SIZE)); mutex_lock(&ggtt->error_mutex); -- cgit v1.2.3 From c784e5249e773689e38d2bc1749f08b986621a26 Mon Sep 17 00:00:00 2001 From: John Harrison Date: Wed, 28 Oct 2020 07:58:24 -0700 Subject: drm/i915/guc: Update to use firmware v49.0.1 The latest GuC firmware includes a number of interface changes that require driver updates to match. * Starting from Gen11, the ID to be provided to GuC needs to contain the engine class in bits [0..2] and the instance in bits [3..6]. NOTE: this patch breaks pointer dereferences in some existing GuC functions that use the guc_id to dereference arrays but these functions are not used for now as we have GuC submission disabled and we will update these functions in follow up patch which requires new IDs. * The new GuC requires the additional data structure (ADS) and associated 'private_data' pointer to be setup. This is basically a scratch area of memory that the GuC owns. The size is read from the CSS header. * There is now a physical to logical engine mapping table in the ADS which needs to be configured in order for the firmware to load. For now, the table is initialised with a 1 to 1 mapping. * GUC_CTL_CTXINFO has been removed from the initialization params. * reg_state_buffer is maintained internally by the GuC as part of the private data. * The ADS layout has changed significantly. This patch updates the shared structure and also adds better documentation of the layout. * While i915 does not use GuC doorbells, the firmware now requires that some initialisation is done. * The number of engine classes and instances supported in the ADS has been increased. 
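To make the new ID scheme concrete, the encoding is easy to exercise in isolation. Below is a minimal userspace sketch built around the same MAKE_GUC_ID packing this patch adds to intel_guc_fwif.h; the class/instance values fed into it are purely illustrative:

#include <stdio.h>
#include <stdint.h>

/* Encoding per this patch: class in bits [0..2], instance in bits [3..6] */
#define GUC_ENGINE_CLASS_SHIFT 0
#define GUC_ENGINE_CLASS_MASK (0x7 << GUC_ENGINE_CLASS_SHIFT)
#define GUC_ENGINE_INSTANCE_SHIFT 3
#define GUC_ENGINE_INSTANCE_MASK (0xf << GUC_ENGINE_INSTANCE_SHIFT)

#define MAKE_GUC_ID(class, instance) \
        (((class) << GUC_ENGINE_CLASS_SHIFT) | \
         ((instance) << GUC_ENGINE_INSTANCE_SHIFT))

int main(void)
{
        /* e.g. class 1, instance 1 packs to 0b0001001 == 0x9 */
        uint8_t id = MAKE_GUC_ID(1, 1);

        printf("guc_id=%#x class=%u instance=%u\n", id,
               (id & GUC_ENGINE_CLASS_MASK) >> GUC_ENGINE_CLASS_SHIFT,
               (id & GUC_ENGINE_INSTANCE_MASK) >> GUC_ENGINE_INSTANCE_SHIFT);
        return 0;
}

Keeping the class in the low bits makes decode a plain mask-and-shift in both directions, which is what the GUC_ID_TO_ENGINE_* helpers below rely on.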
Signed-off-by: John Harrison Signed-off-by: Matthew Brost Signed-off-by: Daniele Ceraolo Spurio Signed-off-by: Oscar Mateo Signed-off-by: Michel Thierry Signed-off-by: Rodrigo Vivi Signed-off-by: Michal Wajdeczko Cc: Michal Winiarski Cc: Tomasz Lis Cc: Joonas Lahtinen Reviewed-by: Daniele Ceraolo Spurio Signed-off-by: Joonas Lahtinen Link: https://patchwork.freedesktop.org/patch/msgid/20201028145826.2949180-2-John.C.Harrison@Intel.com --- drivers/gpu/drm/i915/gt/intel_engine_cs.c | 3 +- drivers/gpu/drm/i915/gt/uc/intel_guc.c | 18 ---- drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c | 131 ++++++++++++++++++++++----- drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h | 80 ++++++++-------- drivers/gpu/drm/i915/gt/uc/intel_guc_reg.h | 5 + drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c | 27 +++--- drivers/gpu/drm/i915/gt/uc/intel_uc_fw.h | 2 + drivers/gpu/drm/i915/gt/uc/intel_uc_fw_abi.h | 6 +- 8 files changed, 176 insertions(+), 96 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c index 1579a80bc8cb..b80d7285db2f 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c @@ -305,8 +305,9 @@ static int intel_engine_setup(struct intel_gt *gt, enum intel_engine_id id) engine->i915 = i915; engine->gt = gt; engine->uncore = gt->uncore; - engine->hw_id = engine->guc_id = info->hw_id; engine->mmio_base = __engine_mmio_base(i915, info->mmio_bases); + engine->hw_id = info->hw_id; + engine->guc_id = MAKE_GUC_ID(info->class, info->instance); engine->class = info->class; engine->instance = info->instance; diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.c b/drivers/gpu/drm/i915/gt/uc/intel_guc.c index 942c7c187adb..6909da1e1a73 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.c @@ -213,23 +213,6 @@ static u32 guc_ctl_feature_flags(struct intel_guc *guc) return flags; } -static u32 guc_ctl_ctxinfo_flags(struct intel_guc *guc) -{ - u32 flags = 0; - - if (intel_guc_submission_is_used(guc)) { - u32 ctxnum, base; - - base = intel_guc_ggtt_offset(guc, guc->stage_desc_pool); - ctxnum = GUC_MAX_STAGE_DESCRIPTORS / 16; - - base >>= PAGE_SHIFT; - flags |= (base << GUC_CTL_BASE_ADDR_SHIFT) | - (ctxnum << GUC_CTL_CTXNUM_IN16_SHIFT); - } - return flags; -} - static u32 guc_ctl_log_params_flags(struct intel_guc *guc) { u32 offset = intel_guc_ggtt_offset(guc, guc->log.vma) >> PAGE_SHIFT; @@ -291,7 +274,6 @@ static void guc_init_params(struct intel_guc *guc) BUILD_BUG_ON(sizeof(guc->params) != GUC_CTL_MAX_DWORDS * sizeof(u32)); - params[GUC_CTL_CTXINFO] = guc_ctl_ctxinfo_flags(guc); params[GUC_CTL_LOG_PARAMS] = guc_ctl_log_params_flags(guc); params[GUC_CTL_FEATURE] = guc_ctl_feature_flags(guc); params[GUC_CTL_DEBUG] = guc_ctl_debug_flags(guc); diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c index d44061033f23..7950d28beb8c 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c @@ -10,11 +10,52 @@ /* * The Additional Data Struct (ADS) has pointers for different buffers used by - * the GuC. One single gem object contains the ADS struct itself (guc_ads), the - * scheduling policies (guc_policies), a structure describing a collection of - * register sets (guc_mmio_reg_state) and some extra pages for the GuC to save - * its internal state for sleep. + * the GuC. 
One single gem object contains the ADS struct itself (guc_ads) and + * all the extra buffers indirectly linked via the ADS struct's entries. + * + * Layout of the ADS blob allocated for the GuC: + * + * +---------------------------------------+ <== base + * | guc_ads | + * +---------------------------------------+ + * | guc_policies | + * +---------------------------------------+ + * | guc_gt_system_info | + * +---------------------------------------+ + * | guc_clients_info | + * +---------------------------------------+ + * | guc_ct_pool_entry[size] | + * +---------------------------------------+ + * | padding | + * +---------------------------------------+ <== 4K aligned + * | private data | + * +---------------------------------------+ + * | padding | + * +---------------------------------------+ <== 4K aligned */ +struct __guc_ads_blob { + struct guc_ads ads; + struct guc_policies policies; + struct guc_gt_system_info system_info; + struct guc_clients_info clients_info; + struct guc_ct_pool_entry ct_pool[GUC_CT_POOL_SIZE]; +} __packed; + +static u32 guc_ads_private_data_size(struct intel_guc *guc) +{ + return PAGE_ALIGN(guc->fw.private_data_size); +} + +static u32 guc_ads_private_data_offset(struct intel_guc *guc) +{ + return PAGE_ALIGN(sizeof(struct __guc_ads_blob)); +} + +static u32 guc_ads_blob_size(struct intel_guc *guc) +{ + return guc_ads_private_data_offset(guc) + + guc_ads_private_data_size(guc); +} static void guc_policy_init(struct guc_policy *policy) { @@ -48,26 +89,37 @@ static void guc_ct_pool_entries_init(struct guc_ct_pool_entry *pool, u32 num) memset(pool, 0, num * sizeof(*pool)); } +static void guc_mapping_table_init(struct intel_gt *gt, + struct guc_gt_system_info *system_info) +{ + unsigned int i, j; + struct intel_engine_cs *engine; + enum intel_engine_id id; + + /* Table must be set to invalid values for entries not used */ + for (i = 0; i < GUC_MAX_ENGINE_CLASSES; ++i) + for (j = 0; j < GUC_MAX_INSTANCES_PER_CLASS; ++j) + system_info->mapping_table[i][j] = + GUC_MAX_INSTANCES_PER_CLASS; + + for_each_engine(engine, gt, id) { + u8 guc_class = engine->class; + + system_info->mapping_table[guc_class][engine->instance] = + engine->instance; + } +} + /* * The first 80 dwords of the register state context, containing the * execlists and ppgtt registers. 
*/ #define LR_HW_CONTEXT_SIZE (80 * sizeof(u32)) -/* The ads obj includes the struct itself and buffers passed to GuC */ -struct __guc_ads_blob { - struct guc_ads ads; - struct guc_policies policies; - struct guc_mmio_reg_state reg_state; - struct guc_gt_system_info system_info; - struct guc_clients_info clients_info; - struct guc_ct_pool_entry ct_pool[GUC_CT_POOL_SIZE]; - u8 reg_state_buffer[GUC_S3_SAVE_SPACE_PAGES * PAGE_SIZE]; -} __packed; - static void __guc_ads_init(struct intel_guc *guc) { struct intel_gt *gt = guc_to_gt(guc); + struct drm_i915_private *i915 = gt->i915; struct __guc_ads_blob *blob = guc->ads_blob; const u32 skipped_size = LRC_PPHWSP_SZ * PAGE_SIZE + LR_HW_CONTEXT_SIZE; u32 base; @@ -99,13 +151,25 @@ static void __guc_ads_init(struct intel_guc *guc) } /* System info */ - blob->system_info.slice_enabled = hweight8(gt->info.sseu.slice_mask); - blob->system_info.rcs_enabled = 1; - blob->system_info.bcs_enabled = 1; + blob->system_info.engine_enabled_masks[RENDER_CLASS] = 1; + blob->system_info.engine_enabled_masks[COPY_ENGINE_CLASS] = 1; + blob->system_info.engine_enabled_masks[VIDEO_DECODE_CLASS] = VDBOX_MASK(gt); + blob->system_info.engine_enabled_masks[VIDEO_ENHANCEMENT_CLASS] = VEBOX_MASK(gt); + + blob->system_info.generic_gt_sysinfo[GUC_GENERIC_GT_SYSINFO_SLICE_ENABLED] = + hweight8(gt->info.sseu.slice_mask); + blob->system_info.generic_gt_sysinfo[GUC_GENERIC_GT_SYSINFO_VDBOX_SFC_SUPPORT_MASK] = + gt->info.vdbox_sfc_access; + + if (INTEL_GEN(i915) >= 12 && !IS_DGFX(i915)) { + u32 distdbreg = intel_uncore_read(gt->uncore, + GEN12_DIST_DBS_POPULATED); + blob->system_info.generic_gt_sysinfo[GUC_GENERIC_GT_SYSINFO_DOORBELL_COUNT_PER_SQIDI] = + ((distdbreg >> GEN12_DOORBELLS_PER_SQIDI_SHIFT) & + GEN12_DOORBELLS_PER_SQIDI) + 1; + } - blob->system_info.vdbox_enable_mask = VDBOX_MASK(gt); - blob->system_info.vebox_enable_mask = VEBOX_MASK(gt); - blob->system_info.vdbox_sfc_support_mask = gt->info.vdbox_sfc_access; + guc_mapping_table_init(guc_to_gt(guc), &blob->system_info); base = intel_guc_ggtt_offset(guc, guc->ads_vma); @@ -118,11 +182,12 @@ static void __guc_ads_init(struct intel_guc *guc) /* ADS */ blob->ads.scheduler_policies = base + ptr_offset(blob, policies); - blob->ads.reg_state_buffer = base + ptr_offset(blob, reg_state_buffer); - blob->ads.reg_state_addr = base + ptr_offset(blob, reg_state); blob->ads.gt_system_info = base + ptr_offset(blob, system_info); blob->ads.clients_info = base + ptr_offset(blob, clients_info); + /* Private Data */ + blob->ads.private_data = base + guc_ads_private_data_offset(guc); + i915_gem_object_flush_map(guc->ads_vma->obj); } @@ -135,14 +200,15 @@ static void __guc_ads_init(struct intel_guc *guc) */ int intel_guc_ads_create(struct intel_guc *guc) { - const u32 size = PAGE_ALIGN(sizeof(struct __guc_ads_blob)); + u32 size; int ret; GEM_BUG_ON(guc->ads_vma); + size = guc_ads_blob_size(guc); + ret = intel_guc_allocate_and_map_vma(guc, size, &guc->ads_vma, (void **)&guc->ads_blob); - if (ret) return ret; @@ -156,6 +222,18 @@ void intel_guc_ads_destroy(struct intel_guc *guc) i915_vma_unpin_and_release(&guc->ads_vma, I915_VMA_RELEASE_MAP); } +static void guc_ads_private_data_reset(struct intel_guc *guc) +{ + u32 size; + + size = guc_ads_private_data_size(guc); + if (!size) + return; + + memset((void *)guc->ads_blob + guc_ads_private_data_offset(guc), 0, + size); +} + /** * intel_guc_ads_reset() - prepares GuC Additional Data Struct for reuse * @guc: intel_guc struct @@ -168,5 +246,8 @@ void intel_guc_ads_reset(struct intel_guc *guc) { if 
(!guc->ads_vma) return; + __guc_ads_init(guc); + + guc_ads_private_data_reset(guc); } diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h index a6b733c146c9..79c560d9c0b6 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h @@ -26,8 +26,8 @@ #define GUC_VIDEO_ENGINE2 4 #define GUC_MAX_ENGINES_NUM (GUC_VIDEO_ENGINE2 + 1) -#define GUC_MAX_ENGINE_CLASSES 5 -#define GUC_MAX_INSTANCES_PER_CLASS 16 +#define GUC_MAX_ENGINE_CLASSES 16 +#define GUC_MAX_INSTANCES_PER_CLASS 32 #define GUC_DOORBELL_INVALID 256 @@ -62,12 +62,7 @@ #define GUC_STAGE_DESC_ATTR_PCH BIT(6) #define GUC_STAGE_DESC_ATTR_TERMINATED BIT(7) -/* New GuC control data */ -#define GUC_CTL_CTXINFO 0 -#define GUC_CTL_CTXNUM_IN16_SHIFT 0 -#define GUC_CTL_BASE_ADDR_SHIFT 12 - -#define GUC_CTL_LOG_PARAMS 1 +#define GUC_CTL_LOG_PARAMS 0 #define GUC_LOG_VALID (1 << 0) #define GUC_LOG_NOTIFY_ON_HALF_FULL (1 << 1) #define GUC_LOG_ALLOC_IN_MEGABYTE (1 << 3) @@ -79,11 +74,11 @@ #define GUC_LOG_ISR_MASK (0x7 << GUC_LOG_ISR_SHIFT) #define GUC_LOG_BUF_ADDR_SHIFT 12 -#define GUC_CTL_WA 2 -#define GUC_CTL_FEATURE 3 +#define GUC_CTL_WA 1 +#define GUC_CTL_FEATURE 2 #define GUC_CTL_DISABLE_SCHEDULER (1 << 14) -#define GUC_CTL_DEBUG 4 +#define GUC_CTL_DEBUG 3 #define GUC_LOG_VERBOSITY_SHIFT 0 #define GUC_LOG_VERBOSITY_LOW (0 << GUC_LOG_VERBOSITY_SHIFT) #define GUC_LOG_VERBOSITY_MED (1 << GUC_LOG_VERBOSITY_SHIFT) @@ -97,12 +92,37 @@ #define GUC_LOG_DISABLED (1 << 6) #define GUC_PROFILE_ENABLED (1 << 7) -#define GUC_CTL_ADS 5 +#define GUC_CTL_ADS 4 #define GUC_ADS_ADDR_SHIFT 1 #define GUC_ADS_ADDR_MASK (0xFFFFF << GUC_ADS_ADDR_SHIFT) #define GUC_CTL_MAX_DWORDS (SOFT_SCRATCH_COUNT - 2) /* [1..14] */ +/* Generic GT SysInfo data types */ +#define GUC_GENERIC_GT_SYSINFO_SLICE_ENABLED 0 +#define GUC_GENERIC_GT_SYSINFO_VDBOX_SFC_SUPPORT_MASK 1 +#define GUC_GENERIC_GT_SYSINFO_DOORBELL_COUNT_PER_SQIDI 2 +#define GUC_GENERIC_GT_SYSINFO_MAX 16 + +/* + * The class goes in bits [0..2] of the GuC ID, the instance in bits [3..6]. + * Bit 7 can be used for operations that apply to all engine classes&instances. + */ +#define GUC_ENGINE_CLASS_SHIFT 0 +#define GUC_ENGINE_CLASS_MASK (0x7 << GUC_ENGINE_CLASS_SHIFT) +#define GUC_ENGINE_INSTANCE_SHIFT 3 +#define GUC_ENGINE_INSTANCE_MASK (0xf << GUC_ENGINE_INSTANCE_SHIFT) +#define GUC_ENGINE_ALL_INSTANCES BIT(7) + +#define MAKE_GUC_ID(class, instance) \ + (((class) << GUC_ENGINE_CLASS_SHIFT) | \ + ((instance) << GUC_ENGINE_INSTANCE_SHIFT)) + +#define GUC_ID_TO_ENGINE_CLASS(guc_id) \ + (((guc_id) & GUC_ENGINE_CLASS_MASK) >> GUC_ENGINE_CLASS_SHIFT) +#define GUC_ID_TO_ENGINE_INSTANCE(guc_id) \ + (((guc_id) & GUC_ENGINE_INSTANCE_MASK) >> GUC_ENGINE_INSTANCE_SHIFT) + /* Work item for submitting workloads into work queue of GuC. 
*/ struct guc_wq_item { u32 header; @@ -336,11 +356,6 @@ struct guc_policies { } __packed; /* GuC MMIO reg state struct */ - - -#define GUC_REGSET_MAX_REGISTERS 64 -#define GUC_S3_SAVE_SPACE_PAGES 10 - struct guc_mmio_reg { u32 offset; u32 value; @@ -348,28 +363,18 @@ struct guc_mmio_reg { #define GUC_REGSET_MASKED (1 << 0) } __packed; -struct guc_mmio_regset { - struct guc_mmio_reg registers[GUC_REGSET_MAX_REGISTERS]; - u32 values_valid; - u32 number_of_registers; -} __packed; - /* GuC register sets */ -struct guc_mmio_reg_state { - struct guc_mmio_regset engine_reg[GUC_MAX_ENGINE_CLASSES][GUC_MAX_INSTANCES_PER_CLASS]; - u32 reserved[98]; +struct guc_mmio_reg_set { + u32 address; + u16 count; + u16 reserved; } __packed; /* HW info */ struct guc_gt_system_info { - u32 slice_enabled; - u32 rcs_enabled; - u32 reserved0; - u32 bcs_enabled; - u32 vdbox_enable_mask; - u32 vdbox_sfc_support_mask; - u32 vebox_enable_mask; - u32 reserved[9]; + u8 mapping_table[GUC_MAX_ENGINE_CLASSES][GUC_MAX_INSTANCES_PER_CLASS]; + u32 engine_enabled_masks[GUC_MAX_ENGINE_CLASSES]; + u32 generic_gt_sysinfo[GUC_GENERIC_GT_SYSINFO_MAX]; } __packed; /* Clients info */ @@ -390,15 +395,16 @@ struct guc_clients_info { /* GuC Additional Data Struct */ struct guc_ads { - u32 reg_state_addr; - u32 reg_state_buffer; + struct guc_mmio_reg_set reg_state_list[GUC_MAX_ENGINE_CLASSES][GUC_MAX_INSTANCES_PER_CLASS]; + u32 reserved0; u32 scheduler_policies; u32 gt_system_info; u32 clients_info; u32 control_data; u32 golden_context_lrca[GUC_MAX_ENGINE_CLASSES]; u32 eng_state_size[GUC_MAX_ENGINE_CLASSES]; - u32 reserved[16]; + u32 private_data; + u32 reserved[15]; } __packed; /* GuC logging structures */ diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_reg.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_reg.h index 1949346e714e..b37fc2ffaef2 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_reg.h +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_reg.h @@ -118,6 +118,11 @@ struct guc_doorbell_info { #define GEN8_DRB_VALID (1<<0) #define GEN8_DRBREGU(x) _MMIO(0x1000 + (x) * 8 + 4) +#define GEN12_DIST_DBS_POPULATED _MMIO(0xd08) +#define GEN12_DOORBELLS_PER_SQIDI_SHIFT 16 +#define GEN12_DOORBELLS_PER_SQIDI (0xff) +#define GEN12_SQIDIS_DOORBELL_EXIST (0xffff) + #define DE_GUCRMR _MMIO(0x44054) #define GUC_BCS_RCS_IER _MMIO(0xC550) diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c index 80e8b6c3bc8c..ee4ac3922277 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c @@ -44,23 +44,19 @@ void intel_uc_fw_change_status(struct intel_uc_fw *uc_fw, * List of required GuC and HuC binaries per-platform. * Must be ordered based on platform + revid, from newer to older. * - * TGL 35.2 is interface-compatible with 33.0 for previous Gens. The deltas - * between 33.0 and 35.2 are only related to new additions to support new Gen12 - * features. - * * Note that RKL uses the same firmware as TGL. 
*/ #define INTEL_UC_FIRMWARE_DEFS(fw_def, guc_def, huc_def) \ - fw_def(ROCKETLAKE, 0, guc_def(tgl, 35, 2, 0), huc_def(tgl, 7, 5, 0)) \ - fw_def(TIGERLAKE, 0, guc_def(tgl, 35, 2, 0), huc_def(tgl, 7, 5, 0)) \ - fw_def(ELKHARTLAKE, 0, guc_def(ehl, 33, 0, 4), huc_def(ehl, 9, 0, 0)) \ - fw_def(ICELAKE, 0, guc_def(icl, 33, 0, 0), huc_def(icl, 9, 0, 0)) \ - fw_def(COMETLAKE, 5, guc_def(cml, 33, 0, 0), huc_def(cml, 4, 0, 0)) \ - fw_def(COFFEELAKE, 0, guc_def(kbl, 33, 0, 0), huc_def(kbl, 4, 0, 0)) \ - fw_def(GEMINILAKE, 0, guc_def(glk, 33, 0, 0), huc_def(glk, 4, 0, 0)) \ - fw_def(KABYLAKE, 0, guc_def(kbl, 33, 0, 0), huc_def(kbl, 4, 0, 0)) \ - fw_def(BROXTON, 0, guc_def(bxt, 33, 0, 0), huc_def(bxt, 2, 0, 0)) \ - fw_def(SKYLAKE, 0, guc_def(skl, 33, 0, 0), huc_def(skl, 2, 0, 0)) + fw_def(ROCKETLAKE, 0, guc_def(tgl, 49, 0, 1), huc_def(tgl, 7, 5, 0)) \ + fw_def(TIGERLAKE, 0, guc_def(tgl, 49, 0, 1), huc_def(tgl, 7, 5, 0)) \ + fw_def(ELKHARTLAKE, 0, guc_def(ehl, 49, 0, 1), huc_def(ehl, 9, 0, 0)) \ + fw_def(ICELAKE, 0, guc_def(icl, 49, 0, 1), huc_def(icl, 9, 0, 0)) \ + fw_def(COMETLAKE, 5, guc_def(cml, 49, 0, 1), huc_def(cml, 4, 0, 0)) \ + fw_def(COFFEELAKE, 0, guc_def(kbl, 49, 0, 1), huc_def(kbl, 4, 0, 0)) \ + fw_def(GEMINILAKE, 0, guc_def(glk, 49, 0, 1), huc_def(glk, 4, 0, 0)) \ + fw_def(KABYLAKE, 0, guc_def(kbl, 49, 0, 1), huc_def(kbl, 4, 0, 0)) \ + fw_def(BROXTON, 0, guc_def(bxt, 49, 0, 1), huc_def(bxt, 2, 0, 0)) \ + fw_def(SKYLAKE, 0, guc_def(skl, 49, 0, 1), huc_def(skl, 2, 0, 0)) #define __MAKE_UC_FW_PATH(prefix_, name_, major_, minor_, patch_) \ "i915/" \ @@ -371,6 +367,9 @@ int intel_uc_fw_fetch(struct intel_uc_fw *uc_fw) } } + if (uc_fw->type == INTEL_UC_FW_TYPE_GUC) + uc_fw->private_data_size = css->private_data_size; + obj = i915_gem_object_create_shmem_from_data(i915, fw->data, fw->size); if (IS_ERR(obj)) { err = PTR_ERR(obj); diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.h b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.h index 23d3a423ac0f..99bb1fe1af66 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.h +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.h @@ -88,6 +88,8 @@ struct intel_uc_fw { u32 rsa_size; u32 ucode_size; + + u32 private_data_size; }; #ifdef CONFIG_DRM_I915_DEBUG_GUC diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc_fw_abi.h b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw_abi.h index 029214cdedd5..e41ffc7a7fbc 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_uc_fw_abi.h +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw_abi.h @@ -69,7 +69,11 @@ struct uc_css_header { #define CSS_SW_VERSION_UC_MAJOR (0xFF << 16) #define CSS_SW_VERSION_UC_MINOR (0xFF << 8) #define CSS_SW_VERSION_UC_PATCH (0xFF << 0) - u32 reserved[14]; + u32 reserved0[13]; + union { + u32 private_data_size; /* only applies to GuC */ + u32 reserved1; + }; u32 header_info; } __packed; static_assert(sizeof(struct uc_css_header) == 128); -- cgit v1.2.3 From 164e57ca151b144c6952ffb7c52cf3bf81e766f8 Mon Sep 17 00:00:00 2001 From: John Harrison Date: Wed, 28 Oct 2020 07:58:25 -0700 Subject: drm/i915/guc: Improved reporting when GuC fails to load Rather than just saying 'GuC failed to load: -110', actually print out the GuC status register and break it down into the individual fields. 
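The field breakdown is easy to sanity-check outside the driver. Here is a self-contained sketch that decodes a raw GuC status dword along the same lines; the GS_* mask layouts are quoted from memory of intel_guc_reg.h and the sample value is arbitrary, so treat both as illustrative:

#include <stdio.h>
#include <stdint.h>

/* Approximate GS_* layout from intel_guc_reg.h (illustrative only) */
#define GS_MIA_IN_RESET     (0x01u << 0)
#define GS_BOOTROM_MASK     (0x7fu << 1)
#define GS_UKERNEL_MASK     (0xffu << 8)
#define GS_MIA_MASK         (0x07u << 16)
#define GS_AUTH_STATUS_MASK (0x03u << 30)

/* Same trick as REG_FIELD_GET: divide by the mask's lowest set bit */
static uint32_t field_get(uint32_t mask, uint32_t val)
{
        return (val & mask) / (mask & -mask);
}

int main(void)
{
        uint32_t status = 0x8002f034; /* arbitrary sample dword */

        printf("Reset = %u, BootROM = 0x%02x, UKernel = 0x%02x, "
               "MIA = 0x%02x, Auth = 0x%02x\n",
               field_get(GS_MIA_IN_RESET, status),
               field_get(GS_BOOTROM_MASK, status),
               field_get(GS_UKERNEL_MASK, status),
               field_get(GS_MIA_MASK, status),
               field_get(GS_AUTH_STATUS_MASK, status));
        return 0;
}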
Signed-off-by: John Harrison Reviewed-by: Daniele Ceraolo Spurio Signed-off-by: Joonas Lahtinen Link: https://patchwork.freedesktop.org/patch/msgid/20201028145826.2949180-3-John.C.Harrison@Intel.com --- drivers/gpu/drm/i915/gt/uc/intel_guc_fw.c | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fw.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_fw.c index d4a87f4c9421..f9d0907ea1a5 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fw.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fw.c @@ -76,6 +76,7 @@ static inline bool guc_ready(struct intel_uncore *uncore, u32 *status) static int guc_wait_ucode(struct intel_uncore *uncore) { + struct drm_device *drm = &uncore->i915->drm; u32 status; int ret; @@ -90,15 +91,27 @@ static int guc_wait_ucode(struct intel_uncore *uncore) ret = wait_for(guc_ready(uncore, &status), 100); DRM_DEBUG_DRIVER("GuC status %#x\n", status); - if ((status & GS_BOOTROM_MASK) == GS_BOOTROM_RSA_FAILED) { - DRM_ERROR("GuC firmware signature verification failed\n"); - ret = -ENOEXEC; - } - - if ((status & GS_UKERNEL_MASK) == GS_UKERNEL_EXCEPTION) { - DRM_ERROR("GuC firmware exception. EIP: %#x\n", - intel_uncore_read(uncore, SOFT_SCRATCH(13))); - ret = -ENXIO; + if (ret) { + drm_err(drm, "GuC load failed: status = 0x%08X\n", status); + drm_err(drm, "GuC load failed: status: Reset = %d, " + "BootROM = 0x%02X, UKernel = 0x%02X, " + "MIA = 0x%02X, Auth = 0x%02X\n", + REG_FIELD_GET(GS_MIA_IN_RESET, status), + REG_FIELD_GET(GS_BOOTROM_MASK, status), + REG_FIELD_GET(GS_UKERNEL_MASK, status), + REG_FIELD_GET(GS_MIA_MASK, status), + REG_FIELD_GET(GS_AUTH_STATUS_MASK, status)); + + if ((status & GS_BOOTROM_MASK) == GS_BOOTROM_RSA_FAILED) { + drm_err(drm, "GuC firmware signature verification failed\n"); + ret = -ENOEXEC; + } + + if ((status & GS_UKERNEL_MASK) == GS_UKERNEL_EXCEPTION) { + drm_err(drm, "GuC firmware exception. EIP: %#x\n", + intel_uncore_read(uncore, SOFT_SCRATCH(13))); + ret = -ENXIO; + } } return ret; -- cgit v1.2.3 From 0f41e31a7bdca9a1c51ae21ccfc569ccb9359e03 Mon Sep 17 00:00:00 2001 From: John Harrison Date: Wed, 28 Oct 2020 07:58:26 -0700 Subject: drm/i915/guc: Clear pointers on free Clear out some pointers when objects have been de-allocated. This makes it much easier to track down use-after-free type issues. 
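The idiom itself is generic; a minimal sketch, with the struct and function names invented for illustration:

#include <stdlib.h>
#include <string.h>

struct channel {
        void *buf;
        size_t size;
};

void channel_fini(struct channel *ch)
{
        free(ch->buf);
        /* Poison the bookkeeping: a stale user now faults on a NULL
         * dereference instead of silently reusing freed memory. */
        memset(ch, 0, sizeof(*ch));
}

The cost is one memset on a teardown path; the payoff is that a use-after-free becomes an immediate, easily-bisected crash.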
Signed-off-by: John Harrison Reviewed-by: Daniele Ceraolo Spurio Signed-off-by: Joonas Lahtinen Link: https://patchwork.freedesktop.org/patch/msgid/20201028145826.2949180-4-John.C.Harrison@Intel.com --- drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c | 1 + drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c index 7950d28beb8c..5212ff844292 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c @@ -220,6 +220,7 @@ int intel_guc_ads_create(struct intel_guc *guc) void intel_guc_ads_destroy(struct intel_guc *guc) { i915_vma_unpin_and_release(&guc->ads_vma, I915_VMA_RELEASE_MAP); + guc->ads_blob = NULL; } static void guc_ads_private_data_reset(struct intel_guc *guc) diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c index 11742fca0e9e..fa9e048cc65f 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c @@ -210,6 +210,7 @@ void intel_guc_ct_fini(struct intel_guc_ct *ct) GEM_BUG_ON(ct->enabled); i915_vma_unpin_and_release(&ct->vma, I915_VMA_RELEASE_MAP); + memset(ct, 0, sizeof(*ct)); } /** -- cgit v1.2.3 From 2739d8cfc50aafff49d599cc0a5bc855445e99a7 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 2 Nov 2020 22:10:56 +0000 Subject: drm/i915/gt: Expose more parameters for emitting writes into the ring Add another lower level to emit_ggtt_write so that the GGTT nature of the write is not hardcoded into the emitter. Signed-off-by: Chris Wilson Reviewed-by: Mika Kuoppala Link: https://patchwork.freedesktop.org/patch/msgid/20201102221057.29626-1-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/gt/intel_engine.h | 55 +++++++++++++++++++++------------- 1 file changed, 35 insertions(+), 20 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h b/drivers/gpu/drm/i915/gt/intel_engine.h index 7c3a1012e702..760fefdfe392 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine.h +++ b/drivers/gpu/drm/i915/gt/intel_engine.h @@ -245,22 +245,14 @@ static inline u32 *gen12_emit_pipe_control(u32 *batch, u32 flags0, u32 flags1, u } static inline u32 * -__gen8_emit_ggtt_write_rcs(u32 *cs, u32 value, u32 gtt_offset, u32 flags0, u32 flags1) +__gen8_emit_write_rcs(u32 *cs, u32 value, u32 offset, u32 flags0, u32 flags1) { - /* We're using qword write, offset should be aligned to 8 bytes. */ - GEM_BUG_ON(!IS_ALIGNED(gtt_offset, 8)); - - /* w/a for post sync ops following a GPGPU operation we - * need a prior CS_STALL, which is emitted by the flush - * following the batch. - */ *cs++ = GFX_OP_PIPE_CONTROL(6) | flags0; - *cs++ = flags1 | PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_GLOBAL_GTT_IVB; - *cs++ = gtt_offset; + *cs++ = flags1 | PIPE_CONTROL_QW_WRITE; + *cs++ = offset; *cs++ = 0; *cs++ = value; - /* We're thrashing one dword of HWS. */ - *cs++ = 0; + *cs++ = 0; /* We're thrashing one extra dword. */ return cs; } @@ -268,13 +260,38 @@ __gen8_emit_ggtt_write_rcs(u32 *cs, u32 value, u32 gtt_offset, u32 flags0, u32 f static inline u32* gen8_emit_ggtt_write_rcs(u32 *cs, u32 value, u32 gtt_offset, u32 flags) { - return __gen8_emit_ggtt_write_rcs(cs, value, gtt_offset, 0, flags); + /* We're using qword write, offset should be aligned to 8 bytes. 
*/ + GEM_BUG_ON(!IS_ALIGNED(gtt_offset, 8)); + + return __gen8_emit_write_rcs(cs, + value, + gtt_offset, + 0, + flags | PIPE_CONTROL_GLOBAL_GTT_IVB); } static inline u32* gen12_emit_ggtt_write_rcs(u32 *cs, u32 value, u32 gtt_offset, u32 flags0, u32 flags1) { - return __gen8_emit_ggtt_write_rcs(cs, value, gtt_offset, flags0, flags1); + /* We're using qword write, offset should be aligned to 8 bytes. */ + GEM_BUG_ON(!IS_ALIGNED(gtt_offset, 8)); + + return __gen8_emit_write_rcs(cs, + value, + gtt_offset, + flags0, + flags1 | PIPE_CONTROL_GLOBAL_GTT_IVB); +} + +static inline u32 * +__gen8_emit_flush_dw(u32 *cs, u32 value, u32 gtt_offset, u32 flags) +{ + *cs++ = (MI_FLUSH_DW + 1) | flags; + *cs++ = gtt_offset; + *cs++ = 0; + *cs++ = value; + + return cs; } static inline u32 * @@ -285,12 +302,10 @@ gen8_emit_ggtt_write(u32 *cs, u32 value, u32 gtt_offset, u32 flags) /* Offset should be aligned to 8 bytes for both (QW/DW) write types */ GEM_BUG_ON(!IS_ALIGNED(gtt_offset, 8)); - *cs++ = (MI_FLUSH_DW + 1) | MI_FLUSH_DW_OP_STOREDW | flags; - *cs++ = gtt_offset | MI_FLUSH_DW_USE_GTT; - *cs++ = 0; - *cs++ = value; - - return cs; + return __gen8_emit_flush_dw(cs, + value, + gtt_offset | MI_FLUSH_DW_USE_GTT, + flags | MI_FLUSH_DW_OP_STOREDW); } static inline void __intel_engine_reset(struct intel_engine_cs *engine, -- cgit v1.2.3 From 09212e81e5450743e5b06b27c4e344e4c45b630d Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 2 Nov 2020 22:10:57 +0000 Subject: drm/i915/gt: Flush xcs before tgl breadcrumbs In a simple test case that writes to scratch and then busy-waits for the batch to be signaled, we observe that the signal is before the write is posted. That is bad news. Splitting the flush + write_dword into two separate flush_dw prevents the issue from being reproduced, we can presume the post-sync op is not so post-sync. Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/216 Testcase: igt/gem_exec_fence/parallel Testcase: igt/i915_selftest/live/gt_timelines Signed-off-by: Chris Wilson Cc: Mika Kuoppala Cc: stable@vger.kernel.org Acked-by: Mika Kuoppala Link: https://patchwork.freedesktop.org/patch/msgid/20201102221057.29626-2-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/gt/intel_lrc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c index d0be98b67138..f3eb68a76a25 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.c +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -5047,7 +5047,9 @@ gen12_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs) static u32 *gen12_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs) { - return gen12_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs)); + /* XXX Stalling flush before seqno write; post-sync not */ + cs = emit_xcs_breadcrumb(rq, __gen8_emit_flush_dw(cs, 0, 0, 0)); + return gen12_emit_fini_breadcrumb_tail(rq, cs); } static u32 * -- cgit v1.2.3 From bc73e5d33048b7ab5f12b11b5d923700467a8e1d Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 2 Nov 2020 16:19:31 +0000 Subject: drm/i915: Hold onto an explicit ref to i915_vma_work.pinned Since __vma_release is run by a kworker after the fence has been signaled, it is no longer protected by the active reference on the vma, and so the alias of vw->pinned to vma->obj is also not protected by a reference on the object. Add an explicit reference for vw->pinned so it will always be safe. Found by inspection. 
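The rule being applied: a pointer stashed for asynchronous work must carry its own reference, since the worker can outlive every reference the caller held. A sketch of the pattern, with invented types and helpers standing in for i915_gem_object_get/put:

#include <stdlib.h>

struct obj {
        int refcount;
};

struct work {
        struct obj *pinned;
};

struct obj *obj_get(struct obj *o)
{
        o->refcount++;
        return o;
}

void obj_put(struct obj *o)
{
        if (--o->refcount == 0)
                free(o);
}

void work_queue(struct work *w, struct obj *o)
{
        /* The worker may run after the caller drops its reference,
         * so the stashed pointer must hold one of its own. */
        w->pinned = obj_get(o);
}

void work_release(struct work *w)
{
        obj_put(w->pinned); /* pairs with the get in work_queue() */
        w->pinned = NULL;
}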
Fixes: 54d7195f8c64 ("drm/i915: Unpin vma->obj on early error") Reported-by: Tvrtko Ursulin Signed-off-by: Chris Wilson Cc: Tvrtko Ursulin Cc: # v5.6+ Reviewed-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20201102161931.30031-1-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_vma.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c index ffb5287e055a..caa9b041616b 100644 --- a/drivers/gpu/drm/i915/i915_vma.c +++ b/drivers/gpu/drm/i915/i915_vma.c @@ -314,8 +314,10 @@ static void __vma_release(struct dma_fence_work *work) { struct i915_vma_work *vw = container_of(work, typeof(*vw), base); - if (vw->pinned) + if (vw->pinned) { __i915_gem_object_unpin_pages(vw->pinned); + i915_gem_object_put(vw->pinned); + } i915_vm_free_pt_stash(vw->vm, &vw->stash); i915_vm_put(vw->vm); @@ -431,7 +433,7 @@ int i915_vma_bind(struct i915_vma *vma, if (vma->obj) { __i915_gem_object_pin_pages(vma->obj); - work->pinned = vma->obj; + work->pinned = i915_gem_object_get(vma->obj); } } else { vma->ops->bind_vma(vma->vm, NULL, vma, cache_level, bind_flags); -- cgit v1.2.3 From c648ae338e927f8fb6dd274646e5825852b7fd63 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Wed, 7 Oct 2020 10:09:47 +0100 Subject: drm/i915/gem: Perform all asynchronous waits prior to marking payload start The initial breadcrumb marks the transition from context wait and setup into the request payload. We use the marker to determine if the request is merely waiting to begin, or is inside the payload and hung. Forgetting to include a breadcrumb before the user payload would mean we do not reset the guilty user request, and conversely if the initial breadcrumb is too early we blame the user for a problem elsewhere. Signed-off-by: Chris Wilson Reviewed-by: Mika Kuoppala Link: https://patchwork.freedesktop.org/patch/msgid/20201007090947.19950-1-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/gem/i915_gem_client_blt.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_client_blt.c b/drivers/gpu/drm/i915/gem/i915_gem_client_blt.c index 272cf3ea68d5..44821d94544f 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_client_blt.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_client_blt.c @@ -202,12 +202,6 @@ retry: if (unlikely(err)) goto out_request; - if (w->ce->engine->emit_init_breadcrumb) { - err = w->ce->engine->emit_init_breadcrumb(rq); - if (unlikely(err)) - goto out_request; - } - /* * w->dma is already exported via (vma|obj)->resv we need only * keep track of the GPU activity within this vma/request, and @@ -217,9 +211,15 @@ retry: if (err) goto out_request; - err = w->ce->engine->emit_bb_start(rq, - batch->node.start, batch->node.size, - 0); + if (rq->engine->emit_init_breadcrumb) { + err = rq->engine->emit_init_breadcrumb(rq); + if (unlikely(err)) + goto out_request; + } + + err = rq->engine->emit_bb_start(rq, + batch->node.start, batch->node.size, + 0); out_request: if (unlikely(err)) { i915_request_set_error_once(rq, err); -- cgit v1.2.3 From 0049b688459b846f819b6e51c24cd0781fcfde41 Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Thu, 5 Nov 2020 15:49:33 +0000 Subject: drm/i915/gem: Allow backends to override pread implementation As there are more and more complicated interactions between the different backing stores and userspace, push the control into the backends rather than accumulate them all inside the ioctl handlers. 
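The dispatch idiom used here, giving the backend first refusal and falling back to the generic path only on -ENODEV, can be sketched in isolation (all names below are invented for illustration):

#include <errno.h>
#include <string.h>

struct file;

struct file_ops {
        int (*pread)(struct file *f, void *buf, size_t len);
};

struct file {
        const struct file_ops *ops;
        const char *data; /* stand-in backing store */
};

int generic_pread(struct file *f, void *buf, size_t len)
{
        memcpy(buf, f->data, len); /* the common path */
        return 0;
}

int do_pread(struct file *f, void *buf, size_t len)
{
        int ret = -ENODEV;

        /* -ENODEV from the hook means "no opinion, use the common
         * path"; any other return value is final. */
        if (f->ops->pread)
                ret = f->ops->pread(f, buf, len);
        if (ret != -ENODEV)
                return ret;

        return generic_pread(f, buf, len);
}

Because any other error from the hook is treated as final, a backend can both implement the operation and veto it outright.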
Signed-off-by: Matthew Auld Reviewed-by: Chris Wilson Signed-off-by: Chris Wilson Link: https://patchwork.freedesktop.org/patch/msgid/20201105154934.16022-1-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/gem/i915_gem_object_types.h | 2 ++ drivers/gpu/drm/i915/i915_gem.c | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h index fedfebf13344..e2d9b7e1e152 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h +++ b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h @@ -56,6 +56,8 @@ struct drm_i915_gem_object_ops { void (*truncate)(struct drm_i915_gem_object *obj); void (*writeback)(struct drm_i915_gem_object *obj); + int (*pread)(struct drm_i915_gem_object *obj, + const struct drm_i915_gem_pread *arg); int (*pwrite)(struct drm_i915_gem_object *obj, const struct drm_i915_gem_pwrite *arg); diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index bb0c12975f38..d58fe1ddc3e1 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -527,6 +527,12 @@ i915_gem_pread_ioctl(struct drm_device *dev, void *data, trace_i915_gem_object_pread(obj, args->offset, args->size); + ret = -ENODEV; + if (obj->ops->pread) + ret = obj->ops->pread(obj, args); + if (ret != -ENODEV) + goto out; + ret = i915_gem_object_wait(obj, I915_WAIT_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); -- cgit v1.2.3 From 852e1b3644817f071427b83859b889c788a0cf69 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 5 Nov 2020 15:49:34 +0000 Subject: drm/i915/gem: Pull phys pread/pwrite implementations to the backend Move the specialised interactions with the physical GEM object from the pread/pwrite ioctl handler into the phys backend. Currently, if one is able to exhaust the entire aperture and then try to pwrite into an object not backed by struct page, we accidentally invoked the phys pwrite handler on a non-phys object; calamitous. Fixes: c6790dc22312 ("drm/i915: Wean off drm_pci_alloc/drm_pci_free") Testcase: igt/gem_pwrite/exhaustion Signed-off-by: Chris Wilson Reviewed-by: Matthew Auld Cc: stable@vger.kernel.org Link: https://patchwork.freedesktop.org/patch/msgid/20201105154934.16022-2-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/gem/i915_gem_phys.c | 55 ++++++++++++++++++++++++++++++++ drivers/gpu/drm/i915/i915_gem.c | 26 --------------- 2 files changed, 55 insertions(+), 26 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_phys.c b/drivers/gpu/drm/i915/gem/i915_gem_phys.c index 28147aab47b9..3a4dfe2ef1da 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_phys.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_phys.c @@ -134,6 +134,58 @@ i915_gem_object_put_pages_phys(struct drm_i915_gem_object *obj, vaddr, dma); } +static int +phys_pwrite(struct drm_i915_gem_object *obj, + const struct drm_i915_gem_pwrite *args) +{ + void *vaddr = sg_page(obj->mm.pages->sgl) + args->offset; + char __user *user_data = u64_to_user_ptr(args->data_ptr); + int err; + + err = i915_gem_object_wait(obj, + I915_WAIT_INTERRUPTIBLE | + I915_WAIT_ALL, + MAX_SCHEDULE_TIMEOUT); + if (err) + return err; + + /* + * We manually control the domain here and pretend that it + * remains coherent i.e. in the GTT domain, like shmem_pwrite. 
+ */ + i915_gem_object_invalidate_frontbuffer(obj, ORIGIN_CPU); + + if (copy_from_user(vaddr, user_data, args->size)) + return -EFAULT; + + drm_clflush_virt_range(vaddr, args->size); + intel_gt_chipset_flush(&to_i915(obj->base.dev)->gt); + + i915_gem_object_flush_frontbuffer(obj, ORIGIN_CPU); + return 0; +} + +static int +phys_pread(struct drm_i915_gem_object *obj, + const struct drm_i915_gem_pread *args) +{ + void *vaddr = sg_page(obj->mm.pages->sgl) + args->offset; + char __user *user_data = u64_to_user_ptr(args->data_ptr); + int err; + + err = i915_gem_object_wait(obj, + I915_WAIT_INTERRUPTIBLE, + MAX_SCHEDULE_TIMEOUT); + if (err) + return err; + + drm_clflush_virt_range(vaddr, args->size); + if (copy_to_user(user_data, vaddr, args->size)) + return -EFAULT; + + return 0; +} + static void phys_release(struct drm_i915_gem_object *obj) { fput(obj->base.filp); @@ -144,6 +196,9 @@ static const struct drm_i915_gem_object_ops i915_gem_phys_ops = { .get_pages = i915_gem_object_get_pages_phys, .put_pages = i915_gem_object_put_pages_phys, + .pread = phys_pread, + .pwrite = phys_pwrite, + .release = phys_release, }; diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index d58fe1ddc3e1..58276694c848 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -179,30 +179,6 @@ try_again: return ret; } -static int -i915_gem_phys_pwrite(struct drm_i915_gem_object *obj, - struct drm_i915_gem_pwrite *args, - struct drm_file *file) -{ - void *vaddr = sg_page(obj->mm.pages->sgl) + args->offset; - char __user *user_data = u64_to_user_ptr(args->data_ptr); - - /* - * We manually control the domain here and pretend that it - * remains coherent i.e. in the GTT domain, like shmem_pwrite. - */ - i915_gem_object_invalidate_frontbuffer(obj, ORIGIN_CPU); - - if (copy_from_user(vaddr, user_data, args->size)) - return -EFAULT; - - drm_clflush_virt_range(vaddr, args->size); - intel_gt_chipset_flush(&to_i915(obj->base.dev)->gt); - - i915_gem_object_flush_frontbuffer(obj, ORIGIN_CPU); - return 0; -} - static int i915_gem_create(struct drm_file *file, struct intel_memory_region *mr, @@ -872,8 +848,6 @@ i915_gem_pwrite_ioctl(struct drm_device *dev, void *data, if (ret == -EFAULT || ret == -ENOSPC) { if (i915_gem_object_has_struct_page(obj)) ret = i915_gem_shmem_pwrite(obj, args); - else - ret = i915_gem_phys_pwrite(obj, args, file); } i915_gem_object_unpin_pages(obj); -- cgit v1.2.3 From ad18fa0f5f052046cad96fee762b5c64f42dd86a Mon Sep 17 00:00:00 2001 From: Venkata Sandeep Dhanalakota Date: Thu, 5 Nov 2020 17:18:42 -0800 Subject: drm/i915: Correctly set SFC capability for video engines SFC capability of video engines is not set correctly because i915 is testing for incorrect bits. Fixes: c5d3e39caa45 ("drm/i915: Engine discovery query") Cc: Matt Roper Cc: Tvrtko Ursulin Signed-off-by: Venkata Sandeep Dhanalakota Signed-off-by: Daniele Ceraolo Spurio Reviewed-by: Tvrtko Ursulin Cc: # v5.3+ Signed-off-by: Chris Wilson Link: https://patchwork.freedesktop.org/patch/msgid/20201106011842.36203-1-daniele.ceraolospurio@intel.com --- drivers/gpu/drm/i915/gt/intel_engine_cs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c index b80d7285db2f..9018582e7229 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c @@ -372,7 +372,8 @@ static void __setup_engine_capabilities(struct intel_engine_cs *engine) * instances. 
*/ if ((INTEL_GEN(i915) >= 11 && - engine->gt->info.vdbox_sfc_access & engine->mask) || + (engine->gt->info.vdbox_sfc_access & + BIT(engine->instance))) || (INTEL_GEN(i915) >= 9 && engine->instance == 0)) engine->uabi_capabilities |= I915_VIDEO_AND_ENHANCE_CLASS_CAPABILITY_SFC; -- cgit v1.2.3 From bda3002485a3dcf975b4ecf3fab9dd53f228db6d Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Wed, 4 Nov 2020 13:47:42 +0000 Subject: drm/i915: Improve record of hung engines in error state Between events which trigger engine and GPU resets and capturing the error state we lose information on which engine triggered the reset. Improve this by passing in the hung engine mask down to error capture. Result is that the list of engines in user visible "GPU HANG: ecode ::, " is now a list of hanging and not just active engines. Most importantly the displayed process is now the one which was actually hung. v2: * Stub prototype. (Chris) Signed-off-by: Tvrtko Ursulin Reviewed-by: Chris Wilson Link: https://patchwork.freedesktop.org/patch/msgid/20201104134743.916027-1-tvrtko.ursulin@linux.intel.com --- drivers/gpu/drm/i915/gt/intel_lrc.c | 2 ++ drivers/gpu/drm/i915/gt/intel_reset.c | 2 +- drivers/gpu/drm/i915/i915_debugfs.c | 2 +- drivers/gpu/drm/i915/i915_gpu_error.c | 35 +++++++++++++++++++++++------------ drivers/gpu/drm/i915/i915_gpu_error.h | 10 +++++++--- 5 files changed, 34 insertions(+), 17 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c index f3eb68a76a25..8a51c1c3a091 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.c +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -3037,6 +3037,8 @@ static struct execlists_capture *capture_regs(struct intel_engine_cs *engine) if (!cap->error->gt->engine) goto err_gt; + cap->error->gt->engine->hung = true; + return cap; err_gt: diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c index 4e5e13dc95da..9fb4306b2900 100644 --- a/drivers/gpu/drm/i915/gt/intel_reset.c +++ b/drivers/gpu/drm/i915/gt/intel_reset.c @@ -1251,7 +1251,7 @@ void intel_gt_handle_error(struct intel_gt *gt, engine_mask &= gt->info.engine_mask; if (flags & I915_ERROR_CAPTURE) { - i915_capture_error_state(gt->i915); + i915_capture_error_state(gt, engine_mask); intel_gt_clear_error_registers(gt, engine_mask); } diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index ea469168cd44..a727552d2bc6 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -725,7 +725,7 @@ static int i915_gpu_info_open(struct inode *inode, struct file *file) gpu = NULL; with_intel_runtime_pm(&i915->runtime_pm, wakeref) - gpu = i915_gpu_coredump(i915); + gpu = i915_gpu_coredump(&i915->gt, ALL_ENGINES); if (IS_ERR(gpu)) return PTR_ERR(gpu); diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index ae370909e8dd..994738d974cc 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -570,6 +570,7 @@ static void error_print_engine(struct drm_i915_error_state_buf *m, ee->vm_info.pp_dir_base); } } + err_printf(m, " hung: %u\n", ee->hung); err_printf(m, " engine reset count: %u\n", ee->reset_count); for (n = 0; n < ee->num_ports; n++) { @@ -1456,6 +1457,7 @@ capture_engine(struct intel_engine_cs *engine, static void gt_record_engines(struct intel_gt_coredump *gt, + intel_engine_mask_t engine_mask, struct i915_vma_compress *compress) { struct intel_engine_cs *engine; @@ -1471,6 +1473,8 @@ 
gt_record_engines(struct intel_gt_coredump *gt, if (!ee) continue; + ee->hung = engine->mask & engine_mask; + gt->simulated |= ee->simulated; if (ee->simulated) { kfree(ee); @@ -1663,11 +1667,13 @@ static const char *error_msg(struct i915_gpu_coredump *error) for (gt = error->gt; gt; gt = gt->next) { struct intel_engine_coredump *cs; - if (gt->engine && !first) - first = gt->engine; - - for (cs = gt->engine; cs; cs = cs->next) - engines |= cs->engine->mask; + for (cs = gt->engine; cs; cs = cs->next) { + if (cs->hung) { + engines |= cs->engine->mask; + if (!first) + first = cs; + } + } } len = scnprintf(error->error_msg, sizeof(error->error_msg), @@ -1781,8 +1787,10 @@ void i915_vma_capture_finish(struct intel_gt_coredump *gt, kfree(compress); } -struct i915_gpu_coredump *i915_gpu_coredump(struct drm_i915_private *i915) +struct i915_gpu_coredump * +i915_gpu_coredump(struct intel_gt *gt, intel_engine_mask_t engine_mask) { + struct drm_i915_private *i915 = gt->i915; struct i915_gpu_coredump *error; /* Check if GPU capture has been disabled */ @@ -1794,7 +1802,7 @@ struct i915_gpu_coredump *i915_gpu_coredump(struct drm_i915_private *i915) if (!error) return ERR_PTR(-ENOMEM); - error->gt = intel_gt_coredump_alloc(&i915->gt, ALLOW_FAIL); + error->gt = intel_gt_coredump_alloc(gt, ALLOW_FAIL); if (error->gt) { struct i915_vma_compress *compress; @@ -1806,7 +1814,7 @@ struct i915_gpu_coredump *i915_gpu_coredump(struct drm_i915_private *i915) } gt_record_info(error->gt); - gt_record_engines(error->gt, compress); + gt_record_engines(error->gt, engine_mask, compress); if (INTEL_INFO(i915)->has_gt_uc) error->gt->uc = gt_record_uc(error->gt, compress); @@ -1853,20 +1861,23 @@ void i915_error_state_store(struct i915_gpu_coredump *error) /** * i915_capture_error_state - capture an error record for later analysis - * @i915: i915 device + * @gt: intel_gt which originated the hang + * @engine_mask: hung engines + * * * Should be called when an error is detected (either a hang or an error * interrupt) to capture error state from the time of the error. Fills * out a structure which becomes available in debugfs for user level tools * to pick up. 
*/ -void i915_capture_error_state(struct drm_i915_private *i915) +void i915_capture_error_state(struct intel_gt *gt, + intel_engine_mask_t engine_mask) { struct i915_gpu_coredump *error; - error = i915_gpu_coredump(i915); + error = i915_gpu_coredump(gt, engine_mask); if (IS_ERR(error)) { - cmpxchg(&i915->gpu_error.first_error, NULL, error); + cmpxchg(>->i915->gpu_error.first_error, NULL, error); return; } diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h index 0220b0992808..16bc42de4b84 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.h +++ b/drivers/gpu/drm/i915/i915_gpu_error.h @@ -59,6 +59,7 @@ struct i915_request_coredump { struct intel_engine_coredump { const struct intel_engine_cs *engine; + bool hung; bool simulated; u32 reset_count; @@ -218,8 +219,10 @@ struct drm_i915_error_state_buf { __printf(2, 3) void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...); -struct i915_gpu_coredump *i915_gpu_coredump(struct drm_i915_private *i915); -void i915_capture_error_state(struct drm_i915_private *i915); +struct i915_gpu_coredump *i915_gpu_coredump(struct intel_gt *gt, + intel_engine_mask_t engine_mask); +void i915_capture_error_state(struct intel_gt *gt, + intel_engine_mask_t engine_mask); struct i915_gpu_coredump * i915_gpu_coredump_alloc(struct drm_i915_private *i915, gfp_t gfp); @@ -271,7 +274,8 @@ void i915_disable_error_state(struct drm_i915_private *i915, int err); #else -static inline void i915_capture_error_state(struct drm_i915_private *i915) +static inline void +i915_capture_error_state(struct intel_gt *gt, intel_engine_mask_t engine_mask) { } -- cgit v1.2.3 From 2dae0c852940189af35b65895443f9ab05e1b319 Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Thu, 5 Nov 2020 11:38:42 +0000 Subject: drm/i915: Use ABI engine class in error state ecode Instead of printing out the internal engine mask, which can change between kernel versions making it difficult to map to actual engines, present a bitmask of hanging engines ABI classes. For example: [drm] GPU HANG: ecode 9:8:24dffffd, in gem_exec_schedu [1334] Engine ABI class is useful to quickly categorize render vs media etc hangs in bug reports. Considering virtual engine even more so than the current scheme. v2: * Do not re-order fields. 
(Chris) Signed-off-by: Tvrtko Ursulin Reviewed-by: Chris Wilson Link: https://patchwork.freedesktop.org/patch/msgid/20201105113842.1395391-1-tvrtko.ursulin@linux.intel.com --- drivers/gpu/drm/i915/i915_gpu_error.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 994738d974cc..f38e6abd4579 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -1659,17 +1659,16 @@ static u32 generate_ecode(const struct intel_engine_coredump *ee) static const char *error_msg(struct i915_gpu_coredump *error) { struct intel_engine_coredump *first = NULL; + unsigned int hung_classes = 0; struct intel_gt_coredump *gt; - intel_engine_mask_t engines; int len; - engines = 0; for (gt = error->gt; gt; gt = gt->next) { struct intel_engine_coredump *cs; for (cs = gt->engine; cs; cs = cs->next) { if (cs->hung) { - engines |= cs->engine->mask; + hung_classes |= BIT(cs->engine->uabi_class); if (!first) first = cs; } } } @@ -1678,7 +1677,7 @@ static const char *error_msg(struct i915_gpu_coredump *error) len = scnprintf(error->error_msg, sizeof(error->error_msg), "GPU HANG: ecode %d:%x:%08x", - INTEL_GEN(error->i915), engines, + INTEL_GEN(error->i915), hung_classes, generate_ecode(first)); if (first && first->context.pid) { /* Just show the first executing process, more is confusing */ -- cgit v1.2.3 From 330b7d33056bb0fa7d7f672d4a98495f200fe9d4 Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Mon, 9 Nov 2020 11:12:49 +0000 Subject: drm/i915/region: fix order when adding blocks When performing an allocation we try to split it down into the largest possible power-of-two blocks/page-sizes, and for the common case we expect to allocate the blocks in descending order. This also naturally fits with our GTT alignment tricks (including the hugepages selftest), where we sometimes try to align to the largest possible GTT page-size for the allocation, in the hope that it translates to bigger GTT page-sizes. Currently, we seem to incorrectly add the blocks in the opposite order, which is definitely not the intended behaviour. Reported-by: CQ Tang Signed-off-by: Matthew Auld Cc: Chris Wilson Cc: CQ Tang Reviewed-by: Chris Wilson Signed-off-by: Chris Wilson Link: https://patchwork.freedesktop.org/patch/msgid/20201109111249.109365-1-matthew.auld@intel.com --- drivers/gpu/drm/i915/intel_memory_region.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/intel_memory_region.c b/drivers/gpu/drm/i915/intel_memory_region.c index 180e1078ef7c..b326993a1026 100644 --- a/drivers/gpu/drm/i915/intel_memory_region.c +++ b/drivers/gpu/drm/i915/intel_memory_region.c @@ -114,7 +114,7 @@ __intel_memory_region_get_pages_buddy(struct intel_memory_region *mem, n_pages -= BIT(order); block->private = mem; - list_add(&block->link, blocks); + list_add_tail(&block->link, blocks); if (!n_pages) break; -- cgit v1.2.3 From 695dc55b573985569259e18f8e6261a77924342b Mon Sep 17 00:00:00 2001 From: Rodrigo Vivi Date: Wed, 11 Nov 2020 09:09:36 -0500 Subject: drm/i915/tgl: Fix Media power gate sequence. Some media power gates are disabled by default. commit 5d86923060fc ("drm/i915/tgl: Enable VD HCP/MFX sub-pipe power gating") tried to enable them, but it did so through a duplicate definition of an existing register, so the main PG setup sequence ended up overwriting it. Let's now merge this into the main PG setup sequence.
v2: (Chris): s/BIT/REG_BIT, remove useless comment, remove useless =0, use the right gt, remove rc6 sequence doubt from commit message. Fixes: 5d86923060fc ("drm/i915/tgl: Enable VD HCP/MFX sub-pipe power gating") Cc: Lucas De Marchi Cc: stable@vger.kernel.org#v5.5+ Cc: Dale B Stimson Signed-off-by: Rodrigo Vivi Cc: Chris Wilson Reviewed-by: Chris Wilson Signed-off-by: Chris Wilson Link: https://patchwork.freedesktop.org/patch/msgid/20201111072859.1186070-1-rodrigo.vivi@intel.com --- drivers/gpu/drm/i915/gt/intel_rc6.c | 22 +++++++++++++++++----- drivers/gpu/drm/i915/i915_reg.h | 12 +++++------- drivers/gpu/drm/i915/intel_pm.c | 13 ------------- 3 files changed, 22 insertions(+), 25 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_rc6.c b/drivers/gpu/drm/i915/gt/intel_rc6.c index ab675d35030d..d7b8e4457fc2 100644 --- a/drivers/gpu/drm/i915/gt/intel_rc6.c +++ b/drivers/gpu/drm/i915/gt/intel_rc6.c @@ -56,9 +56,12 @@ static inline void set(struct intel_uncore *uncore, i915_reg_t reg, u32 val) static void gen11_rc6_enable(struct intel_rc6 *rc6) { - struct intel_uncore *uncore = rc6_to_uncore(rc6); + struct intel_gt *gt = rc6_to_gt(rc6); + struct intel_uncore *uncore = gt->uncore; struct intel_engine_cs *engine; enum intel_engine_id id; + u32 pg_enable; + int i; /* 2b: Program RC6 thresholds.*/ set(uncore, GEN6_RC6_WAKE_RATE_LIMIT, 54 << 16 | 85); @@ -102,10 +105,19 @@ static void gen11_rc6_enable(struct intel_rc6 *rc6) GEN6_RC_CTL_RC6_ENABLE | GEN6_RC_CTL_EI_MODE(1); - set(uncore, GEN9_PG_ENABLE, - GEN9_RENDER_PG_ENABLE | - GEN9_MEDIA_PG_ENABLE | - GEN11_MEDIA_SAMPLER_PG_ENABLE); + pg_enable = + GEN9_RENDER_PG_ENABLE | + GEN9_MEDIA_PG_ENABLE | + GEN11_MEDIA_SAMPLER_PG_ENABLE; + + if (INTEL_GEN(gt->i915) >= 12) { + for (i = 0; i < I915_MAX_VCS; i++) + if (HAS_ENGINE(gt, _VCS(i))) + pg_enable |= (VDN_HCP_POWERGATE_ENABLE(i) | + VDN_MFX_POWERGATE_ENABLE(i)); + } + + set(uncore, GEN9_PG_ENABLE, pg_enable); } static void gen9_rc6_enable(struct intel_rc6 *rc6) diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index ac691927e29d..6fb1746e2125 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -8984,10 +8984,6 @@ enum { #define GEN9_PWRGT_MEDIA_STATUS_MASK (1 << 0) #define GEN9_PWRGT_RENDER_STATUS_MASK (1 << 1) -#define POWERGATE_ENABLE _MMIO(0xa210) -#define VDN_HCP_POWERGATE_ENABLE(n) BIT(((n) * 2) + 3) -#define VDN_MFX_POWERGATE_ENABLE(n) BIT(((n) * 2) + 4) - #define GTFIFODBG _MMIO(0x120000) #define GT_FIFO_SBDEDICATE_FREE_ENTRY_CHV (0x1f << 20) #define GT_FIFO_FREE_ENTRIES_CHV (0x7f << 13) @@ -9127,9 +9123,11 @@ enum { #define GEN9_MEDIA_PG_IDLE_HYSTERESIS _MMIO(0xA0C4) #define GEN9_RENDER_PG_IDLE_HYSTERESIS _MMIO(0xA0C8) #define GEN9_PG_ENABLE _MMIO(0xA210) -#define GEN9_RENDER_PG_ENABLE REG_BIT(0) -#define GEN9_MEDIA_PG_ENABLE REG_BIT(1) -#define GEN11_MEDIA_SAMPLER_PG_ENABLE REG_BIT(2) +#define GEN9_RENDER_PG_ENABLE REG_BIT(0) +#define GEN9_MEDIA_PG_ENABLE REG_BIT(1) +#define GEN11_MEDIA_SAMPLER_PG_ENABLE REG_BIT(2) +#define VDN_HCP_POWERGATE_ENABLE(n) REG_BIT(3 + 2 * (n)) +#define VDN_MFX_POWERGATE_ENABLE(n) REG_BIT(4 + 2 * (n)) #define GEN8_PUSHBUS_CONTROL _MMIO(0xA248) #define GEN8_PUSHBUS_ENABLE _MMIO(0xA250) #define GEN8_PUSHBUS_SHIFT _MMIO(0xA25C) diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c index b4bd19266b8c..2000d3991d12 100644 --- a/drivers/gpu/drm/i915/intel_pm.c +++ b/drivers/gpu/drm/i915/intel_pm.c @@ -7118,23 +7118,10 @@ static void icl_init_clock_gating(struct 
drm_i915_private *dev_priv) static void tgl_init_clock_gating(struct drm_i915_private *dev_priv) { - u32 vd_pg_enable = 0; - unsigned int i; - /* Wa_1409120013:tgl */ I915_WRITE(ILK_DPFC_CHICKEN, ILK_DPFC_CHICKEN_COMP_DUMMY_PIXEL); - /* This is not a WA. Enable VD HCP & MFX_ENC powergate */ - for (i = 0; i < I915_MAX_VCS; i++) { - if (HAS_ENGINE(&dev_priv->gt, _VCS(i))) - vd_pg_enable |= VDN_HCP_POWERGATE_ENABLE(i) | - VDN_MFX_POWERGATE_ENABLE(i); - } - - I915_WRITE(POWERGATE_ENABLE, - I915_READ(POWERGATE_ENABLE) | vd_pg_enable); - /* Wa_1409825376:tgl (pre-prod)*/ if (IS_TGL_REVID(dev_priv, TGL_REVID_A0, TGL_REVID_A0)) I915_WRITE(GEN9_CLKGATE_DIS_3, I915_READ(GEN9_CLKGATE_DIS_3) | -- cgit v1.2.3
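As a footnote to the power-gating fix above, the register layout is easy to verify standalone: the HCP and MFX gates for VDBOX n interleave upwards from bit 3. A small sketch using the same bit positions as the VDN_* macros (the engine population is a made-up example):

#include <stdio.h>
#include <stdint.h>

#define BIT(n) (1u << (n))

/* Bit positions as in i915_reg.h after this patch */
#define VDN_HCP_POWERGATE_ENABLE(n) BIT(3 + 2 * (n))
#define VDN_MFX_POWERGATE_ENABLE(n) BIT(4 + 2 * (n))

int main(void)
{
        /* render, media and media-sampler gates occupy bits 0..2 */
        uint32_t pg_enable = BIT(0) | BIT(1) | BIT(2);
        unsigned int vcs_mask = 0x3; /* pretend VCS0 and VCS1 exist */
        unsigned int i;

        for (i = 0; i < 8; i++) {
                if (vcs_mask & BIT(i))
                        pg_enable |= VDN_HCP_POWERGATE_ENABLE(i) |
                                     VDN_MFX_POWERGATE_ENABLE(i);
        }

        /* VCS0 adds bits 3,4 and VCS1 bits 5,6: expect 0x7f */
        printf("GEN9_PG_ENABLE = %#x\n", pg_enable);
        return 0;
}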