From 149c84077fe717af883bae459623ef1cebd86388 Mon Sep 17 00:00:00 2001
From: Daniel Vetter <daniel.vetter@ffwll.ch>
Date: Wed, 15 Feb 2012 23:50:23 +0100
Subject: drm/i915: implement SNB workaround for lazy global gtt

PIPE_CONTROL on snb needs global gtt mappings in place to work around a
hw gotcha. No other commands need such a workaround. Luckily we can
detect PIPE_CONTROL commands easily because they have
write_domain = I915_GEM_DOMAIN_INSTRUCTION (and nothing else has that).

v2: Binding the target of such a reloc into the global gtt actually
works, instead of binding the source, which is rather pointless ...

v3: Kill a superfluous has_global_gtt_mapping assignment noticed by
Chris Wilson.

Reviewed-and-tested-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'drivers/gpu/drm/i915/i915_gem_execbuffer.c')

diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 81687af00893..0e051eca3639 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -273,6 +273,7 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
 {
 	struct drm_device *dev = obj->base.dev;
 	struct drm_gem_object *target_obj;
+	struct drm_i915_gem_object *target_i915_obj;
 	uint32_t target_offset;
 	int ret = -EINVAL;
 
@@ -281,7 +282,8 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
 	if (unlikely(target_obj == NULL))
 		return -ENOENT;
 
-	target_offset = to_intel_bo(target_obj)->gtt_offset;
+	target_i915_obj = to_intel_bo(target_obj);
+	target_offset = target_i915_obj->gtt_offset;
 
 	/* The target buffer should have appeared before us in the
 	 * exec_object list, so it should have a GTT space bound by now.
@@ -383,6 +385,16 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
 		io_mapping_unmap_atomic(reloc_page);
 	}
 
+	/* Sandybridge PPGTT errata: We need a global gtt mapping for MI and
+	 * pipe_control writes because the gpu doesn't properly redirect them
+	 * through the ppgtt for non_secure batchbuffers. */
+	if (unlikely(IS_GEN6(dev) &&
+	    reloc->write_domain == I915_GEM_DOMAIN_INSTRUCTION &&
+	    !target_i915_obj->has_global_gtt_mapping)) {
+		i915_gem_gtt_bind_object(target_i915_obj,
+					 target_i915_obj->cache_level);
+	}
+
 	/* and update the user's relocation entry */
 	reloc->presumed_offset = target_offset;
 
-- 
cgit v1.2.3

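Illustration (not part of the patch): the relocations that take this new path are the ones userspace emits for a PIPE_CONTROL write target. A minimal, hypothetical userspace-side sketch follows; the struct and the I915_GEM_DOMAIN_INSTRUCTION flag come from include/drm/i915_drm.h, while the handle and offsets are placeholders only.

  #include <drm/i915_drm.h>

  /* Hypothetical relocation for the write target of a PIPE_CONTROL.
   * I915_GEM_DOMAIN_INSTRUCTION as the write domain is what the kernel
   * keys the gen6 global-gtt binding off; handle and offsets are
   * placeholders. */
  static struct drm_i915_gem_relocation_entry pipe_control_reloc = {
          .target_handle   = 1,          /* placeholder BO handle */
          .offset          = 0x40,       /* where the address sits in the batch */
          .delta           = 0,
          .presumed_offset = 0,
          .read_domains    = I915_GEM_DOMAIN_INSTRUCTION,
          .write_domain    = I915_GEM_DOMAIN_INSTRUCTION,
  };
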
From 1d83f4426fa0555c98c989915be6df01a8125aca Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Sat, 24 Mar 2012 20:12:53 +0000
Subject: drm/i915: Batch copy_from_user for relocation processing

Originally the code tried to allocate a large enough array to perform
the copy using vmalloc; performance wasn't great, and throughput was
improved by processing each individual relocation entry separately.
This too is not as efficient as one would desire. A compromise would be
to allocate a single page, or to allocate a few entries on the stack,
and process the copy in batches. The latter gives simpler code and more
consistent performance due to the lack of a heuristic.

x11perf -copywinwin10:

              n450/pnv    i3-330m    i5-2520m (cpu)
  before:       249000     785000     1280000 (80%)
  page:         264000     896000     1280000 (65%)
  on-stack:     264000     902000     1280000 (67%)

v2: Use 512 bytes of stack for batching rather than allocating a page.

v3: Tidy the code slightly with more descriptive variable names.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 42 +++++++++++++++++++++---------
 1 file changed, 29 insertions(+), 13 deletions(-)

(limited to 'drivers/gpu/drm/i915/i915_gem_execbuffer.c')

diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 0e051eca3639..1fa01313d89f 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -405,30 +405,46 @@ static int
 i915_gem_execbuffer_relocate_object(struct drm_i915_gem_object *obj,
 				    struct eb_objects *eb)
 {
+#define N_RELOC(x) ((x) / sizeof(struct drm_i915_gem_relocation_entry))
+	struct drm_i915_gem_relocation_entry stack_reloc[N_RELOC(512)];
 	struct drm_i915_gem_relocation_entry __user *user_relocs;
 	struct drm_i915_gem_exec_object2 *entry = obj->exec_entry;
-	int i, ret;
+	int remain, ret;
 
 	user_relocs = (void __user *)(uintptr_t)entry->relocs_ptr;
-	for (i = 0; i < entry->relocation_count; i++) {
-		struct drm_i915_gem_relocation_entry reloc;
 
-		if (__copy_from_user_inatomic(&reloc,
-					      user_relocs+i,
-					      sizeof(reloc)))
+	remain = entry->relocation_count;
+	while (remain) {
+		struct drm_i915_gem_relocation_entry *r = stack_reloc;
+		int count = remain;
+		if (count > ARRAY_SIZE(stack_reloc))
+			count = ARRAY_SIZE(stack_reloc);
+		remain -= count;
+
+		if (__copy_from_user_inatomic(r, user_relocs, count*sizeof(r[0])))
 			return -EFAULT;
 
-		ret = i915_gem_execbuffer_relocate_entry(obj, eb, &reloc);
-		if (ret)
-			return ret;
+		do {
+			u64 offset = r->presumed_offset;
 
-		if (__copy_to_user_inatomic(&user_relocs[i].presumed_offset,
-					    &reloc.presumed_offset,
-					    sizeof(reloc.presumed_offset)))
-			return -EFAULT;
+			ret = i915_gem_execbuffer_relocate_entry(obj, eb, r);
+			if (ret)
+				return ret;
+
+			if (r->presumed_offset != offset &&
+			    __copy_to_user_inatomic(&user_relocs->presumed_offset,
+						    &r->presumed_offset,
+						    sizeof(r->presumed_offset))) {
+				return -EFAULT;
+			}
+
+			user_relocs++;
+			r++;
+		} while (--count);
 	}
 
 	return 0;
+#undef N_RELOC
 }
 
 static int
-- 
cgit v1.2.3

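Illustration (not part of the patch): the batching pattern itself is independent of the i915 details. Below is a standalone sketch of the same chunking logic, with plain memcpy() standing in for __copy_from_user_inatomic() and all names invented for the example.

  #include <string.h>

  #define STACK_BATCH_BYTES 512

  struct reloc {
          unsigned long presumed_offset;
          /* other fields elided */
  };

  /* Stage the entries through a fixed 512-byte on-stack buffer instead of
   * copying one element (or one large vmalloc'ed array) at a time. */
  static int process_relocs(const struct reloc *src, int count,
                            int (*apply)(struct reloc *r))
  {
          struct reloc batch[STACK_BATCH_BYTES / sizeof(struct reloc)];
          int n, i, ret;

          while (count) {
                  n = count;
                  if (n > (int)(sizeof(batch) / sizeof(batch[0])))
                          n = sizeof(batch) / sizeof(batch[0]);
                  count -= n;

                  memcpy(batch, src, n * sizeof(batch[0]));

                  for (i = 0; i < n; i++) {
                          ret = apply(&batch[i]);
                          if (ret)
                                  return ret;
                  }
                  src += n;
          }
          return 0;
  }

The trade-off mirrors the numbers above: a 512-byte stack buffer keeps the fast path allocation-free while amortising the user-copy overhead across several entries.
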
From dabdfe021ab1e985e6566009c774fb03f14b568e Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Mon, 26 Mar 2012 10:10:27 +0200
Subject: drm/i915: Avoid using mappable space for relocation processing through the CPU

We try to avoid writing the relocations through the uncached GTT if the
buffer is currently in the CPU write domain and so will be flushed out
to main memory afterwards anyway. Also on SandyBridge we can safely
write to the pages in cacheable memory, so long as the buffer is LLC
mapped. In either of these cases, we therefore do not need to force the
reallocation of the buffer into the mappable region of the GTT,
reducing the aperture pressure.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/i915/i915_drv.h            |  2 ++
 drivers/gpu/drm/i915/i915_gem.c            |  4 +---
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 36 +++++++++++++++++++++---------
 3 files changed, 28 insertions(+), 14 deletions(-)

(limited to 'drivers/gpu/drm/i915/i915_gem_execbuffer.c')

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index d680f09e872a..93e06a3225cc 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1266,6 +1266,8 @@ int __must_check
 i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj, bool write);
 int __must_check
+i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write);
+int __must_check
 i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
 				     u32 alignment,
 				     struct intel_ring_buffer *pipelined);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 90e3fc18b8b5..09c033e5e02b 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -39,8 +39,6 @@ static __must_check int
 i915_gem_object_flush_gpu_write_domain(struct drm_i915_gem_object *obj);
 static void i915_gem_object_flush_gtt_write_domain(struct drm_i915_gem_object *obj);
 static void i915_gem_object_flush_cpu_write_domain(struct drm_i915_gem_object *obj);
-static __must_check int i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj,
-							   bool write);
 static __must_check int
 i915_gem_object_set_cpu_read_domain_range(struct drm_i915_gem_object *obj,
 					   uint64_t offset, uint64_t size);
@@ -3073,7 +3071,7 @@ i915_gem_object_finish_gpu(struct drm_i915_gem_object *obj)
  * This function returns when the move is complete, including waiting on
  * flushes to occur.
  */
-static int
+int
 i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
 {
 	uint32_t old_write_domain, old_read_domains;
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 1fa01313d89f..eb85860001ec 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -266,6 +266,12 @@ eb_destroy(struct eb_objects *eb)
 	kfree(eb);
 }
 
+static inline int use_cpu_reloc(struct drm_i915_gem_object *obj)
+{
+	return (obj->base.write_domain == I915_GEM_DOMAIN_CPU ||
+		obj->cache_level != I915_CACHE_NONE);
+}
+
 static int
 i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
 				   struct eb_objects *eb,
@@ -354,11 +360,19 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
 		return ret;
 	}
 
+	/* We can't wait for rendering with pagefaults disabled */
+	if (obj->active && in_atomic())
+		return -EFAULT;
+
 	reloc->delta += target_offset;
-	if (obj->base.write_domain == I915_GEM_DOMAIN_CPU) {
+	if (use_cpu_reloc(obj)) {
 		uint32_t page_offset = reloc->offset & ~PAGE_MASK;
 		char *vaddr;
 
+		ret = i915_gem_object_set_to_cpu_domain(obj, 1);
+		if (ret)
+			return ret;
+
 		vaddr = kmap_atomic(obj->pages[reloc->offset >> PAGE_SHIFT]);
 		*(uint32_t *)(vaddr + page_offset) = reloc->delta;
 		kunmap_atomic(vaddr);
@@ -367,10 +381,6 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
 		uint32_t __iomem *reloc_entry;
 		void __iomem *reloc_page;
 
-		/* We can't wait for rendering with pagefaults disabled */
-		if (obj->active && in_atomic())
-			return -EFAULT;
-
 		ret = i915_gem_object_set_to_gtt_domain(obj, 1);
 		if (ret)
 			return ret;
@@ -492,6 +502,13 @@ i915_gem_execbuffer_relocate(struct drm_device *dev,
 
 #define __EXEC_OBJECT_HAS_FENCE (1<<31)
 
+static int
+need_reloc_mappable(struct drm_i915_gem_object *obj)
+{
+	struct drm_i915_gem_exec_object2 *entry = obj->exec_entry;
+	return entry->relocation_count && !use_cpu_reloc(obj);
+}
+
 static int
 pin_and_fence_object(struct drm_i915_gem_object *obj,
 		     struct intel_ring_buffer *ring)
@@ -505,8 +522,7 @@ pin_and_fence_object(struct drm_i915_gem_object *obj,
 		has_fenced_gpu_access &&
 		entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
 		obj->tiling_mode != I915_TILING_NONE;
-	need_mappable =
-		entry->relocation_count ? true : need_fence;
+	need_mappable = need_fence || need_reloc_mappable(obj);
 
 	ret = i915_gem_object_pin(obj, entry->alignment, need_mappable);
 	if (ret)
@@ -563,8 +579,7 @@ i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring,
 			has_fenced_gpu_access &&
 			entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
 			obj->tiling_mode != I915_TILING_NONE;
-		need_mappable =
-			entry->relocation_count ? true : need_fence;
+		need_mappable = need_fence || need_reloc_mappable(obj);
 
 		if (need_mappable)
 			list_move(&obj->exec_list, &ordered_objects);
@@ -604,8 +619,7 @@ i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring,
 			has_fenced_gpu_access &&
 			entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
 			obj->tiling_mode != I915_TILING_NONE;
-		need_mappable =
-			entry->relocation_count ? true : need_fence;
+		need_mappable = need_fence || need_reloc_mappable(obj);
 
 		if ((entry->alignment && obj->gtt_offset & (entry->alignment - 1)) ||
 		    (need_mappable && !obj->map_and_fenceable))
-- 
cgit v1.2.3

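Illustration (not part of the patch): the pinning decision above can be read as a small predicate. The sketch below paraphrases use_cpu_reloc()/need_reloc_mappable() with stand-in parameters instead of the real GEM object state; the names and enum are invented for the example.

  #include <stdbool.h>

  enum cache_level { CACHE_NONE, CACHE_LLC };   /* stand-in for obj->cache_level */

  /* Relocations can be written through the CPU if the pages will be
   * flushed anyway (CPU write domain) or are coherent with the GPU
   * (LLC cached). */
  static bool can_write_reloc_via_cpu(bool in_cpu_write_domain,
                                      enum cache_level level)
  {
          return in_cpu_write_domain || level != CACHE_NONE;
  }

  /* Only relocations written through the GTT force the buffer into the
   * mappable aperture; fence requirements are evaluated separately. */
  static bool relocs_need_mappable_gtt(int relocation_count,
                                       bool in_cpu_write_domain,
                                       enum cache_level level)
  {
          return relocation_count &&
                 !can_write_reloc_via_cpu(in_cpu_write_domain, level);
  }
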
From f56f821feb7b36223f309e0ec05986bb137ce418 Mon Sep 17 00:00:00 2001
From: Daniel Vetter <daniel.vetter@ffwll.ch>
Date: Sun, 25 Mar 2012 19:47:41 +0200
Subject: mm: extend prefault helpers to fault in more than PAGE_SIZE

drm/i915 wants to read/write more than one page in its fastpath and
hence needs to prefault more than PAGE_SIZE bytes. Add new functions in
filemap.h to make that possible. Also kill a copy&pasted spurious space
in both functions while at it.

v2: As suggested by Andrew Morton, add a multipage parameter to both
functions to avoid the additional branch for the pagemap.c hotpath. My
gcc 4.6 here seems to dtrt and indeed reap these branches where not
needed.

v3: Because I couldn't find a way around adding a uaddr += PAGE_SIZE to
the filemap.c hotpaths (that the compiler couldn't remove again), let's
go with separate new functions for the multipage use-case.

v4: Adjust comment to CodingStyle and fix spelling.

Acked-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/i915/i915_gem.c            |  6 +--
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |  2 +-
 include/linux/pagemap.h                    | 64 +++++++++++++++++++++++++++++-
 3 files changed, 66 insertions(+), 6 deletions(-)

(limited to 'drivers/gpu/drm/i915/i915_gem_execbuffer.c')

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index e9cac478cced..6dc832902f53 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -416,7 +416,7 @@ i915_gem_shmem_pread(struct drm_device *dev,
 		mutex_unlock(&dev->struct_mutex);
 
 		if (!prefaulted) {
-			ret = fault_in_pages_writeable(user_data, remain);
+			ret = fault_in_multipages_writeable(user_data, remain);
 			/* Userspace is tricking us, but we've already clobbered
 			 * its pages with the prefault and promised to write the
 			 * data up to the first fault. Hence ignore any errors
@@ -809,8 +809,8 @@ i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
 		       args->size))
 		return -EFAULT;
 
-	ret = fault_in_pages_readable((char __user *)(uintptr_t)args->data_ptr,
-				      args->size);
+	ret = fault_in_multipages_readable((char __user *)(uintptr_t)args->data_ptr,
+					   args->size);
 	if (ret)
 		return -EFAULT;
 
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index eb85860001ec..8e0b686d3afb 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -997,7 +997,7 @@ validate_exec_list(struct drm_i915_gem_exec_object2 *exec,
 		if (!access_ok(VERIFY_WRITE, ptr, length))
 			return -EFAULT;
 
-		if (fault_in_pages_readable(ptr, length))
+		if (fault_in_multipages_readable(ptr, length))
 			return -EFAULT;
 	}
 
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index cfaaa6949b8b..c93a9a9bcd35 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -426,7 +426,7 @@ static inline int fault_in_pages_writeable(char __user *uaddr, int size)
 		 */
 		if (((unsigned long)uaddr & PAGE_MASK) !=
 				((unsigned long)end & PAGE_MASK))
-		 	ret = __put_user(0, end);
+			ret = __put_user(0, end);
 	}
 	return ret;
 }
@@ -445,13 +445,73 @@ static inline int fault_in_pages_readable(const char __user *uaddr, int size)
 
 		if (((unsigned long)uaddr & PAGE_MASK) !=
 				((unsigned long)end & PAGE_MASK)) {
-		 	ret = __get_user(c, end);
+			ret = __get_user(c, end);
 			(void)c;
 		}
 	}
 	return ret;
 }
 
+/*
+ * Multipage variants of the above prefault helpers, useful if more than
+ * PAGE_SIZE of data needs to be prefaulted. These are separate from the above
+ * functions (which only handle up to PAGE_SIZE) to avoid clobbering the
+ * filemap.c hotpaths.
+ */
+static inline int fault_in_multipages_writeable(char __user *uaddr, int size)
+{
+	int ret;
+	const char __user *end = uaddr + size - 1;
+
+	if (unlikely(size == 0))
+		return 0;
+
+	/*
+	 * Writing zeroes into userspace here is OK, because we know that if
+	 * the zero gets there, we'll be overwriting it.
+	 */
+	while (uaddr <= end) {
+		ret = __put_user(0, uaddr);
+		if (ret != 0)
+			return ret;
+		uaddr += PAGE_SIZE;
+	}
+
+	/* Check whether the range spilled into the next page. */
+	if (((unsigned long)uaddr & PAGE_MASK) ==
+			((unsigned long)end & PAGE_MASK))
+		ret = __put_user(0, end);
+
+	return ret;
+}
+
+static inline int fault_in_multipages_readable(const char __user *uaddr,
+					       int size)
+{
+	volatile char c;
+	int ret;
+	const char __user *end = uaddr + size - 1;
+
+	if (unlikely(size == 0))
+		return 0;
+
+	while (uaddr <= end) {
+		ret = __get_user(c, uaddr);
+		if (ret != 0)
+			return ret;
+		uaddr += PAGE_SIZE;
+	}
+
+	/* Check whether the range spilled into the next page. */
+	if (((unsigned long)uaddr & PAGE_MASK) ==
+			((unsigned long)end & PAGE_MASK)) {
+		ret = __get_user(c, end);
+		(void)c;
+	}
+
+	return ret;
+}
+
 int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
 				pgoff_t index, gfp_t gfp_mask);
 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
-- 
cgit v1.2.3
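
Illustration (not part of the patch): the intended calling pattern for the new helpers, roughly as the i915 callers above use them. This is a condensed, hypothetical kernel-side sketch with error handling trimmed; the function name is invented.

  #include <linux/pagemap.h>
  #include <linux/uaccess.h>

  /* Prefault the whole (possibly multi-page) user range up front, then do
   * the copy in a context where pagefaults are disabled; if the pages get
   * reclaimed in between, the caller falls back to a slow path. */
  static int copy_in_fast(void *dst, const char __user *src, int len)
  {
          if (fault_in_multipages_readable(src, len))
                  return -EFAULT;

          /* e.g. inside a kmap_atomic section or under a spinlock */
          if (__copy_from_user_inatomic(dst, src, len))
                  return -EFAULT;     /* slow path would retry with faults enabled */

          return 0;
  }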