From 3fc21924100b13f73c734d0ce8dfcfe913fcf7a8 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 24 Feb 2017 14:55:48 -0800 Subject: mm: validate device_hotplug is held for memory hotplug mem_hotplug_begin() assumes that it can set mem_hotplug.active_writer and run the hotplug process without racing another thread. Validate this assumption with a lockdep assertion. Link: http://lkml.kernel.org/r/148693886229.16345.1770484669403334689.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Reported-by: Ben Hutchings Cc: Michal Hocko Cc: Toshi Kani Cc: Vlastimil Babka Cc: Logan Gunthorpe Cc: Greg Kroah-Hartman Cc: Masayoshi Mizuma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/device.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/device.h b/include/linux/device.h index bd684fc8ec1d..a48a7ff70164 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1139,6 +1139,7 @@ static inline bool device_supports_offline(struct device *dev) extern void lock_device_hotplug(void); extern void unlock_device_hotplug(void); extern int lock_device_hotplug_sysfs(void); +void assert_held_device_hotplug(void); extern int device_offline(struct device *dev); extern int device_online(struct device *dev); extern void set_primary_fwnode(struct device *dev, struct fwnode_handle *fwnode); -- cgit v1.2.3 From 0262d9c845ec349edf93f69688a5129c36cc2232 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 24 Feb 2017 14:55:59 -0800 Subject: memblock: embed memblock type name within struct memblock_type Provide the name of each memblock type with struct memblock_type. This allows to get rid of the function memblock_type_name() and duplicating the type names in __memblock_dump_all(). The only memblock_type usage out of mm/memblock.c seems to be arch/s390/kernel/crash_dump.c. While at it, give it a name. Link: http://lkml.kernel.org/r/20170120123456.46508-4-heiko.carstens@de.ibm.com Signed-off-by: Heiko Carstens Cc: Philipp Hachtmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/kernel/crash_dump.c | 1 + include/linux/memblock.h | 1 + mm/memblock.c | 35 +++++++++++------------------------ 3 files changed, 13 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c index e2293c662bdf..dd1d5c62c374 100644 --- a/arch/s390/kernel/crash_dump.c +++ b/arch/s390/kernel/crash_dump.c @@ -32,6 +32,7 @@ static struct memblock_type oldmem_type = { .max = 1, .total_size = 0, .regions = &oldmem_region, + .name = "oldmem", }; struct save_area { diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 38bcf00cbed3..bdfc65af4152 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -42,6 +42,7 @@ struct memblock_type { unsigned long max; /* size of the allocated array */ phys_addr_t total_size; /* size of all regions */ struct memblock_region *regions; + char *name; }; struct memblock { diff --git a/mm/memblock.c b/mm/memblock.c index 647d79ed07a3..b64b47803e52 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -35,15 +35,18 @@ struct memblock memblock __initdata_memblock = { .memory.regions = memblock_memory_init_regions, .memory.cnt = 1, /* empty dummy entry */ .memory.max = INIT_MEMBLOCK_REGIONS, + .memory.name = "memory", .reserved.regions = memblock_reserved_init_regions, .reserved.cnt = 1, /* empty dummy entry */ .reserved.max = INIT_MEMBLOCK_REGIONS, + .reserved.name = "reserved", #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP .physmem.regions = memblock_physmem_init_regions, .physmem.cnt = 1, /* empty dummy entry */ .physmem.max = INIT_PHYSMEM_REGIONS, + .physmem.name = "physmem", #endif .bottom_up = false, @@ -64,22 +67,6 @@ ulong __init_memblock choose_memblock_flags(void) return system_has_some_mirror ? MEMBLOCK_MIRROR : MEMBLOCK_NONE; } -/* inline so we don't get a warning when pr_debug is compiled out */ -static __init_memblock const char * -memblock_type_name(struct memblock_type *type) -{ - if (type == &memblock.memory) - return "memory"; - else if (type == &memblock.reserved) - return "reserved"; -#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP - else if (type == &memblock.physmem) - return "physmem"; -#endif - else - return "unknown"; -} - /* adjust *@size so that (@base + *@size) doesn't overflow, return new size */ static inline phys_addr_t memblock_cap_size(phys_addr_t base, phys_addr_t *size) { @@ -406,12 +393,12 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, } if (!addr) { pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n", - memblock_type_name(type), type->max, type->max * 2); + type->name, type->max, type->max * 2); return -1; } memblock_dbg("memblock: %s is doubled to %ld at [%#010llx-%#010llx]", - memblock_type_name(type), type->max * 2, (u64)addr, + type->name, type->max * 2, (u64)addr, (u64)addr + new_size - 1); /* @@ -1697,14 +1684,14 @@ phys_addr_t __init_memblock memblock_get_current_limit(void) return memblock.current_limit; } -static void __init_memblock memblock_dump(struct memblock_type *type, char *name) +static void __init_memblock memblock_dump(struct memblock_type *type) { phys_addr_t base, end, size; unsigned long flags; int idx; struct memblock_region *rgn; - pr_info(" %s.cnt = 0x%lx\n", name, type->cnt); + pr_info(" %s.cnt = 0x%lx\n", type->name, type->cnt); for_each_memblock_type(type, rgn) { char nid_buf[32] = ""; @@ -1719,7 +1706,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type, char *name memblock_get_region_node(rgn)); #endif pr_info(" %s[%#x]\t[%pa-%pa], %pa bytes%s flags: %#lx\n", - name, idx, &base, &end, &size, nid_buf, flags); + type->name, idx, &base, &end, &size, nid_buf, flags); } } @@ -1730,10 +1717,10 @@ void __init_memblock __memblock_dump_all(void) &memblock.memory.total_size, &memblock.reserved.total_size); - memblock_dump(&memblock.memory, "memory"); - memblock_dump(&memblock.reserved, "reserved"); + memblock_dump(&memblock.memory); + memblock_dump(&memblock.reserved); #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP - memblock_dump(&memblock.physmem, "physmem"); + memblock_dump(&memblock.physmem); #endif } -- cgit v1.2.3 From d811914d87576c562e849c00d9f9beff45038801 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Fri, 24 Feb 2017 14:56:02 -0800 Subject: userfaultfd: non-cooperative: rename *EVENT_MADVDONTNEED to *EVENT_REMOVE Patch series "userfaultfd: non-cooperative: add madvise() event for MADV_REMOVE request". These patches add notification of madvise(MADV_REMOVE) event to non-cooperative userfaultfd monitor. The first pacth renames EVENT_MADVDONTNEED to EVENT_REMOVE along with relevant functions and structures. Using _REMOVE instead of _MADVDONTNEED describes the event semantics more clearly and I hope it's not too late for such change in the ABI. This patch (of 3): The UFFD_EVENT_MADVDONTNEED purpose is to notify uffd monitor about removal of certain range from address space tracked by userfaultfd. Hence, UFFD_EVENT_REMOVE seems to better reflect the operation semantics. Respectively, 'madv_dn' field of uffd_msg is renamed to 'remove' and the madvise_userfault_dontneed callback is renamed to userfaultfd_remove. Link: http://lkml.kernel.org/r/1484814154-1557-2-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Reviewed-by: Andrea Arcangeli Acked-by: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 14 +++++++------- include/linux/userfaultfd_k.h | 16 ++++++++-------- include/uapi/linux/userfaultfd.h | 8 ++++---- mm/madvise.c | 2 +- tools/testing/selftests/vm/userfaultfd.c | 16 ++++++++-------- 5 files changed, 28 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 18406158e13f..8fe601b4875e 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -681,16 +681,16 @@ void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx, userfaultfd_event_wait_completion(ctx, &ewq); } -void madvise_userfault_dontneed(struct vm_area_struct *vma, - struct vm_area_struct **prev, - unsigned long start, unsigned long end) +void userfaultfd_remove(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end) { struct mm_struct *mm = vma->vm_mm; struct userfaultfd_ctx *ctx; struct userfaultfd_wait_queue ewq; ctx = vma->vm_userfaultfd_ctx.ctx; - if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_MADVDONTNEED)) + if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_REMOVE)) return; userfaultfd_ctx_get(ctx); @@ -700,9 +700,9 @@ void madvise_userfault_dontneed(struct vm_area_struct *vma, msg_init(&ewq.msg); - ewq.msg.event = UFFD_EVENT_MADVDONTNEED; - ewq.msg.arg.madv_dn.start = start; - ewq.msg.arg.madv_dn.end = end; + ewq.msg.event = UFFD_EVENT_REMOVE; + ewq.msg.arg.remove.start = start; + ewq.msg.arg.remove.end = end; userfaultfd_event_wait_completion(ctx, &ewq); diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index f431861f22f1..2521542f6c07 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -61,10 +61,10 @@ extern void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *, unsigned long from, unsigned long to, unsigned long len); -extern void madvise_userfault_dontneed(struct vm_area_struct *vma, - struct vm_area_struct **prev, - unsigned long start, - unsigned long end); +extern void userfaultfd_remove(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, + unsigned long end); #else /* CONFIG_USERFAULTFD */ @@ -112,10 +112,10 @@ static inline void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *ctx, { } -static inline void madvise_userfault_dontneed(struct vm_area_struct *vma, - struct vm_area_struct **prev, - unsigned long start, - unsigned long end) +static inline void userfaultfd_remove(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, + unsigned long end) { } #endif /* CONFIG_USERFAULTFD */ diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 9ac4b68c54d1..b742c40c2880 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -20,7 +20,7 @@ #define UFFD_API ((__u64)0xAA) #define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK | \ UFFD_FEATURE_EVENT_REMAP | \ - UFFD_FEATURE_EVENT_MADVDONTNEED | \ + UFFD_FEATURE_EVENT_REMOVE | \ UFFD_FEATURE_MISSING_HUGETLBFS | \ UFFD_FEATURE_MISSING_SHMEM) #define UFFD_API_IOCTLS \ @@ -92,7 +92,7 @@ struct uffd_msg { struct { __u64 start; __u64 end; - } madv_dn; + } remove; struct { /* unused reserved fields */ @@ -109,7 +109,7 @@ struct uffd_msg { #define UFFD_EVENT_PAGEFAULT 0x12 #define UFFD_EVENT_FORK 0x13 #define UFFD_EVENT_REMAP 0x14 -#define UFFD_EVENT_MADVDONTNEED 0x15 +#define UFFD_EVENT_REMOVE 0x15 /* flags for UFFD_EVENT_PAGEFAULT */ #define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ @@ -155,7 +155,7 @@ struct uffdio_api { #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) #define UFFD_FEATURE_EVENT_REMAP (1<<2) -#define UFFD_FEATURE_EVENT_MADVDONTNEED (1<<3) +#define UFFD_FEATURE_EVENT_REMOVE (1<<3) #define UFFD_FEATURE_MISSING_HUGETLBFS (1<<4) #define UFFD_FEATURE_MISSING_SHMEM (1<<5) __u64 features; diff --git a/mm/madvise.c b/mm/madvise.c index b530a4986035..ab5ef141cc9b 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -479,7 +479,7 @@ static long madvise_dontneed(struct vm_area_struct *vma, if (!can_madv_dontneed_vma(vma)) return -EINVAL; - madvise_userfault_dontneed(vma, prev, start, end); + userfaultfd_remove(vma, prev, start, end); zap_page_range(vma, start, end - start); return 0; } diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 5a840a605a16..9eb77df568f7 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -398,12 +398,12 @@ static void *uffd_poll_thread(void *arg) uffd = msg.arg.fork.ufd; pollfd[0].fd = uffd; break; - case UFFD_EVENT_MADVDONTNEED: - uffd_reg.range.start = msg.arg.madv_dn.start; - uffd_reg.range.len = msg.arg.madv_dn.end - - msg.arg.madv_dn.start; + case UFFD_EVENT_REMOVE: + uffd_reg.range.start = msg.arg.remove.start; + uffd_reg.range.len = msg.arg.remove.end - + msg.arg.remove.start; if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range)) - fprintf(stderr, "madv_dn failure\n"), exit(1); + fprintf(stderr, "remove failure\n"), exit(1); break; case UFFD_EVENT_REMAP: area_dst = (char *)(unsigned long)msg.arg.remap.to; @@ -570,7 +570,7 @@ static int userfaultfd_open(int features) * mremap, the entire monitored area is accessed in a single pass for * HUGETLB_TEST. * The release of the pages currently generates event only for - * anonymous memory (UFFD_EVENT_MADVDONTNEED), hence it is not checked + * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked * for hugetlb and shmem. */ static int faulting_process(void) @@ -715,14 +715,14 @@ static int userfaultfd_events_test(void) pid_t pid; char c; - printf("testing events (fork, remap, madv_dn): "); + printf("testing events (fork, remap, remove): "); fflush(stdout); if (release_pages(area_dst)) return 1; features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP | - UFFD_FEATURE_EVENT_MADVDONTNEED; + UFFD_FEATURE_EVENT_REMOVE; if (userfaultfd_open(features) < 0) return 1; fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); -- cgit v1.2.3 From 1276ad68e2491d1ceeb65f55d790f9277593c459 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 24 Feb 2017 14:56:11 -0800 Subject: mm: vmscan: scan dirty pages even in laptop mode Patch series "mm: vmscan: fix kswapd writeback regression". We noticed a regression on multiple hadoop workloads when moving from 3.10 to 4.0 and 4.6, which involves kswapd getting tangled up in page writeout, causing direct reclaim herds that also don't make progress. I tracked it down to the thrash avoidance efforts after 3.10 that make the kernel better at keeping use-once cache and use-many cache sorted on the inactive and active list, with more aggressive protection of the active list as long as there is inactive cache. Unfortunately, our workload's use-once cache is mostly from streaming writes. Waiting for writes to avoid potential reloads in the future is not a good tradeoff. These patches do the following: 1. Wake the flushers when kswapd sees a lump of dirty pages. It's possible to be below the dirty background limit and still have cache velocity push them through the LRU. So start a-flushin'. 2. Let kswapd only write pages that have been rotated twice. This makes sure we really tried to get all the clean pages on the inactive list before resorting to horrible LRU-order writeback. 3. Move rotating dirty pages off the inactive list. Instead of churning or waiting on page writeback, we'll go after clean active cache. This might lead to thrashing, but in this state memory demand outstrips IO speed anyway, and reads are faster than writes. Mel backported the series to 4.10-rc5 with one minor conflict and ran a couple of tests on it. Mix of read/write random workload didn't show anything interesting. Write-only database didn't show much difference in performance but there were slight reductions in IO -- probably in the noise. simoop did show big differences although not as big as Mel expected. This is Chris Mason's workload that similate the VM activity of hadoop. Mel won't go through the full details but over the samples measured during an hour it reported 4.10.0-rc5 4.10.0-rc5 vanilla johannes-v1r1 Amean p50-Read 21346531.56 ( 0.00%) 21697513.24 ( -1.64%) Amean p95-Read 24700518.40 ( 0.00%) 25743268.98 ( -4.22%) Amean p99-Read 27959842.13 ( 0.00%) 28963271.11 ( -3.59%) Amean p50-Write 1138.04 ( 0.00%) 989.82 ( 13.02%) Amean p95-Write 1106643.48 ( 0.00%) 12104.00 ( 98.91%) Amean p99-Write 1569213.22 ( 0.00%) 36343.38 ( 97.68%) Amean p50-Allocation 85159.82 ( 0.00%) 79120.70 ( 7.09%) Amean p95-Allocation 204222.58 ( 0.00%) 129018.43 ( 36.82%) Amean p99-Allocation 278070.04 ( 0.00%) 183354.43 ( 34.06%) Amean final-p50-Read 21266432.00 ( 0.00%) 21921792.00 ( -3.08%) Amean final-p95-Read 24870912.00 ( 0.00%) 26116096.00 ( -5.01%) Amean final-p99-Read 28147712.00 ( 0.00%) 29523968.00 ( -4.89%) Amean final-p50-Write 1130.00 ( 0.00%) 977.00 ( 13.54%) Amean final-p95-Write 1033216.00 ( 0.00%) 2980.00 ( 99.71%) Amean final-p99-Write 1517568.00 ( 0.00%) 32672.00 ( 97.85%) Amean final-p50-Allocation 86656.00 ( 0.00%) 78464.00 ( 9.45%) Amean final-p95-Allocation 211712.00 ( 0.00%) 116608.00 ( 44.92%) Amean final-p99-Allocation 287232.00 ( 0.00%) 168704.00 ( 41.27%) The latencies are actually completely horrific in comparison to 4.4 (and 4.10-rc5 is worse than 4.9 according to historical data for reasons Mel hasn't analysed yet). Still, 95% of write latency (p95-write) is halved by the series and allocation latency is way down. Direct reclaim activity is one fifth of what it was according to vmstats. Kswapd activity is higher but this is not necessarily surprising. Kswapd efficiency is unchanged at 99% (99% of pages scanned were reclaimed) but direct reclaim efficiency went from 77% to 99% In the vanilla kernel, 627MB of data was written back from reclaim context. With the series, no data was written back. With or without the patch, pages are being immediately reclaimed after writeback completes. However, with the patch, only 1/8th of the pages are reclaimed like this. This patch (of 5): We have an elaborate dirty/writeback throttling mechanism inside the reclaim scanner, but for that to work the pages have to go through shrink_page_list() and get counted for what they are. Otherwise, we mess up the LRU order and don't match reclaim speed to writeback. Especially during deactivation, there is never a reason to skip dirty pages; nothing is even trying to write them out from there. Don't mess up the LRU order for nothing, shuffle these pages along. Link: http://lkml.kernel.org/r/20170123181641.23938-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Minchan Kim Acked-by: Michal Hocko Acked-by: Mel Gorman Acked-by: Hillf Danton Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 -- mm/vmscan.c | 14 ++------------ 2 files changed, 2 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 82fc632fd11d..8e02b3750fe0 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -236,8 +236,6 @@ struct lruvec { #define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON)) #define LRU_ALL ((1 << NR_LRU_LISTS) - 1) -/* Isolate clean file */ -#define ISOLATE_CLEAN ((__force isolate_mode_t)0x1) /* Isolate unmapped file */ #define ISOLATE_UNMAPPED ((__force isolate_mode_t)0x2) /* Isolate for asynchronous migration */ diff --git a/mm/vmscan.c b/mm/vmscan.c index 7bb23ff229b6..0d05f7f3b532 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -87,6 +87,7 @@ struct scan_control { /* The highest zone to isolate pages for reclaim from */ enum zone_type reclaim_idx; + /* Writepage batching in laptop mode; RECLAIM_WRITE */ unsigned int may_writepage:1; /* Can mapped pages be reclaimed? */ @@ -1373,13 +1374,10 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode) * wants to isolate pages it will be able to operate on without * blocking - clean pages for the most part. * - * ISOLATE_CLEAN means that only clean pages should be isolated. This - * is used by reclaim when it is cannot write to backing storage - * * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants to pages * that it is possible to migrate without blocking */ - if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) { + if (mode & ISOLATE_ASYNC_MIGRATE) { /* All the caller can do on PageWriteback is block */ if (PageWriteback(page)) return ret; @@ -1387,10 +1385,6 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode) if (PageDirty(page)) { struct address_space *mapping; - /* ISOLATE_CLEAN means only clean pages */ - if (mode & ISOLATE_CLEAN) - return ret; - /* * Only pages without mappings or that have a * ->migratepage callback are possible to migrate @@ -1731,8 +1725,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, if (!sc->may_unmap) isolate_mode |= ISOLATE_UNMAPPED; - if (!sc->may_writepage) - isolate_mode |= ISOLATE_CLEAN; spin_lock_irq(&pgdat->lru_lock); @@ -1929,8 +1921,6 @@ static void shrink_active_list(unsigned long nr_to_scan, if (!sc->may_unmap) isolate_mode |= ISOLATE_UNMAPPED; - if (!sc->may_writepage) - isolate_mode |= ISOLATE_CLEAN; spin_lock_irq(&pgdat->lru_lock); -- cgit v1.2.3 From 726d061fbd3658e4bfeffa1b8e82da97de2ca4dd Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 24 Feb 2017 14:56:14 -0800 Subject: mm: vmscan: kick flushers when we encounter dirty pages on the LRU Memory pressure can put dirty pages at the end of the LRU without anybody running into dirty limits. Don't start writing individual pages from kswapd while the flushers might be asleep. Unlike the old direct reclaim flusher wakeup (removed in the next patch) that flushes the number of pages just scanned, this patch wakes the flushers for all outstanding dirty pages. That seemed to perform better in a synthetic test that pushes dirty pages to the end of the LRU and into reclaim, because we know LRU aging outstrips writeback already, and this way we give younger dirty pages a headstart rather than wait until reclaim runs into them as well. It also means less plugging and risk of exhausting the struct request pool from reclaim. There is a concern that this will cause temporary files that used to get dirtied and truncated before writeback to now get written to disk under memory pressure. If this turns out to be a real problem, we'll have to revisit this and tame the reclaim flusher wakeups. [hannes@cmpxchg.org: mention dirty expiration as a condition] Link: http://lkml.kernel.org/r/20170126174739.GA30636@cmpxchg.org Link: http://lkml.kernel.org/r/20170123181641.23938-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Minchan Kim Acked-by: Michal Hocko Acked-by: Mel Gorman Acked-by: Hillf Danton Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/writeback.h | 2 +- include/trace/events/writeback.h | 2 +- mm/vmscan.c | 18 +++++++++++++----- 3 files changed, 15 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 5527d910ba3d..a3c0cbd7c888 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -46,7 +46,7 @@ enum writeback_sync_modes { */ enum wb_reason { WB_REASON_BACKGROUND, - WB_REASON_TRY_TO_FREE_PAGES, + WB_REASON_VMSCAN, WB_REASON_SYNC, WB_REASON_PERIODIC, WB_REASON_LAPTOP_TIMER, diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 2ccd9ccbf9ef..7bd8783a590f 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -31,7 +31,7 @@ #define WB_WORK_REASON \ EM( WB_REASON_BACKGROUND, "background") \ - EM( WB_REASON_TRY_TO_FREE_PAGES, "try_to_free_pages") \ + EM( WB_REASON_VMSCAN, "vmscan") \ EM( WB_REASON_SYNC, "sync") \ EM( WB_REASON_PERIODIC, "periodic") \ EM( WB_REASON_LAPTOP_TIMER, "laptop_timer") \ diff --git a/mm/vmscan.c b/mm/vmscan.c index 0d05f7f3b532..83c92b866afe 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1798,12 +1798,20 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, /* * If dirty pages are scanned that are not queued for IO, it - * implies that flushers are not keeping up. In this case, flag - * the pgdat PGDAT_DIRTY and kswapd will start writing pages from - * reclaim context. + * implies that flushers are not doing their job. This can + * happen when memory pressure pushes dirty pages to the end of + * the LRU before the dirty limits are breached and the dirty + * data has expired. It can also happen when the proportion of + * dirty pages grows not through writes but through memory + * pressure reclaiming all the clean cache. And in some cases, + * the flushers simply cannot keep up with the allocation + * rate. Nudge the flusher threads in case they are asleep, but + * also allow kswapd to start writing pages during reclaim. */ - if (stat.nr_unqueued_dirty == nr_taken) + if (stat.nr_unqueued_dirty == nr_taken) { + wakeup_flusher_threads(0, WB_REASON_VMSCAN); set_bit(PGDAT_DIRTY, &pgdat->flags); + } /* * If kswapd scans pages marked marked for immediate @@ -2787,7 +2795,7 @@ retry: writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2; if (total_scanned > writeback_threshold) { wakeup_flusher_threads(laptop_mode ? 0 : total_scanned, - WB_REASON_TRY_TO_FREE_PAGES); + WB_REASON_VMSCAN); sc->may_writepage = 1; } } while (--sc->priority >= 0); -- cgit v1.2.3 From c55e8d035b28b2867e68b0e2d0eee2c0f1016b43 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 24 Feb 2017 14:56:23 -0800 Subject: mm: vmscan: move dirty pages out of the way until they're flushed We noticed a performance regression when moving hadoop workloads from 3.10 kernels to 4.0 and 4.6. This is accompanied by increased pageout activity initiated by kswapd as well as frequent bursts of allocation stalls and direct reclaim scans. Even lowering the dirty ratios to the equivalent of less than 1% of memory would not eliminate the issue, suggesting that dirty pages concentrate where the scanner is looking. This can be traced back to recent efforts of thrash avoidance. Where 3.10 would not detect refaulting pages and continuously supply clean cache to the inactive list, a thrashing workload on 4.0+ will detect and activate refaulting pages right away, distilling used-once pages on the inactive list much more effectively. This is by design, and it makes sense for clean cache. But for the most part our workload's cache faults are refaults and its use-once cache is from streaming writes. We end up with most of the inactive list dirty, and we don't go after the active cache as long as we have use-once pages around. But waiting for writes to avoid reclaiming clean cache that *might* refault is a bad trade-off. Even if the refaults happen, reads are faster than writes. Before getting bogged down on writeback, reclaim should first look at *all* cache in the system, even active cache. To accomplish this, activate pages that are dirty or under writeback when they reach the end of the inactive LRU. The pages are marked for immediate reclaim, meaning they'll get moved back to the inactive LRU tail as soon as they're written back and become reclaimable. But in the meantime, by reducing the inactive list to only immediately reclaimable pages, we allow the scanner to deactivate and refill the inactive list with clean cache from the active list tail to guarantee forward progress. [hannes@cmpxchg.org: update comment] Link: http://lkml.kernel.org/r/20170202191957.22872-8-hannes@cmpxchg.org Link: http://lkml.kernel.org/r/20170123181641.23938-6-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Minchan Kim Acked-by: Michal Hocko Acked-by: Hillf Danton Cc: Mel Gorman Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_inline.h | 7 +++++++ mm/swap.c | 9 +++++---- mm/vmscan.c | 15 ++++++++++++--- 3 files changed, 24 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 41d376e7116d..e030a68ead7e 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -50,6 +50,13 @@ static __always_inline void add_page_to_lru_list(struct page *page, list_add(&page->lru, &lruvec->lists[lru]); } +static __always_inline void add_page_to_lru_list_tail(struct page *page, + struct lruvec *lruvec, enum lru_list lru) +{ + update_lru_size(lruvec, lru, page_zonenum(page), hpage_nr_pages(page)); + list_add_tail(&page->lru, &lruvec->lists[lru]); +} + static __always_inline void del_page_from_lru_list(struct page *page, struct lruvec *lruvec, enum lru_list lru) { diff --git a/mm/swap.c b/mm/swap.c index aabf2e90fe32..c4910f14f957 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -209,9 +209,10 @@ static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec, { int *pgmoved = arg; - if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { - enum lru_list lru = page_lru_base_type(page); - list_move_tail(&page->lru, &lruvec->lists[lru]); + if (PageLRU(page) && !PageUnevictable(page)) { + del_page_from_lru_list(page, lruvec, page_lru(page)); + ClearPageActive(page); + add_page_to_lru_list_tail(page, lruvec, page_lru(page)); (*pgmoved)++; } } @@ -235,7 +236,7 @@ static void pagevec_move_tail(struct pagevec *pvec) */ void rotate_reclaimable_page(struct page *page) { - if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) && + if (!PageLocked(page) && !PageDirty(page) && !PageUnevictable(page) && PageLRU(page)) { struct pagevec *pvec; unsigned long flags; diff --git a/mm/vmscan.c b/mm/vmscan.c index 92e56cadceae..ae3d982216b5 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1056,6 +1056,15 @@ static unsigned long shrink_page_list(struct list_head *page_list, * throttling so we could easily OOM just because too many * pages are in writeback and there is nothing else to * reclaim. Wait for the writeback to complete. + * + * In cases 1) and 2) we activate the pages to get them out of + * the way while we continue scanning for clean pages on the + * inactive list and refilling from the active list. The + * observation here is that waiting for disk writes is more + * expensive than potentially causing reloads down the line. + * Since they're marked for immediate reclaim, they won't put + * memory pressure on the cache working set any longer than it + * takes to write them to disk. */ if (PageWriteback(page)) { /* Case 1 above */ @@ -1063,7 +1072,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, PageReclaim(page) && test_bit(PGDAT_WRITEBACK, &pgdat->flags)) { nr_immediate++; - goto keep_locked; + goto activate_locked; /* Case 2 above */ } else if (sane_reclaim(sc) || @@ -1081,7 +1090,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, */ SetPageReclaim(page); nr_writeback++; - goto keep_locked; + goto activate_locked; /* Case 3 above */ } else { @@ -1174,7 +1183,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, inc_node_page_state(page, NR_VMSCAN_IMMEDIATE); SetPageReclaim(page); - goto keep_locked; + goto activate_locked; } if (references == PAGEREF_RECLAIM_CLEAN) -- cgit v1.2.3 From 11bac80004499ea59f361ef2a5516c84b6eab675 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 24 Feb 2017 14:56:41 -0800 Subject: mm, fs: reduce fault, page_mkwrite, and pfn_mkwrite to take only vmf ->fault(), ->page_mkwrite(), and ->pfn_mkwrite() calls do not need to take a vma and vmf parameter when the vma already resides in vmf. Remove the vma parameter to simplify things. [arnd@arndb.de: fix ARM build] Link: http://lkml.kernel.org/r/20170125223558.1451224-1-arnd@arndb.de Link: http://lkml.kernel.org/r/148521301778.19116.10840599906674778980.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Dave Jiang Signed-off-by: Arnd Bergmann Reviewed-by: Ross Zwisler Cc: Theodore Ts'o Cc: Darrick J. Wong Cc: Matthew Wilcox Cc: Dave Hansen Cc: Christoph Hellwig Cc: Jan Kara Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/kvm/book3s_64_vio.c | 4 +-- arch/powerpc/platforms/cell/spufs/file.c | 39 ++++++++++++------------ drivers/android/binder.c | 2 +- drivers/char/agp/alpha-agp.c | 5 ++- drivers/char/mspec.c | 6 ++-- drivers/dax/dax.c | 12 ++++---- drivers/gpu/drm/armada/armada_gem.c | 9 +++--- drivers/gpu/drm/drm_vm.c | 32 ++++++++++--------- drivers/gpu/drm/etnaviv/etnaviv_drv.h | 2 +- drivers/gpu/drm/etnaviv/etnaviv_gem.c | 3 +- drivers/gpu/drm/exynos/exynos_drm_gem.c | 3 +- drivers/gpu/drm/exynos/exynos_drm_gem.h | 2 +- drivers/gpu/drm/gma500/framebuffer.c | 3 +- drivers/gpu/drm/gma500/gem.c | 3 +- drivers/gpu/drm/gma500/psb_drv.h | 2 +- drivers/gpu/drm/i915/i915_drv.h | 2 +- drivers/gpu/drm/i915/i915_gem.c | 4 +-- drivers/gpu/drm/msm/msm_drv.h | 2 +- drivers/gpu/drm/msm/msm_gem.c | 3 +- drivers/gpu/drm/omapdrm/omap_drv.h | 2 +- drivers/gpu/drm/omapdrm/omap_gem.c | 4 +-- drivers/gpu/drm/qxl/qxl_ttm.c | 6 ++-- drivers/gpu/drm/radeon/radeon_ttm.c | 6 ++-- drivers/gpu/drm/tegra/gem.c | 3 +- drivers/gpu/drm/ttm/ttm_bo_vm.c | 10 +++--- drivers/gpu/drm/udl/udl_drv.h | 2 +- drivers/gpu/drm/udl/udl_gem.c | 3 +- drivers/gpu/drm/vgem/vgem_drv.c | 3 +- drivers/gpu/drm/virtio/virtgpu_ttm.c | 7 ++--- drivers/hsi/clients/cmt_speech.c | 4 +-- drivers/hwtracing/intel_th/msu.c | 6 ++-- drivers/infiniband/hw/hfi1/file_ops.c | 4 +-- drivers/infiniband/hw/qib/qib_file_ops.c | 2 +- drivers/media/v4l2-core/videobuf-dma-sg.c | 3 +- drivers/misc/cxl/context.c | 3 +- drivers/misc/sgi-gru/grumain.c | 3 +- drivers/misc/sgi-gru/grutables.h | 2 +- drivers/scsi/cxlflash/superpipe.c | 6 ++-- drivers/scsi/sg.c | 3 +- drivers/staging/android/ion/ion.c | 6 ++-- drivers/staging/lustre/lustre/llite/llite_mmap.c | 7 +++-- drivers/staging/lustre/lustre/llite/vvp_io.c | 2 +- drivers/target/target_core_user.c | 6 ++-- drivers/uio/uio.c | 6 ++-- drivers/usb/mon/mon_bin.c | 4 +-- drivers/video/fbdev/core/fb_defio.c | 16 +++++----- drivers/xen/privcmd.c | 4 +-- fs/9p/vfs_file.c | 4 +-- fs/btrfs/ctree.h | 2 +- fs/btrfs/inode.c | 6 ++-- fs/ceph/addr.c | 8 +++-- fs/cifs/file.c | 2 +- fs/dax.c | 15 ++++----- fs/ext2/file.c | 17 +++++------ fs/ext4/ext4.h | 4 +-- fs/ext4/file.c | 17 +++++------ fs/ext4/inode.c | 9 +++--- fs/f2fs/file.c | 7 ++--- fs/fuse/file.c | 6 ++-- fs/gfs2/file.c | 8 ++--- fs/iomap.c | 5 ++- fs/kernfs/file.c | 13 ++++---- fs/ncpfs/mmap.c | 7 ++--- fs/nfs/file.c | 4 +-- fs/nilfs2/file.c | 3 +- fs/ocfs2/mmap.c | 15 ++++----- fs/proc/vmcore.c | 4 +-- fs/ubifs/file.c | 5 ++- fs/xfs/xfs_file.c | 25 +++++++-------- include/linux/dax.h | 5 ++- include/linux/iomap.h | 3 +- include/linux/mm.h | 10 +++--- ipc/shm.c | 6 ++-- kernel/events/core.c | 6 ++-- kernel/relay.c | 4 +-- mm/filemap.c | 19 ++++++------ mm/hugetlb.c | 2 +- mm/memory.c | 6 ++-- mm/mmap.c | 9 +++--- mm/nommu.c | 2 +- mm/shmem.c | 3 +- security/selinux/selinuxfs.c | 5 ++- sound/core/pcm_native.c | 15 ++++----- sound/usb/usx2y/us122l.c | 5 ++- sound/usb/usx2y/usX2Yhwdep.c | 7 ++--- sound/usb/usx2y/usx2yhwdeppcm.c | 5 ++- virt/kvm/kvm_main.c | 4 +-- 87 files changed, 283 insertions(+), 290 deletions(-) (limited to 'include') diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 491c5d8120f7..ab9d14c0e460 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -102,9 +102,9 @@ static void release_spapr_tce_table(struct rcu_head *head) kfree(stt); } -static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int kvm_spapr_tce_fault(struct vm_fault *vmf) { - struct kvmppc_spapr_tce_table *stt = vma->vm_file->private_data; + struct kvmppc_spapr_tce_table *stt = vmf->vma->vm_file->private_data; struct page *page; if (vmf->pgoff >= kvmppc_tce_pages(stt->size)) diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c index a35e2c29d7ee..e5ec1368f0cd 100644 --- a/arch/powerpc/platforms/cell/spufs/file.c +++ b/arch/powerpc/platforms/cell/spufs/file.c @@ -233,8 +233,9 @@ spufs_mem_write(struct file *file, const char __user *buffer, } static int -spufs_mem_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +spufs_mem_mmap_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct spu_context *ctx = vma->vm_file->private_data; unsigned long pfn, offset; @@ -311,12 +312,11 @@ static const struct file_operations spufs_mem_fops = { .mmap = spufs_mem_mmap, }; -static int spufs_ps_fault(struct vm_area_struct *vma, - struct vm_fault *vmf, +static int spufs_ps_fault(struct vm_fault *vmf, unsigned long ps_offs, unsigned long ps_size) { - struct spu_context *ctx = vma->vm_file->private_data; + struct spu_context *ctx = vmf->vma->vm_file->private_data; unsigned long area, offset = vmf->pgoff << PAGE_SHIFT; int ret = 0; @@ -354,7 +354,7 @@ static int spufs_ps_fault(struct vm_area_struct *vma, down_read(¤t->mm->mmap_sem); } else { area = ctx->spu->problem_phys + ps_offs; - vm_insert_pfn(vma, vmf->address, (area + offset) >> PAGE_SHIFT); + vm_insert_pfn(vmf->vma, vmf->address, (area + offset) >> PAGE_SHIFT); spu_context_trace(spufs_ps_fault__insert, ctx, ctx->spu); } @@ -367,10 +367,9 @@ refault: } #if SPUFS_MMAP_4K -static int spufs_cntl_mmap_fault(struct vm_area_struct *vma, - struct vm_fault *vmf) +static int spufs_cntl_mmap_fault(struct vm_fault *vmf) { - return spufs_ps_fault(vma, vmf, 0x4000, SPUFS_CNTL_MAP_SIZE); + return spufs_ps_fault(vmf, 0x4000, SPUFS_CNTL_MAP_SIZE); } static const struct vm_operations_struct spufs_cntl_mmap_vmops = { @@ -1067,15 +1066,15 @@ static ssize_t spufs_signal1_write(struct file *file, const char __user *buf, } static int -spufs_signal1_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +spufs_signal1_mmap_fault(struct vm_fault *vmf) { #if SPUFS_SIGNAL_MAP_SIZE == 0x1000 - return spufs_ps_fault(vma, vmf, 0x14000, SPUFS_SIGNAL_MAP_SIZE); + return spufs_ps_fault(vmf, 0x14000, SPUFS_SIGNAL_MAP_SIZE); #elif SPUFS_SIGNAL_MAP_SIZE == 0x10000 /* For 64k pages, both signal1 and signal2 can be used to mmap the whole * signal 1 and 2 area */ - return spufs_ps_fault(vma, vmf, 0x10000, SPUFS_SIGNAL_MAP_SIZE); + return spufs_ps_fault(vmf, 0x10000, SPUFS_SIGNAL_MAP_SIZE); #else #error unsupported page size #endif @@ -1205,15 +1204,15 @@ static ssize_t spufs_signal2_write(struct file *file, const char __user *buf, #if SPUFS_MMAP_4K static int -spufs_signal2_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +spufs_signal2_mmap_fault(struct vm_fault *vmf) { #if SPUFS_SIGNAL_MAP_SIZE == 0x1000 - return spufs_ps_fault(vma, vmf, 0x1c000, SPUFS_SIGNAL_MAP_SIZE); + return spufs_ps_fault(vmf, 0x1c000, SPUFS_SIGNAL_MAP_SIZE); #elif SPUFS_SIGNAL_MAP_SIZE == 0x10000 /* For 64k pages, both signal1 and signal2 can be used to mmap the whole * signal 1 and 2 area */ - return spufs_ps_fault(vma, vmf, 0x10000, SPUFS_SIGNAL_MAP_SIZE); + return spufs_ps_fault(vmf, 0x10000, SPUFS_SIGNAL_MAP_SIZE); #else #error unsupported page size #endif @@ -1334,9 +1333,9 @@ DEFINE_SPUFS_ATTRIBUTE(spufs_signal2_type, spufs_signal2_type_get, #if SPUFS_MMAP_4K static int -spufs_mss_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +spufs_mss_mmap_fault(struct vm_fault *vmf) { - return spufs_ps_fault(vma, vmf, 0x0000, SPUFS_MSS_MAP_SIZE); + return spufs_ps_fault(vmf, 0x0000, SPUFS_MSS_MAP_SIZE); } static const struct vm_operations_struct spufs_mss_mmap_vmops = { @@ -1396,9 +1395,9 @@ static const struct file_operations spufs_mss_fops = { }; static int -spufs_psmap_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +spufs_psmap_mmap_fault(struct vm_fault *vmf) { - return spufs_ps_fault(vma, vmf, 0x0000, SPUFS_PS_MAP_SIZE); + return spufs_ps_fault(vmf, 0x0000, SPUFS_PS_MAP_SIZE); } static const struct vm_operations_struct spufs_psmap_mmap_vmops = { @@ -1456,9 +1455,9 @@ static const struct file_operations spufs_psmap_fops = { #if SPUFS_MMAP_4K static int -spufs_mfc_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +spufs_mfc_mmap_fault(struct vm_fault *vmf) { - return spufs_ps_fault(vma, vmf, 0x3000, SPUFS_MFC_MAP_SIZE); + return spufs_ps_fault(vmf, 0x3000, SPUFS_MFC_MAP_SIZE); } static const struct vm_operations_struct spufs_mfc_mmap_vmops = { diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 15b263a420e8..2bbcdc6fdfee 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -3342,7 +3342,7 @@ static void binder_vma_close(struct vm_area_struct *vma) binder_defer_work(proc, BINDER_DEFERRED_PUT_FILES); } -static int binder_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int binder_vm_fault(struct vm_fault *vmf) { return VM_FAULT_SIGBUS; } diff --git a/drivers/char/agp/alpha-agp.c b/drivers/char/agp/alpha-agp.c index 737187865269..53fe633df1e8 100644 --- a/drivers/char/agp/alpha-agp.c +++ b/drivers/char/agp/alpha-agp.c @@ -11,15 +11,14 @@ #include "agp.h" -static int alpha_core_agp_vm_fault(struct vm_area_struct *vma, - struct vm_fault *vmf) +static int alpha_core_agp_vm_fault(struct vm_fault *vmf) { alpha_agp_info *agp = agp_bridge->dev_private_data; dma_addr_t dma_addr; unsigned long pa; struct page *page; - dma_addr = vmf->address - vma->vm_start + agp->aperture.bus_base; + dma_addr = vmf->address - vmf->vma->vm_start + agp->aperture.bus_base; pa = agp->ops->translate(agp, dma_addr); if (pa == (unsigned long)-EINVAL) diff --git a/drivers/char/mspec.c b/drivers/char/mspec.c index a697ca0cab1e..a9c2fa3c81e5 100644 --- a/drivers/char/mspec.c +++ b/drivers/char/mspec.c @@ -191,12 +191,12 @@ mspec_close(struct vm_area_struct *vma) * Creates a mspec page and maps it to user space. */ static int -mspec_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +mspec_fault(struct vm_fault *vmf) { unsigned long paddr, maddr; unsigned long pfn; pgoff_t index = vmf->pgoff; - struct vma_data *vdata = vma->vm_private_data; + struct vma_data *vdata = vmf->vma->vm_private_data; maddr = (volatile unsigned long) vdata->maddr[index]; if (maddr == 0) { @@ -227,7 +227,7 @@ mspec_fault(struct vm_area_struct *vma, struct vm_fault *vmf) * be because another thread has installed the pte first, so it * is no problem. */ - vm_insert_pfn(vma, vmf->address, pfn); + vm_insert_pfn(vmf->vma, vmf->address, pfn); return VM_FAULT_NOPAGE; } diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c index 18e9875f6277..0261f332bf3e 100644 --- a/drivers/dax/dax.c +++ b/drivers/dax/dax.c @@ -419,8 +419,7 @@ static phys_addr_t pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff, return -1; } -static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma, - struct vm_fault *vmf) +static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) { struct device *dev = &dax_dev->dev; struct dax_region *dax_region; @@ -428,7 +427,7 @@ static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma, phys_addr_t phys; pfn_t pfn; - if (check_vma(dax_dev, vma, __func__)) + if (check_vma(dax_dev, vmf->vma, __func__)) return VM_FAULT_SIGBUS; dax_region = dax_dev->region; @@ -446,7 +445,7 @@ static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma, pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); - rc = vm_insert_mixed(vma, vmf->address, pfn); + rc = vm_insert_mixed(vmf->vma, vmf->address, pfn); if (rc == -ENOMEM) return VM_FAULT_OOM; @@ -456,8 +455,9 @@ static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma, return VM_FAULT_NOPAGE; } -static int dax_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int dax_dev_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; int rc; struct file *filp = vma->vm_file; struct dax_dev *dax_dev = filp->private_data; @@ -466,7 +466,7 @@ static int dax_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf) current->comm, (vmf->flags & FAULT_FLAG_WRITE) ? "write" : "read", vma->vm_start, vma->vm_end); rcu_read_lock(); - rc = __dax_dev_fault(dax_dev, vma, vmf); + rc = __dax_dev_fault(dax_dev, vmf); rcu_read_unlock(); return rc; diff --git a/drivers/gpu/drm/armada/armada_gem.c b/drivers/gpu/drm/armada/armada_gem.c index 560d416deab2..1597458d884e 100644 --- a/drivers/gpu/drm/armada/armada_gem.c +++ b/drivers/gpu/drm/armada/armada_gem.c @@ -14,14 +14,15 @@ #include #include "armada_ioctlP.h" -static int armada_gem_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int armada_gem_vm_fault(struct vm_fault *vmf) { - struct armada_gem_object *obj = drm_to_armada_gem(vma->vm_private_data); + struct drm_gem_object *gobj = vmf->vma->vm_private_data; + struct armada_gem_object *obj = drm_to_armada_gem(gobj); unsigned long pfn = obj->phys_addr >> PAGE_SHIFT; int ret; - pfn += (vmf->address - vma->vm_start) >> PAGE_SHIFT; - ret = vm_insert_pfn(vma, vmf->address, pfn); + pfn += (vmf->address - vmf->vma->vm_start) >> PAGE_SHIFT; + ret = vm_insert_pfn(vmf->vma, vmf->address, pfn); switch (ret) { case 0: diff --git a/drivers/gpu/drm/drm_vm.c b/drivers/gpu/drm/drm_vm.c index bd311c77c254..bae6e26038ee 100644 --- a/drivers/gpu/drm/drm_vm.c +++ b/drivers/gpu/drm/drm_vm.c @@ -96,8 +96,9 @@ static pgprot_t drm_dma_prot(uint32_t map_type, struct vm_area_struct *vma) * map, get the page, increment the use count and return it. */ #if IS_ENABLED(CONFIG_AGP) -static int drm_do_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int drm_do_vm_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct drm_file *priv = vma->vm_file->private_data; struct drm_device *dev = priv->minor->dev; struct drm_local_map *map = NULL; @@ -168,7 +169,7 @@ vm_fault_error: return VM_FAULT_SIGBUS; /* Disallow mremap */ } #else -static int drm_do_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int drm_do_vm_fault(struct vm_fault *vmf) { return VM_FAULT_SIGBUS; } @@ -184,8 +185,9 @@ static int drm_do_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) * Get the mapping, find the real physical page to map, get the page, and * return it. */ -static int drm_do_vm_shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int drm_do_vm_shm_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct drm_local_map *map = vma->vm_private_data; unsigned long offset; unsigned long i; @@ -280,14 +282,14 @@ static void drm_vm_shm_close(struct vm_area_struct *vma) /** * \c fault method for DMA virtual memory. * - * \param vma virtual memory area. * \param address access address. * \return pointer to the page structure. * * Determine the page number from the page offset and get it from drm_device_dma::pagelist. */ -static int drm_do_vm_dma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int drm_do_vm_dma_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct drm_file *priv = vma->vm_file->private_data; struct drm_device *dev = priv->minor->dev; struct drm_device_dma *dma = dev->dma; @@ -315,14 +317,14 @@ static int drm_do_vm_dma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) /** * \c fault method for scatter-gather virtual memory. * - * \param vma virtual memory area. * \param address access address. * \return pointer to the page structure. * * Determine the map offset from the page offset and get it from drm_sg_mem::pagelist. */ -static int drm_do_vm_sg_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int drm_do_vm_sg_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct drm_local_map *map = vma->vm_private_data; struct drm_file *priv = vma->vm_file->private_data; struct drm_device *dev = priv->minor->dev; @@ -347,24 +349,24 @@ static int drm_do_vm_sg_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return 0; } -static int drm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int drm_vm_fault(struct vm_fault *vmf) { - return drm_do_vm_fault(vma, vmf); + return drm_do_vm_fault(vmf); } -static int drm_vm_shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int drm_vm_shm_fault(struct vm_fault *vmf) { - return drm_do_vm_shm_fault(vma, vmf); + return drm_do_vm_shm_fault(vmf); } -static int drm_vm_dma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int drm_vm_dma_fault(struct vm_fault *vmf) { - return drm_do_vm_dma_fault(vma, vmf); + return drm_do_vm_dma_fault(vmf); } -static int drm_vm_sg_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int drm_vm_sg_fault(struct vm_fault *vmf) { - return drm_do_vm_sg_fault(vma, vmf); + return drm_do_vm_sg_fault(vmf); } /** AGP virtual memory operations */ diff --git a/drivers/gpu/drm/etnaviv/etnaviv_drv.h b/drivers/gpu/drm/etnaviv/etnaviv_drv.h index c255eda40526..e41f38667c1c 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_drv.h +++ b/drivers/gpu/drm/etnaviv/etnaviv_drv.h @@ -73,7 +73,7 @@ int etnaviv_ioctl_gem_submit(struct drm_device *dev, void *data, struct drm_file *file); int etnaviv_gem_mmap(struct file *filp, struct vm_area_struct *vma); -int etnaviv_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf); +int etnaviv_gem_fault(struct vm_fault *vmf); int etnaviv_gem_mmap_offset(struct drm_gem_object *obj, u64 *offset); struct sg_table *etnaviv_gem_prime_get_sg_table(struct drm_gem_object *obj); void *etnaviv_gem_prime_vmap(struct drm_gem_object *obj); diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c index aa6e35ddc87f..e78f1406885d 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c @@ -175,8 +175,9 @@ int etnaviv_gem_mmap(struct file *filp, struct vm_area_struct *vma) return obj->ops->mmap(obj, vma); } -int etnaviv_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +int etnaviv_gem_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct drm_gem_object *obj = vma->vm_private_data; struct etnaviv_gem_object *etnaviv_obj = to_etnaviv_bo(obj); struct page **pages, *page; diff --git a/drivers/gpu/drm/exynos/exynos_drm_gem.c b/drivers/gpu/drm/exynos/exynos_drm_gem.c index 57b81460fec8..4c28f7ffcc4d 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_gem.c +++ b/drivers/gpu/drm/exynos/exynos_drm_gem.c @@ -447,8 +447,9 @@ int exynos_drm_gem_dumb_map_offset(struct drm_file *file_priv, return ret; } -int exynos_drm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +int exynos_drm_gem_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct drm_gem_object *obj = vma->vm_private_data; struct exynos_drm_gem *exynos_gem = to_exynos_gem(obj); unsigned long pfn; diff --git a/drivers/gpu/drm/exynos/exynos_drm_gem.h b/drivers/gpu/drm/exynos/exynos_drm_gem.h index df7c543d6558..85457255fcd1 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_gem.h +++ b/drivers/gpu/drm/exynos/exynos_drm_gem.h @@ -116,7 +116,7 @@ int exynos_drm_gem_dumb_map_offset(struct drm_file *file_priv, uint64_t *offset); /* page fault handler and mmap fault address(virtual) to physical memory. */ -int exynos_drm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf); +int exynos_drm_gem_fault(struct vm_fault *vmf); /* set vm_flags and we can change the vm attribute to other one at here. */ int exynos_drm_gem_mmap(struct file *filp, struct vm_area_struct *vma); diff --git a/drivers/gpu/drm/gma500/framebuffer.c b/drivers/gpu/drm/gma500/framebuffer.c index da42d2e1d397..ffe6b4ffa1a8 100644 --- a/drivers/gpu/drm/gma500/framebuffer.c +++ b/drivers/gpu/drm/gma500/framebuffer.c @@ -111,8 +111,9 @@ static int psbfb_pan(struct fb_var_screeninfo *var, struct fb_info *info) return 0; } -static int psbfb_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int psbfb_vm_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct psb_framebuffer *psbfb = vma->vm_private_data; struct drm_device *dev = psbfb->base.dev; struct drm_psb_private *dev_priv = dev->dev_private; diff --git a/drivers/gpu/drm/gma500/gem.c b/drivers/gpu/drm/gma500/gem.c index 527c62917660..7da061aab729 100644 --- a/drivers/gpu/drm/gma500/gem.c +++ b/drivers/gpu/drm/gma500/gem.c @@ -164,8 +164,9 @@ int psb_gem_dumb_create(struct drm_file *file, struct drm_device *dev, * vma->vm_private_data points to the GEM object that is backing this * mapping. */ -int psb_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +int psb_gem_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct drm_gem_object *obj; struct gtt_range *r; int ret; diff --git a/drivers/gpu/drm/gma500/psb_drv.h b/drivers/gpu/drm/gma500/psb_drv.h index 05d7aaf47eea..83e22fd4cfc0 100644 --- a/drivers/gpu/drm/gma500/psb_drv.h +++ b/drivers/gpu/drm/gma500/psb_drv.h @@ -752,7 +752,7 @@ extern int psb_gem_dumb_create(struct drm_file *file, struct drm_device *dev, struct drm_mode_create_dumb *args); extern int psb_gem_dumb_map_gtt(struct drm_file *file, struct drm_device *dev, uint32_t handle, uint64_t *offset); -extern int psb_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf); +extern int psb_gem_fault(struct vm_fault *vmf); /* psb_device.c */ extern const struct psb_ops psb_chip_ops; diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index bcc81912b5e5..0a4b42d31391 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -3352,7 +3352,7 @@ int __must_check i915_gem_wait_for_idle(struct drm_i915_private *dev_priv, unsigned int flags); int __must_check i915_gem_suspend(struct drm_i915_private *dev_priv); void i915_gem_resume(struct drm_i915_private *dev_priv); -int i915_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf); +int i915_gem_fault(struct vm_fault *vmf); int i915_gem_object_wait(struct drm_i915_gem_object *obj, unsigned int flags, long timeout, diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 88f3628b4e29..6908123162d1 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -1772,7 +1772,6 @@ compute_partial_view(struct drm_i915_gem_object *obj, /** * i915_gem_fault - fault a page into the GTT - * @area: CPU VMA in question * @vmf: fault info * * The fault handler is set up by drm_gem_mmap() when a object is GTT mapped @@ -1789,9 +1788,10 @@ compute_partial_view(struct drm_i915_gem_object *obj, * The current feature set supported by i915_gem_fault() and thus GTT mmaps * is exposed via I915_PARAM_MMAP_GTT_VERSION (see i915_gem_mmap_gtt_version). */ -int i915_gem_fault(struct vm_area_struct *area, struct vm_fault *vmf) +int i915_gem_fault(struct vm_fault *vmf) { #define MIN_CHUNK_PAGES ((1 << 20) >> PAGE_SHIFT) /* 1 MiB */ + struct vm_area_struct *area = vmf->vma; struct drm_i915_gem_object *obj = to_intel_bo(area->vm_private_data); struct drm_device *dev = obj->base.dev; struct drm_i915_private *dev_priv = to_i915(dev); diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h index cdd7b2f8e977..c3b14876edaa 100644 --- a/drivers/gpu/drm/msm/msm_drv.h +++ b/drivers/gpu/drm/msm/msm_drv.h @@ -206,7 +206,7 @@ void msm_gem_shrinker_cleanup(struct drm_device *dev); int msm_gem_mmap_obj(struct drm_gem_object *obj, struct vm_area_struct *vma); int msm_gem_mmap(struct file *filp, struct vm_area_struct *vma); -int msm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf); +int msm_gem_fault(struct vm_fault *vmf); uint64_t msm_gem_mmap_offset(struct drm_gem_object *obj); int msm_gem_get_iova_locked(struct drm_gem_object *obj, int id, uint64_t *iova); diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index e140b05af134..59811f29607d 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -191,8 +191,9 @@ int msm_gem_mmap(struct file *filp, struct vm_area_struct *vma) return msm_gem_mmap_obj(vma->vm_private_data, vma); } -int msm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +int msm_gem_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct drm_gem_object *obj = vma->vm_private_data; struct drm_device *dev = obj->dev; struct msm_drm_private *priv = dev->dev_private; diff --git a/drivers/gpu/drm/omapdrm/omap_drv.h b/drivers/gpu/drm/omapdrm/omap_drv.h index 36d93ce84a29..65977982f15f 100644 --- a/drivers/gpu/drm/omapdrm/omap_drv.h +++ b/drivers/gpu/drm/omapdrm/omap_drv.h @@ -188,7 +188,7 @@ int omap_gem_dumb_create(struct drm_file *file, struct drm_device *dev, int omap_gem_mmap(struct file *filp, struct vm_area_struct *vma); int omap_gem_mmap_obj(struct drm_gem_object *obj, struct vm_area_struct *vma); -int omap_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf); +int omap_gem_fault(struct vm_fault *vmf); int omap_gem_op_start(struct drm_gem_object *obj, enum omap_gem_op op); int omap_gem_op_finish(struct drm_gem_object *obj, enum omap_gem_op op); int omap_gem_op_sync(struct drm_gem_object *obj, enum omap_gem_op op); diff --git a/drivers/gpu/drm/omapdrm/omap_gem.c b/drivers/gpu/drm/omapdrm/omap_gem.c index 74a9968df421..5d5a9f517c30 100644 --- a/drivers/gpu/drm/omapdrm/omap_gem.c +++ b/drivers/gpu/drm/omapdrm/omap_gem.c @@ -518,7 +518,6 @@ static int fault_2d(struct drm_gem_object *obj, /** * omap_gem_fault - pagefault handler for GEM objects - * @vma: the VMA of the GEM object * @vmf: fault detail * * Invoked when a fault occurs on an mmap of a GEM managed area. GEM @@ -529,8 +528,9 @@ static int fault_2d(struct drm_gem_object *obj, * vma->vm_private_data points to the GEM object that is backing this * mapping. */ -int omap_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +int omap_gem_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct drm_gem_object *obj = vma->vm_private_data; struct omap_gem_object *omap_obj = to_omap_bo(obj); struct drm_device *dev = obj->dev; diff --git a/drivers/gpu/drm/qxl/qxl_ttm.c b/drivers/gpu/drm/qxl/qxl_ttm.c index 4e1a40389964..7d1cab57c89e 100644 --- a/drivers/gpu/drm/qxl/qxl_ttm.c +++ b/drivers/gpu/drm/qxl/qxl_ttm.c @@ -105,15 +105,15 @@ static void qxl_ttm_global_fini(struct qxl_device *qdev) static struct vm_operations_struct qxl_ttm_vm_ops; static const struct vm_operations_struct *ttm_vm_ops; -static int qxl_ttm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int qxl_ttm_fault(struct vm_fault *vmf) { struct ttm_buffer_object *bo; int r; - bo = (struct ttm_buffer_object *)vma->vm_private_data; + bo = (struct ttm_buffer_object *)vmf->vma->vm_private_data; if (bo == NULL) return VM_FAULT_NOPAGE; - r = ttm_vm_ops->fault(vma, vmf); + r = ttm_vm_ops->fault(vmf); return r; } diff --git a/drivers/gpu/drm/radeon/radeon_ttm.c b/drivers/gpu/drm/radeon/radeon_ttm.c index 7a10b3852970..684f1703aa5c 100644 --- a/drivers/gpu/drm/radeon/radeon_ttm.c +++ b/drivers/gpu/drm/radeon/radeon_ttm.c @@ -979,19 +979,19 @@ void radeon_ttm_set_active_vram_size(struct radeon_device *rdev, u64 size) static struct vm_operations_struct radeon_ttm_vm_ops; static const struct vm_operations_struct *ttm_vm_ops = NULL; -static int radeon_ttm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int radeon_ttm_fault(struct vm_fault *vmf) { struct ttm_buffer_object *bo; struct radeon_device *rdev; int r; - bo = (struct ttm_buffer_object *)vma->vm_private_data; + bo = (struct ttm_buffer_object *)vmf->vma->vm_private_data; if (bo == NULL) { return VM_FAULT_NOPAGE; } rdev = radeon_get_rdev(bo->bdev); down_read(&rdev->pm.mclk_lock); - r = ttm_vm_ops->fault(vma, vmf); + r = ttm_vm_ops->fault(vmf); up_read(&rdev->pm.mclk_lock); return r; } diff --git a/drivers/gpu/drm/tegra/gem.c b/drivers/gpu/drm/tegra/gem.c index b523a5d4a38c..17e62ecb5d4d 100644 --- a/drivers/gpu/drm/tegra/gem.c +++ b/drivers/gpu/drm/tegra/gem.c @@ -441,8 +441,9 @@ int tegra_bo_dumb_map_offset(struct drm_file *file, struct drm_device *drm, return 0; } -static int tegra_bo_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int tegra_bo_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct drm_gem_object *gem = vma->vm_private_data; struct tegra_bo *bo = to_tegra_bo(gem); struct page *page; diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c index 88169141bef5..35ffb3754feb 100644 --- a/drivers/gpu/drm/ttm/ttm_bo_vm.c +++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c @@ -43,7 +43,6 @@ #define TTM_BO_VM_NUM_PREFAULT 16 static int ttm_bo_vm_fault_idle(struct ttm_buffer_object *bo, - struct vm_area_struct *vma, struct vm_fault *vmf) { int ret = 0; @@ -67,7 +66,7 @@ static int ttm_bo_vm_fault_idle(struct ttm_buffer_object *bo, goto out_unlock; ttm_bo_reference(bo); - up_read(&vma->vm_mm->mmap_sem); + up_read(&vmf->vma->vm_mm->mmap_sem); (void) dma_fence_wait(bo->moving, true); ttm_bo_unreserve(bo); ttm_bo_unref(&bo); @@ -92,8 +91,9 @@ out_unlock: return ret; } -static int ttm_bo_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int ttm_bo_vm_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct ttm_buffer_object *bo = (struct ttm_buffer_object *) vma->vm_private_data; struct ttm_bo_device *bdev = bo->bdev; @@ -124,7 +124,7 @@ static int ttm_bo_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if (vmf->flags & FAULT_FLAG_ALLOW_RETRY) { if (!(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) { ttm_bo_reference(bo); - up_read(&vma->vm_mm->mmap_sem); + up_read(&vmf->vma->vm_mm->mmap_sem); (void) ttm_bo_wait_unreserved(bo); ttm_bo_unref(&bo); } @@ -168,7 +168,7 @@ static int ttm_bo_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) * Wait for buffer data in transit, due to a pipelined * move. */ - ret = ttm_bo_vm_fault_idle(bo, vma, vmf); + ret = ttm_bo_vm_fault_idle(bo, vmf); if (unlikely(ret != 0)) { retval = ret; diff --git a/drivers/gpu/drm/udl/udl_drv.h b/drivers/gpu/drm/udl/udl_drv.h index 6c4286e57362..2a75ab80527a 100644 --- a/drivers/gpu/drm/udl/udl_drv.h +++ b/drivers/gpu/drm/udl/udl_drv.h @@ -134,7 +134,7 @@ void udl_gem_put_pages(struct udl_gem_object *obj); int udl_gem_vmap(struct udl_gem_object *obj); void udl_gem_vunmap(struct udl_gem_object *obj); int udl_drm_gem_mmap(struct file *filp, struct vm_area_struct *vma); -int udl_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf); +int udl_gem_fault(struct vm_fault *vmf); int udl_handle_damage(struct udl_framebuffer *fb, int x, int y, int width, int height); diff --git a/drivers/gpu/drm/udl/udl_gem.c b/drivers/gpu/drm/udl/udl_gem.c index 3c0c4bd3f750..775c50e4f02c 100644 --- a/drivers/gpu/drm/udl/udl_gem.c +++ b/drivers/gpu/drm/udl/udl_gem.c @@ -100,8 +100,9 @@ int udl_drm_gem_mmap(struct file *filp, struct vm_area_struct *vma) return ret; } -int udl_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +int udl_gem_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct udl_gem_object *obj = to_udl_bo(vma->vm_private_data); struct page *page; unsigned int page_offset; diff --git a/drivers/gpu/drm/vgem/vgem_drv.c b/drivers/gpu/drm/vgem/vgem_drv.c index 477e07f0ecb6..7ccbb03e98de 100644 --- a/drivers/gpu/drm/vgem/vgem_drv.c +++ b/drivers/gpu/drm/vgem/vgem_drv.c @@ -50,8 +50,9 @@ static void vgem_gem_free_object(struct drm_gem_object *obj) kfree(vgem_obj); } -static int vgem_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int vgem_gem_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct drm_vgem_gem_object *obj = vma->vm_private_data; /* We don't use vmf->pgoff since that has the fake offset */ unsigned long vaddr = vmf->address; diff --git a/drivers/gpu/drm/virtio/virtgpu_ttm.c b/drivers/gpu/drm/virtio/virtgpu_ttm.c index 9cc7079f7aca..70ec8ca8d9b1 100644 --- a/drivers/gpu/drm/virtio/virtgpu_ttm.c +++ b/drivers/gpu/drm/virtio/virtgpu_ttm.c @@ -114,18 +114,17 @@ static void virtio_gpu_ttm_global_fini(struct virtio_gpu_device *vgdev) static struct vm_operations_struct virtio_gpu_ttm_vm_ops; static const struct vm_operations_struct *ttm_vm_ops; -static int virtio_gpu_ttm_fault(struct vm_area_struct *vma, - struct vm_fault *vmf) +static int virtio_gpu_ttm_fault(struct vm_fault *vmf) { struct ttm_buffer_object *bo; struct virtio_gpu_device *vgdev; int r; - bo = (struct ttm_buffer_object *)vma->vm_private_data; + bo = (struct ttm_buffer_object *)vmf->vma->vm_private_data; if (bo == NULL) return VM_FAULT_NOPAGE; vgdev = virtio_gpu_get_vgdev(bo->bdev); - r = ttm_vm_ops->fault(vma, vmf); + r = ttm_vm_ops->fault(vmf); return r; } #endif diff --git a/drivers/hsi/clients/cmt_speech.c b/drivers/hsi/clients/cmt_speech.c index 3deef6cc7d7c..7175e6bedf21 100644 --- a/drivers/hsi/clients/cmt_speech.c +++ b/drivers/hsi/clients/cmt_speech.c @@ -1098,9 +1098,9 @@ static void cs_hsi_stop(struct cs_hsi_iface *hi) kfree(hi); } -static int cs_char_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int cs_char_vma_fault(struct vm_fault *vmf) { - struct cs_char *csdata = vma->vm_private_data; + struct cs_char *csdata = vmf->vma->vm_private_data; struct page *page; page = virt_to_page(csdata->mmap_base); diff --git a/drivers/hwtracing/intel_th/msu.c b/drivers/hwtracing/intel_th/msu.c index e8d55a153a65..e88afe1a435c 100644 --- a/drivers/hwtracing/intel_th/msu.c +++ b/drivers/hwtracing/intel_th/msu.c @@ -1188,9 +1188,9 @@ static void msc_mmap_close(struct vm_area_struct *vma) mutex_unlock(&msc->buf_mutex); } -static int msc_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int msc_mmap_fault(struct vm_fault *vmf) { - struct msc_iter *iter = vma->vm_file->private_data; + struct msc_iter *iter = vmf->vma->vm_file->private_data; struct msc *msc = iter->msc; vmf->page = msc_buffer_get_page(msc, vmf->pgoff); @@ -1198,7 +1198,7 @@ static int msc_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return VM_FAULT_SIGBUS; get_page(vmf->page); - vmf->page->mapping = vma->vm_file->f_mapping; + vmf->page->mapping = vmf->vma->vm_file->f_mapping; vmf->page->index = vmf->pgoff; return 0; diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c index bd786b7bd30b..f46033984d07 100644 --- a/drivers/infiniband/hw/hfi1/file_ops.c +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -92,7 +92,7 @@ static unsigned int poll_next(struct file *, struct poll_table_struct *); static int user_event_ack(struct hfi1_ctxtdata *, int, unsigned long); static int set_ctxt_pkey(struct hfi1_ctxtdata *, unsigned, u16); static int manage_rcvq(struct hfi1_ctxtdata *, unsigned, int); -static int vma_fault(struct vm_area_struct *, struct vm_fault *); +static int vma_fault(struct vm_fault *); static long hfi1_file_ioctl(struct file *fp, unsigned int cmd, unsigned long arg); @@ -695,7 +695,7 @@ done: * Local (non-chip) user memory is not mapped right away but as it is * accessed by the user-level code. */ -static int vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int vma_fault(struct vm_fault *vmf) { struct page *page; diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c index 2d1eacf1dfed..9396c1807cc3 100644 --- a/drivers/infiniband/hw/qib/qib_file_ops.c +++ b/drivers/infiniband/hw/qib/qib_file_ops.c @@ -893,7 +893,7 @@ bail: /* * qib_file_vma_fault - handle a VMA page fault. */ -static int qib_file_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int qib_file_vma_fault(struct vm_fault *vmf) { struct page *page; diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c b/drivers/media/v4l2-core/videobuf-dma-sg.c index ba63ca57ed7e..36bd904946bd 100644 --- a/drivers/media/v4l2-core/videobuf-dma-sg.c +++ b/drivers/media/v4l2-core/videobuf-dma-sg.c @@ -434,8 +434,9 @@ static void videobuf_vm_close(struct vm_area_struct *vma) * now ...). Bounce buffers don't work very well for the data rates * video capture has. */ -static int videobuf_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int videobuf_vm_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct page *page; dprintk(3, "fault: fault @ %08lx [vma %08lx-%08lx]\n", diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c index 3907387b6d15..062bf6ca2625 100644 --- a/drivers/misc/cxl/context.c +++ b/drivers/misc/cxl/context.c @@ -121,8 +121,9 @@ void cxl_context_set_mapping(struct cxl_context *ctx, mutex_unlock(&ctx->mapping_lock); } -static int cxl_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int cxl_mmap_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct cxl_context *ctx = vma->vm_file->private_data; u64 area, offset; diff --git a/drivers/misc/sgi-gru/grumain.c b/drivers/misc/sgi-gru/grumain.c index af2e077da4b8..3641f1334cf0 100644 --- a/drivers/misc/sgi-gru/grumain.c +++ b/drivers/misc/sgi-gru/grumain.c @@ -926,8 +926,9 @@ again: * * Note: gru segments alway mmaped on GRU_GSEG_PAGESIZE boundaries. */ -int gru_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +int gru_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct gru_thread_state *gts; unsigned long paddr, vaddr; unsigned long expires; diff --git a/drivers/misc/sgi-gru/grutables.h b/drivers/misc/sgi-gru/grutables.h index 5c3ce2459675..b5e308b50ed1 100644 --- a/drivers/misc/sgi-gru/grutables.h +++ b/drivers/misc/sgi-gru/grutables.h @@ -665,7 +665,7 @@ extern unsigned long gru_reserve_cb_resources(struct gru_state *gru, int cbr_au_count, char *cbmap); extern unsigned long gru_reserve_ds_resources(struct gru_state *gru, int dsr_au_count, char *dsmap); -extern int gru_fault(struct vm_area_struct *, struct vm_fault *vmf); +extern int gru_fault(struct vm_fault *vmf); extern struct gru_mm_struct *gru_register_mmu_notifier(void); extern void gru_drop_mmu_notifier(struct gru_mm_struct *gms); diff --git a/drivers/scsi/cxlflash/superpipe.c b/drivers/scsi/cxlflash/superpipe.c index 90869cee2b20..ef5bf55f08a4 100644 --- a/drivers/scsi/cxlflash/superpipe.c +++ b/drivers/scsi/cxlflash/superpipe.c @@ -1053,7 +1053,6 @@ out: /** * cxlflash_mmap_fault() - mmap fault handler for adapter file descriptor - * @vma: VM area associated with mapping. * @vmf: VM fault associated with current fault. * * To support error notification via MMIO, faults are 'caught' by this routine @@ -1067,8 +1066,9 @@ out: * * Return: 0 on success, VM_FAULT_SIGBUS on failure */ -static int cxlflash_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int cxlflash_mmap_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct file *file = vma->vm_file; struct cxl_context *ctx = cxl_fops_get_context(file); struct cxlflash_cfg *cfg = container_of(file->f_op, struct cxlflash_cfg, @@ -1097,7 +1097,7 @@ static int cxlflash_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if (likely(!ctxi->err_recovery_active)) { vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - rc = ctxi->cxl_mmap_vmops->fault(vma, vmf); + rc = ctxi->cxl_mmap_vmops->fault(vmf); } else { dev_dbg(dev, "%s: err recovery active, use err_page\n", __func__); diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index e831e01f9fa6..29b86505f796 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1185,8 +1185,9 @@ sg_fasync(int fd, struct file *filp, int mode) } static int -sg_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +sg_vma_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; Sg_fd *sfp; unsigned long offset, len, sa; Sg_scatter_hold *rsv_schp; diff --git a/drivers/staging/android/ion/ion.c b/drivers/staging/android/ion/ion.c index 969600779e44..2c3ffbcbd621 100644 --- a/drivers/staging/android/ion/ion.c +++ b/drivers/staging/android/ion/ion.c @@ -870,9 +870,9 @@ static void ion_buffer_sync_for_device(struct ion_buffer *buffer, mutex_unlock(&buffer->lock); } -static int ion_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int ion_vm_fault(struct vm_fault *vmf) { - struct ion_buffer *buffer = vma->vm_private_data; + struct ion_buffer *buffer = vmf->vma->vm_private_data; unsigned long pfn; int ret; @@ -881,7 +881,7 @@ static int ion_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) BUG_ON(!buffer->pages || !buffer->pages[vmf->pgoff]); pfn = page_to_pfn(ion_buffer_page(buffer->pages[vmf->pgoff])); - ret = vm_insert_pfn(vma, vmf->address, pfn); + ret = vm_insert_pfn(vmf->vma, vmf->address, pfn); mutex_unlock(&buffer->lock); if (ret) return VM_FAULT_ERROR; diff --git a/drivers/staging/lustre/lustre/llite/llite_mmap.c b/drivers/staging/lustre/lustre/llite/llite_mmap.c index 9afa6bec3e6f..896196c74cd2 100644 --- a/drivers/staging/lustre/lustre/llite/llite_mmap.c +++ b/drivers/staging/lustre/lustre/llite/llite_mmap.c @@ -321,7 +321,7 @@ out: return fault_ret; } -static int ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int ll_fault(struct vm_fault *vmf) { int count = 0; bool printed = false; @@ -335,7 +335,7 @@ static int ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf) set = cfs_block_sigsinv(sigmask(SIGKILL) | sigmask(SIGTERM)); restart: - result = ll_fault0(vma, vmf); + result = ll_fault0(vmf->vma, vmf); LASSERT(!(result & VM_FAULT_LOCKED)); if (result == 0) { struct page *vmpage = vmf->page; @@ -362,8 +362,9 @@ restart: return result; } -static int ll_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +static int ll_page_mkwrite(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; int count = 0; bool printed = false; bool retry; diff --git a/drivers/staging/lustre/lustre/llite/vvp_io.c b/drivers/staging/lustre/lustre/llite/vvp_io.c index 3e9cf710501b..4c57755e06e7 100644 --- a/drivers/staging/lustre/lustre/llite/vvp_io.c +++ b/drivers/staging/lustre/lustre/llite/vvp_io.c @@ -1014,7 +1014,7 @@ static int vvp_io_kernel_fault(struct vvp_fault_io *cfio) { struct vm_fault *vmf = cfio->ft_vmf; - cfio->ft_flags = filemap_fault(cfio->ft_vma, vmf); + cfio->ft_flags = filemap_fault(vmf); cfio->ft_flags_valid = 1; if (vmf->page) { diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index 8041710b6972..5c1cb2df3a54 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -783,15 +783,15 @@ static int tcmu_find_mem_index(struct vm_area_struct *vma) return -1; } -static int tcmu_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int tcmu_vma_fault(struct vm_fault *vmf) { - struct tcmu_dev *udev = vma->vm_private_data; + struct tcmu_dev *udev = vmf->vma->vm_private_data; struct uio_info *info = &udev->uio_info; struct page *page; unsigned long offset; void *addr; - int mi = tcmu_find_mem_index(vma); + int mi = tcmu_find_mem_index(vmf->vma); if (mi < 0) return VM_FAULT_SIGBUS; diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c index fba021f5736a..31d95dc9c202 100644 --- a/drivers/uio/uio.c +++ b/drivers/uio/uio.c @@ -597,14 +597,14 @@ static int uio_find_mem_index(struct vm_area_struct *vma) return -1; } -static int uio_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int uio_vma_fault(struct vm_fault *vmf) { - struct uio_device *idev = vma->vm_private_data; + struct uio_device *idev = vmf->vma->vm_private_data; struct page *page; unsigned long offset; void *addr; - int mi = uio_find_mem_index(vma); + int mi = uio_find_mem_index(vmf->vma); if (mi < 0) return VM_FAULT_SIGBUS; diff --git a/drivers/usb/mon/mon_bin.c b/drivers/usb/mon/mon_bin.c index 91c22276c03b..9fb8b1e6ecc2 100644 --- a/drivers/usb/mon/mon_bin.c +++ b/drivers/usb/mon/mon_bin.c @@ -1223,9 +1223,9 @@ static void mon_bin_vma_close(struct vm_area_struct *vma) /* * Map ring pages to user space. */ -static int mon_bin_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int mon_bin_vma_fault(struct vm_fault *vmf) { - struct mon_reader_bin *rp = vma->vm_private_data; + struct mon_reader_bin *rp = vmf->vma->vm_private_data; unsigned long offset, chunk_idx; struct page *pageptr; diff --git a/drivers/video/fbdev/core/fb_defio.c b/drivers/video/fbdev/core/fb_defio.c index 74b5bcac8bf2..37f69c061210 100644 --- a/drivers/video/fbdev/core/fb_defio.c +++ b/drivers/video/fbdev/core/fb_defio.c @@ -37,12 +37,11 @@ static struct page *fb_deferred_io_page(struct fb_info *info, unsigned long offs } /* this is to find and return the vmalloc-ed fb pages */ -static int fb_deferred_io_fault(struct vm_area_struct *vma, - struct vm_fault *vmf) +static int fb_deferred_io_fault(struct vm_fault *vmf) { unsigned long offset; struct page *page; - struct fb_info *info = vma->vm_private_data; + struct fb_info *info = vmf->vma->vm_private_data; offset = vmf->pgoff << PAGE_SHIFT; if (offset >= info->fix.smem_len) @@ -54,8 +53,8 @@ static int fb_deferred_io_fault(struct vm_area_struct *vma, get_page(page); - if (vma->vm_file) - page->mapping = vma->vm_file->f_mapping; + if (vmf->vma->vm_file) + page->mapping = vmf->vma->vm_file->f_mapping; else printk(KERN_ERR "no mapping available\n"); @@ -91,11 +90,10 @@ int fb_deferred_io_fsync(struct file *file, loff_t start, loff_t end, int datasy EXPORT_SYMBOL_GPL(fb_deferred_io_fsync); /* vm_ops->page_mkwrite handler */ -static int fb_deferred_io_mkwrite(struct vm_area_struct *vma, - struct vm_fault *vmf) +static int fb_deferred_io_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; - struct fb_info *info = vma->vm_private_data; + struct fb_info *info = vmf->vma->vm_private_data; struct fb_deferred_io *fbdefio = info->fbdefio; struct page *cur; @@ -105,7 +103,7 @@ static int fb_deferred_io_mkwrite(struct vm_area_struct *vma, deferred framebuffer IO. then if userspace touches a page again, we repeat the same scheme */ - file_update_time(vma->vm_file); + file_update_time(vmf->vma->vm_file); /* protect against the workqueue changing the page list */ mutex_lock(&fbdefio->lock); diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 2077a3ac7c0c..7a92a5e1d40c 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -804,10 +804,10 @@ static void privcmd_close(struct vm_area_struct *vma) kfree(pages); } -static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int privcmd_fault(struct vm_fault *vmf) { printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n", - vma, vma->vm_start, vma->vm_end, + vmf->vma, vmf->vma->vm_start, vmf->vma->vm_end, vmf->pgoff, (void *)vmf->address); return VM_FAULT_SIGBUS; diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 6a0f3fa85ef7..3de3b4a89d89 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -534,11 +534,11 @@ v9fs_mmap_file_mmap(struct file *filp, struct vm_area_struct *vma) } static int -v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +v9fs_vm_page_mkwrite(struct vm_fault *vmf) { struct v9fs_inode *v9inode; struct page *page = vmf->page; - struct file *filp = vma->vm_file; + struct file *filp = vmf->vma->vm_file; struct inode *inode = file_inode(filp); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 6a823719b6c5..adf16307842a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3147,7 +3147,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, int btrfs_merge_bio_hook(struct page *page, unsigned long offset, size_t size, struct bio *bio, unsigned long bio_flags); -int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); +int btrfs_page_mkwrite(struct vm_fault *vmf); int btrfs_readpage(struct file *file, struct page *page); void btrfs_evict_inode(struct inode *inode); int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1e861a063721..ea1d500cfba6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8964,10 +8964,10 @@ again: * beyond EOF, then the page is guaranteed safe against truncation until we * unlock the page. */ -int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +int btrfs_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_ordered_extent *ordered; @@ -9000,7 +9000,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ret = btrfs_delalloc_reserve_space(inode, page_start, reserved_space); if (!ret) { - ret = file_update_time(vma->vm_file); + ret = file_update_time(vmf->vma->vm_file); reserved = 1; } if (ret) { diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index e4b066cd912a..09860c0ec7c1 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1386,8 +1386,9 @@ static void ceph_restore_sigs(sigset_t *oldset) /* * vm ops */ -static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int ceph_filemap_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct inode *inode = file_inode(vma->vm_file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_file_info *fi = vma->vm_file->private_data; @@ -1416,7 +1417,7 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) || ci->i_inline_version == CEPH_INLINE_NONE) { current->journal_info = vma->vm_file; - ret = filemap_fault(vma, vmf); + ret = filemap_fault(vmf); current->journal_info = NULL; } else ret = -EAGAIN; @@ -1477,8 +1478,9 @@ out_restore: /* * Reuse write_begin here for simplicity. */ -static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +static int ceph_page_mkwrite(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct inode *inode = file_inode(vma->vm_file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_file_info *fi = vma->vm_file->private_data; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 98dc842e7245..aa3debbba826 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -3282,7 +3282,7 @@ cifs_read(struct file *file, char *read_data, size_t read_size, loff_t *offset) * sure that it doesn't change while being written back. */ static int -cifs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +cifs_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; diff --git a/fs/dax.c b/fs/dax.c index 3f1181563fb1..f955c0df33bb 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -925,12 +925,11 @@ static int dax_insert_mapping(struct address_space *mapping, /** * dax_pfn_mkwrite - handle first write to DAX page - * @vma: The virtual memory area where the fault occurred * @vmf: The description of the fault */ -int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +int dax_pfn_mkwrite(struct vm_fault *vmf) { - struct file *file = vma->vm_file; + struct file *file = vmf->vma->vm_file; struct address_space *mapping = file->f_mapping; void *entry, **slot; pgoff_t index = vmf->pgoff; @@ -1121,7 +1120,6 @@ static int dax_fault_return(int error) /** * dax_iomap_fault - handle a page fault on a DAX file - * @vma: The virtual memory area where the fault occurred * @vmf: The description of the fault * @ops: iomap ops passed from the file system * @@ -1129,10 +1127,9 @@ static int dax_fault_return(int error) * or mkwrite handler for DAX files. Assumes the caller has done all the * necessary locking for the page fault to proceed successfully. */ -int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, - const struct iomap_ops *ops) +int dax_iomap_fault(struct vm_fault *vmf, const struct iomap_ops *ops) { - struct address_space *mapping = vma->vm_file->f_mapping; + struct address_space *mapping = vmf->vma->vm_file->f_mapping; struct inode *inode = mapping->host; unsigned long vaddr = vmf->address; loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; @@ -1205,11 +1202,11 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, case IOMAP_MAPPED: if (iomap.flags & IOMAP_F_NEW) { count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); + mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT); major = VM_FAULT_MAJOR; } error = dax_insert_mapping(mapping, iomap.bdev, sector, - PAGE_SIZE, &entry, vma, vmf); + PAGE_SIZE, &entry, vmf->vma, vmf); /* -EBUSY is fine, somebody else faulted on the same PTE */ if (error == -EBUSY) error = 0; diff --git a/fs/ext2/file.c b/fs/ext2/file.c index b0f241528a30..0bf0d971205a 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -87,19 +87,19 @@ out_unlock: * The default page_lock and i_size verification done by non-DAX fault paths * is sufficient because ext2 doesn't support hole punching. */ -static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int ext2_dax_fault(struct vm_fault *vmf) { - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); struct ext2_inode_info *ei = EXT2_I(inode); int ret; if (vmf->flags & FAULT_FLAG_WRITE) { sb_start_pagefault(inode->i_sb); - file_update_time(vma->vm_file); + file_update_time(vmf->vma->vm_file); } down_read(&ei->dax_sem); - ret = dax_iomap_fault(vma, vmf, &ext2_iomap_ops); + ret = dax_iomap_fault(vmf, &ext2_iomap_ops); up_read(&ei->dax_sem); if (vmf->flags & FAULT_FLAG_WRITE) @@ -107,16 +107,15 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return ret; } -static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma, - struct vm_fault *vmf) +static int ext2_dax_pfn_mkwrite(struct vm_fault *vmf) { - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); struct ext2_inode_info *ei = EXT2_I(inode); loff_t size; int ret; sb_start_pagefault(inode->i_sb); - file_update_time(vma->vm_file); + file_update_time(vmf->vma->vm_file); down_read(&ei->dax_sem); /* check that the faulting page hasn't raced with truncate */ @@ -124,7 +123,7 @@ static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma, if (vmf->pgoff >= size) ret = VM_FAULT_SIGBUS; else - ret = dax_pfn_mkwrite(vma, vmf); + ret = dax_pfn_mkwrite(vmf); up_read(&ei->dax_sem); sb_end_pagefault(inode->i_sb); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index cee23b684f47..2fd17e8e4984 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2483,8 +2483,8 @@ extern int ext4_writepage_trans_blocks(struct inode *); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, loff_t lstart, loff_t lend); -extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); -extern int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf); +extern int ext4_page_mkwrite(struct vm_fault *vmf); +extern int ext4_filemap_fault(struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern int ext4_get_projid(struct inode *inode, kprojid_t *projid); extern void ext4_da_update_reserve_space(struct inode *inode, diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 13021a054fc0..21e1f17fe36d 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -253,19 +253,19 @@ out: } #ifdef CONFIG_FS_DAX -static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int ext4_dax_fault(struct vm_fault *vmf) { int result; - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); struct super_block *sb = inode->i_sb; bool write = vmf->flags & FAULT_FLAG_WRITE; if (write) { sb_start_pagefault(sb); - file_update_time(vma->vm_file); + file_update_time(vmf->vma->vm_file); } down_read(&EXT4_I(inode)->i_mmap_sem); - result = dax_iomap_fault(vma, vmf, &ext4_iomap_ops); + result = dax_iomap_fault(vmf, &ext4_iomap_ops); up_read(&EXT4_I(inode)->i_mmap_sem); if (write) sb_end_pagefault(sb); @@ -303,22 +303,21 @@ ext4_dax_pmd_fault(struct vm_fault *vmf) * wp_pfn_shared() fails. Thus fault gets retried and things work out as * desired. */ -static int ext4_dax_pfn_mkwrite(struct vm_area_struct *vma, - struct vm_fault *vmf) +static int ext4_dax_pfn_mkwrite(struct vm_fault *vmf) { - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); struct super_block *sb = inode->i_sb; loff_t size; int ret; sb_start_pagefault(sb); - file_update_time(vma->vm_file); + file_update_time(vmf->vma->vm_file); down_read(&EXT4_I(inode)->i_mmap_sem); size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (vmf->pgoff >= size) ret = VM_FAULT_SIGBUS; else - ret = dax_pfn_mkwrite(vma, vmf); + ret = dax_pfn_mkwrite(vmf); up_read(&EXT4_I(inode)->i_mmap_sem); sb_end_pagefault(sb); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 75212a6e69f8..41d8e53e5a7f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5821,8 +5821,9 @@ static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh) return !buffer_mapped(bh); } -int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +int ext4_page_mkwrite(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct page *page = vmf->page; loff_t size; unsigned long len; @@ -5912,13 +5913,13 @@ out: return ret; } -int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +int ext4_filemap_fault(struct vm_fault *vmf) { - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); int err; down_read(&EXT4_I(inode)->i_mmap_sem); - err = filemap_fault(vma, vmf); + err = filemap_fault(vmf); up_read(&EXT4_I(inode)->i_mmap_sem); return err; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 49f10dce817d..1edc86e874e3 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -32,11 +32,10 @@ #include "trace.h" #include -static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, - struct vm_fault *vmf) +static int f2fs_vm_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dnode_of_data dn; int err; @@ -58,7 +57,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, f2fs_balance_fs(sbi, dn.node_changed); - file_update_time(vma->vm_file); + file_update_time(vmf->vma->vm_file); lock_page(page); if (unlikely(page->mapping != inode->i_mapping || page_offset(page) > i_size_read(inode) || diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 2401c5dabb2a..e80bfd06daf5 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2043,12 +2043,12 @@ static void fuse_vma_close(struct vm_area_struct *vma) * - sync(2) * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER */ -static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +static int fuse_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); - file_update_time(vma->vm_file); + file_update_time(vmf->vma->vm_file); lock_page(page); if (page->mapping != inode->i_mapping) { unlock_page(page); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 016c11eaca7c..6fe2a59c6a9a 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -379,10 +379,10 @@ static int gfs2_allocate_page_backing(struct page *page) * blocks allocated on disk to back that page. */ -static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +static int gfs2_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_alloc_parms ap = { .aflags = 0, }; @@ -399,7 +399,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) if (ret) goto out; - gfs2_size_hint(vma->vm_file, pos, PAGE_SIZE); + gfs2_size_hint(vmf->vma->vm_file, pos, PAGE_SIZE); gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); ret = gfs2_glock_nq(&gh); @@ -407,7 +407,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) goto out_uninit; /* Update file times before taking page lock */ - file_update_time(vma->vm_file); + file_update_time(vmf->vma->vm_file); set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); set_bit(GIF_SW_PAGED, &ip->i_flags); diff --git a/fs/iomap.c b/fs/iomap.c index d89f70bbb952..d209f42cdcb8 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -445,11 +445,10 @@ iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length, return length; } -int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, - const struct iomap_ops *ops) +int iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) { struct page *page = vmf->page; - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); unsigned long length; loff_t offset, size; ssize_t ret; diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 78219d5644e9..4f0535890b30 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -348,9 +348,9 @@ static void kernfs_vma_open(struct vm_area_struct *vma) kernfs_put_active(of->kn); } -static int kernfs_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int kernfs_vma_fault(struct vm_fault *vmf) { - struct file *file = vma->vm_file; + struct file *file = vmf->vma->vm_file; struct kernfs_open_file *of = kernfs_of(file); int ret; @@ -362,16 +362,15 @@ static int kernfs_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) ret = VM_FAULT_SIGBUS; if (of->vm_ops->fault) - ret = of->vm_ops->fault(vma, vmf); + ret = of->vm_ops->fault(vmf); kernfs_put_active(of->kn); return ret; } -static int kernfs_vma_page_mkwrite(struct vm_area_struct *vma, - struct vm_fault *vmf) +static int kernfs_vma_page_mkwrite(struct vm_fault *vmf) { - struct file *file = vma->vm_file; + struct file *file = vmf->vma->vm_file; struct kernfs_open_file *of = kernfs_of(file); int ret; @@ -383,7 +382,7 @@ static int kernfs_vma_page_mkwrite(struct vm_area_struct *vma, ret = 0; if (of->vm_ops->page_mkwrite) - ret = of->vm_ops->page_mkwrite(vma, vmf); + ret = of->vm_ops->page_mkwrite(vmf); else file_update_time(file); diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c index 39f57bef8531..0c3905e0542e 100644 --- a/fs/ncpfs/mmap.c +++ b/fs/ncpfs/mmap.c @@ -27,10 +27,9 @@ * XXX: how are we excluding truncate/invalidate here? Maybe need to lock * page? */ -static int ncp_file_mmap_fault(struct vm_area_struct *area, - struct vm_fault *vmf) +static int ncp_file_mmap_fault(struct vm_fault *vmf) { - struct inode *inode = file_inode(area->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); char *pg_addr; unsigned int already_read; unsigned int count; @@ -90,7 +89,7 @@ static int ncp_file_mmap_fault(struct vm_area_struct *area, * -- nyc */ count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(area->vm_mm, PGMAJFAULT); + mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT); return VM_FAULT_MAJOR; } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 26dbe8b0c10d..668213984d68 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -528,10 +528,10 @@ const struct address_space_operations nfs_file_aops = { * writable, implying that someone is about to modify the page through a * shared-writable mapping */ -static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +static int nfs_vm_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; - struct file *filp = vma->vm_file; + struct file *filp = vmf->vma->vm_file; struct inode *inode = file_inode(filp); unsigned pagelen; int ret = VM_FAULT_NOPAGE; diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index 547381f3ce13..c5fa3dee72fc 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -51,8 +51,9 @@ int nilfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) return err; } -static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +static int nilfs_page_mkwrite(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct page *page = vmf->page; struct inode *inode = file_inode(vma->vm_file); struct nilfs_transaction_info ti; diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 429088786e93..098f5c712569 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -44,17 +44,18 @@ #include "ocfs2_trace.h" -static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf) +static int ocfs2_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; sigset_t oldset; int ret; ocfs2_block_signals(&oldset); - ret = filemap_fault(area, vmf); + ret = filemap_fault(vmf); ocfs2_unblock_signals(&oldset); - trace_ocfs2_fault(OCFS2_I(area->vm_file->f_mapping->host)->ip_blkno, - area, vmf->page, vmf->pgoff); + trace_ocfs2_fault(OCFS2_I(vma->vm_file->f_mapping->host)->ip_blkno, + vma, vmf->page, vmf->pgoff); return ret; } @@ -127,10 +128,10 @@ out: return ret; } -static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +static int ocfs2_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); struct buffer_head *di_bh = NULL; sigset_t oldset; int ret; @@ -160,7 +161,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) */ down_write(&OCFS2_I(inode)->ip_alloc_sem); - ret = __ocfs2_page_mkwrite(vma->vm_file, di_bh, page); + ret = __ocfs2_page_mkwrite(vmf->vma->vm_file, di_bh, page); up_write(&OCFS2_I(inode)->ip_alloc_sem); diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 5105b1599981..f52d8e857ff7 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -265,10 +265,10 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer, * On s390 the fault handler is used for memory regions that can't be mapped * directly with remap_pfn_range(). */ -static int mmap_vmcore_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int mmap_vmcore_fault(struct vm_fault *vmf) { #ifdef CONFIG_S390 - struct address_space *mapping = vma->vm_file->f_mapping; + struct address_space *mapping = vmf->vma->vm_file->f_mapping; pgoff_t index = vmf->pgoff; struct page *page; loff_t offset; diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index b0d783774c96..d9ae86f96df7 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1506,11 +1506,10 @@ static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags) * mmap()d file has taken write protection fault and is being made writable. * UBIFS must ensure page is budgeted for. */ -static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, - struct vm_fault *vmf) +static int ubifs_vm_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); struct ubifs_info *c = inode->i_sb->s_fs_info; struct timespec now = ubifs_current_time(inode); struct ubifs_budget_req req = { .new_page = 1 }; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 022014016d80..9cc10136ba0b 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1379,22 +1379,21 @@ xfs_file_llseek( */ STATIC int xfs_filemap_page_mkwrite( - struct vm_area_struct *vma, struct vm_fault *vmf) { - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); int ret; trace_xfs_filemap_page_mkwrite(XFS_I(inode)); sb_start_pagefault(inode->i_sb); - file_update_time(vma->vm_file); + file_update_time(vmf->vma->vm_file); xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) { - ret = dax_iomap_fault(vma, vmf, &xfs_iomap_ops); + ret = dax_iomap_fault(vmf, &xfs_iomap_ops); } else { - ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops); + ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops); ret = block_page_mkwrite_return(ret); } @@ -1406,23 +1405,22 @@ xfs_filemap_page_mkwrite( STATIC int xfs_filemap_fault( - struct vm_area_struct *vma, struct vm_fault *vmf) { - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); int ret; trace_xfs_filemap_fault(XFS_I(inode)); /* DAX can shortcut the normal fault path on write faults! */ if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(inode)) - return xfs_filemap_page_mkwrite(vma, vmf); + return xfs_filemap_page_mkwrite(vmf); xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) - ret = dax_iomap_fault(vma, vmf, &xfs_iomap_ops); + ret = dax_iomap_fault(vmf, &xfs_iomap_ops); else - ret = filemap_fault(vma, vmf); + ret = filemap_fault(vmf); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); return ret; @@ -1471,11 +1469,10 @@ xfs_filemap_pmd_fault( */ static int xfs_filemap_pfn_mkwrite( - struct vm_area_struct *vma, struct vm_fault *vmf) { - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); struct xfs_inode *ip = XFS_I(inode); int ret = VM_FAULT_NOPAGE; loff_t size; @@ -1483,7 +1480,7 @@ xfs_filemap_pfn_mkwrite( trace_xfs_filemap_pfn_mkwrite(ip); sb_start_pagefault(inode->i_sb); - file_update_time(vma->vm_file); + file_update_time(vmf->vma->vm_file); /* check if the faulting page hasn't raced with truncate */ xfs_ilock(ip, XFS_MMAPLOCK_SHARED); @@ -1491,7 +1488,7 @@ xfs_filemap_pfn_mkwrite( if (vmf->pgoff >= size) ret = VM_FAULT_SIGBUS; else if (IS_DAX(inode)) - ret = dax_pfn_mkwrite(vma, vmf); + ret = dax_pfn_mkwrite(vmf); xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); sb_end_pagefault(inode->i_sb); return ret; diff --git a/include/linux/dax.h b/include/linux/dax.h index 1e77ff5818f1..eeb02421c848 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -38,8 +38,7 @@ static inline void *dax_radix_locked_entry(sector_t sector, unsigned long flags) ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops); -int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, - const struct iomap_ops *ops); +int dax_iomap_fault(struct vm_fault *vmf, const struct iomap_ops *ops); int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_invalidate_mapping_entry_sync(struct address_space *mapping, @@ -83,7 +82,7 @@ static inline int dax_iomap_pmd_fault(struct vm_fault *vmf, return VM_FAULT_FALLBACK; } #endif -int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *); +int dax_pfn_mkwrite(struct vm_fault *vmf); static inline bool vma_is_dax(struct vm_area_struct *vma) { diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 891459caa278..7291810067eb 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -79,8 +79,7 @@ int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops); int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, const struct iomap_ops *ops); -int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, - const struct iomap_ops *ops); +int iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops); int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, loff_t start, loff_t len, const struct iomap_ops *ops); diff --git a/include/linux/mm.h b/include/linux/mm.h index 574bc157a27c..3dd80ba6568a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -350,17 +350,17 @@ struct vm_operations_struct { void (*open)(struct vm_area_struct * area); void (*close)(struct vm_area_struct * area); int (*mremap)(struct vm_area_struct * area); - int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf); + int (*fault)(struct vm_fault *vmf); int (*pmd_fault)(struct vm_fault *vmf); void (*map_pages)(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); /* notification that a previously read-only page is about to become * writable, if an error is returned it will cause a SIGBUS */ - int (*page_mkwrite)(struct vm_area_struct *vma, struct vm_fault *vmf); + int (*page_mkwrite)(struct vm_fault *vmf); /* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */ - int (*pfn_mkwrite)(struct vm_area_struct *vma, struct vm_fault *vmf); + int (*pfn_mkwrite)(struct vm_fault *vmf); /* called by access_process_vm when get_user_pages() fails, typically * for use by special VMAs that can switch between memory and hardware @@ -2124,10 +2124,10 @@ extern void truncate_inode_pages_range(struct address_space *, extern void truncate_inode_pages_final(struct address_space *); /* generic vm_area_ops exported for stackable file systems */ -extern int filemap_fault(struct vm_area_struct *, struct vm_fault *); +extern int filemap_fault(struct vm_fault *vmf); extern void filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); -extern int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); +extern int filemap_page_mkwrite(struct vm_fault *vmf); /* mm/page-writeback.c */ int write_one_page(struct page *page, int wait); diff --git a/ipc/shm.c b/ipc/shm.c index 81203e8ba013..7f6537b84ef5 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -374,12 +374,12 @@ void exit_shm(struct task_struct *task) up_write(&shm_ids(ns).rwsem); } -static int shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int shm_fault(struct vm_fault *vmf) { - struct file *file = vma->vm_file; + struct file *file = vmf->vma->vm_file; struct shm_file_data *sfd = shm_file_data(file); - return sfd->vm_ops->fault(vma, vmf); + return sfd->vm_ops->fault(vmf); } #ifdef CONFIG_NUMA diff --git a/kernel/events/core.c b/kernel/events/core.c index 77a932b54a64..b2eb3542e829 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4925,9 +4925,9 @@ unlock: rcu_read_unlock(); } -static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int perf_mmap_fault(struct vm_fault *vmf) { - struct perf_event *event = vma->vm_file->private_data; + struct perf_event *event = vmf->vma->vm_file->private_data; struct ring_buffer *rb; int ret = VM_FAULT_SIGBUS; @@ -4950,7 +4950,7 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) goto unlock; get_page(vmf->page); - vmf->page->mapping = vma->vm_file->f_mapping; + vmf->page->mapping = vmf->vma->vm_file->f_mapping; vmf->page->index = vmf->pgoff; ret = 0; diff --git a/kernel/relay.c b/kernel/relay.c index 8f18d314a96a..8f8dc91db680 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -39,10 +39,10 @@ static void relay_file_mmap_close(struct vm_area_struct *vma) /* * fault() vm_op implementation for relay file mapping. */ -static int relay_buf_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int relay_buf_fault(struct vm_fault *vmf) { struct page *page; - struct rchan_buf *buf = vma->vm_private_data; + struct rchan_buf *buf = vmf->vma->vm_private_data; pgoff_t pgoff = vmf->pgoff; if (!buf) diff --git a/mm/filemap.c b/mm/filemap.c index 416d563468a3..2ba46f410c7c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2169,7 +2169,6 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma, /** * filemap_fault - read in file data for page fault handling - * @vma: vma in which the fault was taken * @vmf: struct vm_fault containing details of the fault * * filemap_fault() is invoked via the vma operations vector for a @@ -2191,10 +2190,10 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma, * * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set. */ -int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +int filemap_fault(struct vm_fault *vmf) { int error; - struct file *file = vma->vm_file; + struct file *file = vmf->vma->vm_file; struct address_space *mapping = file->f_mapping; struct file_ra_state *ra = &file->f_ra; struct inode *inode = mapping->host; @@ -2216,12 +2215,12 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) * We found the page, so try async readahead before * waiting for the lock. */ - do_async_mmap_readahead(vma, ra, file, page, offset); + do_async_mmap_readahead(vmf->vma, ra, file, page, offset); } else if (!page) { /* No page in the page cache at all */ - do_sync_mmap_readahead(vma, ra, file, offset); + do_sync_mmap_readahead(vmf->vma, ra, file, offset); count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); + mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT); ret = VM_FAULT_MAJOR; retry_find: page = find_get_page(mapping, offset); @@ -2229,7 +2228,7 @@ retry_find: goto no_cached_page; } - if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) { + if (!lock_page_or_retry(page, vmf->vma->vm_mm, vmf->flags)) { put_page(page); return ret | VM_FAULT_RETRY; } @@ -2396,14 +2395,14 @@ next: } EXPORT_SYMBOL(filemap_map_pages); -int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +int filemap_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); int ret = VM_FAULT_LOCKED; sb_start_pagefault(inode->i_sb); - file_update_time(vma->vm_file); + file_update_time(vmf->vma->vm_file); lock_page(page); if (page->mapping != inode->i_mapping) { unlock_page(page); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 30e7709a5121..167fd0722c15 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3142,7 +3142,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) * hugegpage VMA. do_page_fault() is supposed to trap this, so BUG is we get * this far. */ -static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int hugetlb_vm_op_fault(struct vm_fault *vmf) { BUG(); return 0; diff --git a/mm/memory.c b/mm/memory.c index 7663068a33c6..cf97d88158cd 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2035,7 +2035,7 @@ static int do_page_mkwrite(struct vm_fault *vmf) vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; - ret = vmf->vma->vm_ops->page_mkwrite(vmf->vma, vmf); + ret = vmf->vma->vm_ops->page_mkwrite(vmf); /* Restore original flags so that caller is not surprised */ vmf->flags = old_flags; if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) @@ -2307,7 +2307,7 @@ static int wp_pfn_shared(struct vm_fault *vmf) pte_unmap_unlock(vmf->pte, vmf->ptl); vmf->flags |= FAULT_FLAG_MKWRITE; - ret = vma->vm_ops->pfn_mkwrite(vma, vmf); + ret = vma->vm_ops->pfn_mkwrite(vmf); if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)) return ret; return finish_mkwrite_fault(vmf); @@ -2861,7 +2861,7 @@ static int __do_fault(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; int ret; - ret = vma->vm_ops->fault(vma, vmf); + ret = vma->vm_ops->fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY | VM_FAULT_DONE_COW))) return ret; diff --git a/mm/mmap.c b/mm/mmap.c index b729084eea90..1cd70010edf0 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3125,8 +3125,7 @@ void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages) mm->data_vm += npages; } -static int special_mapping_fault(struct vm_area_struct *vma, - struct vm_fault *vmf); +static int special_mapping_fault(struct vm_fault *vmf); /* * Having a close hook prevents vma merging regardless of flags. @@ -3161,9 +3160,9 @@ static const struct vm_operations_struct legacy_special_mapping_vmops = { .fault = special_mapping_fault, }; -static int special_mapping_fault(struct vm_area_struct *vma, - struct vm_fault *vmf) +static int special_mapping_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; pgoff_t pgoff; struct page **pages; @@ -3173,7 +3172,7 @@ static int special_mapping_fault(struct vm_area_struct *vma, struct vm_special_mapping *sm = vma->vm_private_data; if (sm->fault) - return sm->fault(sm, vma, vmf); + return sm->fault(sm, vmf->vma, vmf); pages = sm->pages; } diff --git a/mm/nommu.c b/mm/nommu.c index bc964c26be8c..62600ba0d1d8 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1794,7 +1794,7 @@ void unmap_mapping_range(struct address_space *mapping, } EXPORT_SYMBOL(unmap_mapping_range); -int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +int filemap_fault(struct vm_fault *vmf) { BUG(); return 0; diff --git a/mm/shmem.c b/mm/shmem.c index 9c6d22ff44e2..f7f2330c6cb6 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1908,8 +1908,9 @@ static int synchronous_wake_function(wait_queue_t *wait, unsigned mode, int sync return ret; } -static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int shmem_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct inode *inode = file_inode(vma->vm_file); gfp_t gfp = mapping_gfp_mask(inode->i_mapping); enum sgp_type sgp; diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index c354807381c1..c9e8a9898ce4 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -424,10 +424,9 @@ out: return ret; } -static int sel_mmap_policy_fault(struct vm_area_struct *vma, - struct vm_fault *vmf) +static int sel_mmap_policy_fault(struct vm_fault *vmf) { - struct policy_load_memory *plm = vma->vm_file->private_data; + struct policy_load_memory *plm = vmf->vma->vm_file->private_data; unsigned long offset; struct page *page; diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index 9d33c1e85c79..aec9c92250fd 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -3245,10 +3245,9 @@ static unsigned int snd_pcm_capture_poll(struct file *file, poll_table * wait) /* * mmap status record */ -static int snd_pcm_mmap_status_fault(struct vm_area_struct *area, - struct vm_fault *vmf) +static int snd_pcm_mmap_status_fault(struct vm_fault *vmf) { - struct snd_pcm_substream *substream = area->vm_private_data; + struct snd_pcm_substream *substream = vmf->vma->vm_private_data; struct snd_pcm_runtime *runtime; if (substream == NULL) @@ -3282,10 +3281,9 @@ static int snd_pcm_mmap_status(struct snd_pcm_substream *substream, struct file /* * mmap control record */ -static int snd_pcm_mmap_control_fault(struct vm_area_struct *area, - struct vm_fault *vmf) +static int snd_pcm_mmap_control_fault(struct vm_fault *vmf) { - struct snd_pcm_substream *substream = area->vm_private_data; + struct snd_pcm_substream *substream = vmf->vma->vm_private_data; struct snd_pcm_runtime *runtime; if (substream == NULL) @@ -3341,10 +3339,9 @@ snd_pcm_default_page_ops(struct snd_pcm_substream *substream, unsigned long ofs) /* * fault callback for mmapping a RAM page */ -static int snd_pcm_mmap_data_fault(struct vm_area_struct *area, - struct vm_fault *vmf) +static int snd_pcm_mmap_data_fault(struct vm_fault *vmf) { - struct snd_pcm_substream *substream = area->vm_private_data; + struct snd_pcm_substream *substream = vmf->vma->vm_private_data; struct snd_pcm_runtime *runtime; unsigned long offset; struct page * page; diff --git a/sound/usb/usx2y/us122l.c b/sound/usb/usx2y/us122l.c index cf5dc33f4a6d..cf45bf1f7ee0 100644 --- a/sound/usb/usx2y/us122l.c +++ b/sound/usb/usx2y/us122l.c @@ -137,13 +137,12 @@ static void usb_stream_hwdep_vm_open(struct vm_area_struct *area) snd_printdd(KERN_DEBUG "%i\n", atomic_read(&us122l->mmap_count)); } -static int usb_stream_hwdep_vm_fault(struct vm_area_struct *area, - struct vm_fault *vmf) +static int usb_stream_hwdep_vm_fault(struct vm_fault *vmf) { unsigned long offset; struct page *page; void *vaddr; - struct us122l *us122l = area->vm_private_data; + struct us122l *us122l = vmf->vma->vm_private_data; struct usb_stream *s; mutex_lock(&us122l->mutex); diff --git a/sound/usb/usx2y/usX2Yhwdep.c b/sound/usb/usx2y/usX2Yhwdep.c index 0b34dbc8f302..605e1047c01d 100644 --- a/sound/usb/usx2y/usX2Yhwdep.c +++ b/sound/usb/usx2y/usX2Yhwdep.c @@ -31,19 +31,18 @@ #include "usbusx2y.h" #include "usX2Yhwdep.h" -static int snd_us428ctls_vm_fault(struct vm_area_struct *area, - struct vm_fault *vmf) +static int snd_us428ctls_vm_fault(struct vm_fault *vmf) { unsigned long offset; struct page * page; void *vaddr; snd_printdd("ENTER, start %lXh, pgoff %ld\n", - area->vm_start, + vmf->vma->vm_start, vmf->pgoff); offset = vmf->pgoff << PAGE_SHIFT; - vaddr = (char*)((struct usX2Ydev *)area->vm_private_data)->us428ctls_sharedmem + offset; + vaddr = (char *)((struct usX2Ydev *)vmf->vma->vm_private_data)->us428ctls_sharedmem + offset; page = virt_to_page(vaddr); get_page(page); vmf->page = page; diff --git a/sound/usb/usx2y/usx2yhwdeppcm.c b/sound/usb/usx2y/usx2yhwdeppcm.c index 90766a92e7fd..f95164b91152 100644 --- a/sound/usb/usx2y/usx2yhwdeppcm.c +++ b/sound/usb/usx2y/usx2yhwdeppcm.c @@ -652,14 +652,13 @@ static void snd_usX2Y_hwdep_pcm_vm_close(struct vm_area_struct *area) } -static int snd_usX2Y_hwdep_pcm_vm_fault(struct vm_area_struct *area, - struct vm_fault *vmf) +static int snd_usX2Y_hwdep_pcm_vm_fault(struct vm_fault *vmf) { unsigned long offset; void *vaddr; offset = vmf->pgoff << PAGE_SHIFT; - vaddr = (char*)((struct usX2Ydev *)area->vm_private_data)->hwdep_pcm_shm + offset; + vaddr = (char *)((struct usX2Ydev *)vmf->vma->vm_private_data)->hwdep_pcm_shm + offset; vmf->page = virt_to_page(vaddr); get_page(vmf->page); return 0; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index cc4d6e0dd2a2..5b0dd4a9b2cb 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2350,9 +2350,9 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me) } EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin); -static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int kvm_vcpu_fault(struct vm_fault *vmf) { - struct kvm_vcpu *vcpu = vma->vm_file->private_data; + struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data; struct page *page; if (vmf->pgoff == 0) -- cgit v1.2.3 From a2d581675d485eb7188f521f36efc114639a3096 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 24 Feb 2017 14:56:59 -0800 Subject: mm,fs,dax: change ->pmd_fault to ->huge_fault Patch series "1G transparent hugepage support for device dax", v2. The following series implements support for 1G trasparent hugepage on x86 for device dax. The bulk of the code was written by Mathew Wilcox a while back supporting transparent 1G hugepage for fs DAX. I have forward ported the relevant bits to 4.10-rc. The current submission has only the necessary code to support device DAX. Comments from Dan Williams: So the motivation and intended user of this functionality mirrors the motivation and users of 1GB page support in hugetlbfs. Given expected capacities of persistent memory devices an in-memory database may want to reduce tlb pressure beyond what they can already achieve with 2MB mappings of a device-dax file. We have customer feedback to that effect as Willy mentioned in his previous version of these patches [1]. [1]: https://lkml.org/lkml/2016/1/31/52 Comments from Nilesh @ Oracle: There are applications which have a process model; and if you assume 10,000 processes attempting to mmap all the 6TB memory available on a server; we are looking at the following: processes : 10,000 memory : 6TB pte @ 4k page size: 8 bytes / 4K of memory * #processes = 6TB / 4k * 8 * 10000 = 1.5GB * 80000 = 120,000GB pmd @ 2M page size: 120,000 / 512 = ~240GB pud @ 1G page size: 240GB / 512 = ~480MB As you can see with 2M pages, this system will use up an exorbitant amount of DRAM to hold the page tables; but the 1G pages finally brings it down to a reasonable level. Memory sizes will keep increasing; so this number will keep increasing. An argument can be made to convert the applications from process model to thread model, but in the real world that may not be always practical. Hopefully this helps explain the use case where this is valuable. This patch (of 3): In preparation for adding the ability to handle PUD pages, convert vm_operations_struct.pmd_fault to vm_operations_struct.huge_fault. The vm_fault structure is extended to include a union of the different page table pointers that may be needed, and three flag bits are reserved to indicate which type of pointer is in the union. [ross.zwisler@linux.intel.com: remove unused function ext4_dax_huge_fault()] Link: http://lkml.kernel.org/r/1485813172-7284-1-git-send-email-ross.zwisler@linux.intel.com [dave.jiang@intel.com: clear PMD or PUD size flags when in fall through path] Link: http://lkml.kernel.org/r/148589842696.5820.16078080610311444794.stgit@djiang5-desk3.ch.intel.com Link: http://lkml.kernel.org/r/148545058784.17912.6353162518188733642.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Matthew Wilcox Signed-off-by: Dave Jiang Signed-off-by: Ross Zwisler Cc: Dave Hansen Cc: Vlastimil Babka Cc: Jan Kara Cc: Dan Williams Cc: Kirill A. Shutemov Cc: Nilesh Choudhury Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Cc: Dave Jiang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/dax/dax.c | 34 +++++++++++++--------------------- fs/dax.c | 45 ++++++++++++++++++++++++++++++++------------- fs/ext2/file.c | 2 +- fs/ext4/file.c | 23 +---------------------- fs/xfs/xfs_file.c | 10 +++++----- fs/xfs/xfs_trace.h | 2 +- include/linux/dax.h | 6 ------ include/linux/mm.h | 10 +++++++++- mm/memory.c | 18 ++++++++++++------ 9 files changed, 74 insertions(+), 76 deletions(-) (limited to 'include') diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c index 0261f332bf3e..922ec461dcaa 100644 --- a/drivers/dax/dax.c +++ b/drivers/dax/dax.c @@ -419,7 +419,7 @@ static phys_addr_t pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff, return -1; } -static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) +static int __dax_dev_pte_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) { struct device *dev = &dax_dev->dev; struct dax_region *dax_region; @@ -455,23 +455,6 @@ static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) return VM_FAULT_NOPAGE; } -static int dax_dev_fault(struct vm_fault *vmf) -{ - struct vm_area_struct *vma = vmf->vma; - int rc; - struct file *filp = vma->vm_file; - struct dax_dev *dax_dev = filp->private_data; - - dev_dbg(&dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__, - current->comm, (vmf->flags & FAULT_FLAG_WRITE) - ? "write" : "read", vma->vm_start, vma->vm_end); - rcu_read_lock(); - rc = __dax_dev_fault(dax_dev, vmf); - rcu_read_unlock(); - - return rc; -} - static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) { unsigned long pmd_addr = vmf->address & PMD_MASK; @@ -510,7 +493,7 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) vmf->flags & FAULT_FLAG_WRITE); } -static int dax_dev_pmd_fault(struct vm_fault *vmf) +static int dax_dev_fault(struct vm_fault *vmf) { int rc; struct file *filp = vmf->vma->vm_file; @@ -522,7 +505,16 @@ static int dax_dev_pmd_fault(struct vm_fault *vmf) vmf->vma->vm_start, vmf->vma->vm_end); rcu_read_lock(); - rc = __dax_dev_pmd_fault(dax_dev, vmf); + switch (vmf->flags & FAULT_FLAG_SIZE_MASK) { + case FAULT_FLAG_SIZE_PTE: + rc = __dax_dev_pte_fault(dax_dev, vmf); + break; + case FAULT_FLAG_SIZE_PMD: + rc = __dax_dev_pmd_fault(dax_dev, vmf); + break; + default: + return VM_FAULT_FALLBACK; + } rcu_read_unlock(); return rc; @@ -530,7 +522,7 @@ static int dax_dev_pmd_fault(struct vm_fault *vmf) static const struct vm_operations_struct dax_dev_vm_ops = { .fault = dax_dev_fault, - .pmd_fault = dax_dev_pmd_fault, + .huge_fault = dax_dev_fault, }; static int dax_mmap(struct file *filp, struct vm_area_struct *vma) diff --git a/fs/dax.c b/fs/dax.c index f955c0df33bb..c3c29fbf64be 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1118,16 +1118,8 @@ static int dax_fault_return(int error) return VM_FAULT_SIGBUS; } -/** - * dax_iomap_fault - handle a page fault on a DAX file - * @vmf: The description of the fault - * @ops: iomap ops passed from the file system - * - * When a page fault occurs, filesystems may call this helper in their fault - * or mkwrite handler for DAX files. Assumes the caller has done all the - * necessary locking for the page fault to proceed successfully. - */ -int dax_iomap_fault(struct vm_fault *vmf, const struct iomap_ops *ops) +static int dax_iomap_pte_fault(struct vm_fault *vmf, + const struct iomap_ops *ops) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; struct inode *inode = mapping->host; @@ -1244,7 +1236,6 @@ int dax_iomap_fault(struct vm_fault *vmf, const struct iomap_ops *ops) } return vmf_ret; } -EXPORT_SYMBOL_GPL(dax_iomap_fault); #ifdef CONFIG_FS_DAX_PMD /* @@ -1335,7 +1326,8 @@ fallback: return VM_FAULT_FALLBACK; } -int dax_iomap_pmd_fault(struct vm_fault *vmf, const struct iomap_ops *ops) +static int dax_iomap_pmd_fault(struct vm_fault *vmf, + const struct iomap_ops *ops) { struct vm_area_struct *vma = vmf->vma; struct address_space *mapping = vma->vm_file->f_mapping; @@ -1443,5 +1435,32 @@ out: trace_dax_pmd_fault_done(inode, vmf, max_pgoff, result); return result; } -EXPORT_SYMBOL_GPL(dax_iomap_pmd_fault); +#else +static int dax_iomap_pmd_fault(struct vm_fault *vmf, struct iomap_ops *ops) +{ + return VM_FAULT_FALLBACK; +} #endif /* CONFIG_FS_DAX_PMD */ + +/** + * dax_iomap_fault - handle a page fault on a DAX file + * @vmf: The description of the fault + * @ops: iomap ops passed from the file system + * + * When a page fault occurs, filesystems may call this helper in + * their fault handler for DAX files. dax_iomap_fault() assumes the caller + * has done all the necessary locking for page fault to proceed + * successfully. + */ +int dax_iomap_fault(struct vm_fault *vmf, const struct iomap_ops *ops) +{ + switch (vmf->flags & FAULT_FLAG_SIZE_MASK) { + case FAULT_FLAG_SIZE_PTE: + return dax_iomap_pte_fault(vmf, ops); + case FAULT_FLAG_SIZE_PMD: + return dax_iomap_pmd_fault(vmf, ops); + default: + return VM_FAULT_FALLBACK; + } +} +EXPORT_SYMBOL_GPL(dax_iomap_fault); diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 0bf0d971205a..68738832beda 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -133,7 +133,7 @@ static int ext2_dax_pfn_mkwrite(struct vm_fault *vmf) static const struct vm_operations_struct ext2_dax_vm_ops = { .fault = ext2_dax_fault, /* - * .pmd_fault is not supported for DAX because allocation in ext2 + * .huge_fault is not supported for DAX because allocation in ext2 * cannot be reliably aligned to huge page sizes and so pmd faults * will always fail and fail back to regular faults. */ diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 21e1f17fe36d..502d2d07d191 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -273,27 +273,6 @@ static int ext4_dax_fault(struct vm_fault *vmf) return result; } -static int -ext4_dax_pmd_fault(struct vm_fault *vmf) -{ - int result; - struct inode *inode = file_inode(vmf->vma->vm_file); - struct super_block *sb = inode->i_sb; - bool write = vmf->flags & FAULT_FLAG_WRITE; - - if (write) { - sb_start_pagefault(sb); - file_update_time(vmf->vma->vm_file); - } - down_read(&EXT4_I(inode)->i_mmap_sem); - result = dax_iomap_pmd_fault(vmf, &ext4_iomap_ops); - up_read(&EXT4_I(inode)->i_mmap_sem); - if (write) - sb_end_pagefault(sb); - - return result; -} - /* * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_fault() * handler we check for races agaist truncate. Note that since we cycle through @@ -326,7 +305,7 @@ static int ext4_dax_pfn_mkwrite(struct vm_fault *vmf) static const struct vm_operations_struct ext4_dax_vm_ops = { .fault = ext4_dax_fault, - .pmd_fault = ext4_dax_pmd_fault, + .huge_fault = ext4_dax_fault, .page_mkwrite = ext4_dax_fault, .pfn_mkwrite = ext4_dax_pfn_mkwrite, }; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 9cc10136ba0b..990e03819370 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1429,12 +1429,12 @@ xfs_filemap_fault( /* * Similar to xfs_filemap_fault(), the DAX fault path can call into here on * both read and write faults. Hence we need to handle both cases. There is no - * ->pmd_mkwrite callout for huge pages, so we have a single function here to + * ->huge_mkwrite callout for huge pages, so we have a single function here to * handle both cases here. @flags carries the information on the type of fault * occuring. */ STATIC int -xfs_filemap_pmd_fault( +xfs_filemap_huge_fault( struct vm_fault *vmf) { struct inode *inode = file_inode(vmf->vma->vm_file); @@ -1444,7 +1444,7 @@ xfs_filemap_pmd_fault( if (!IS_DAX(inode)) return VM_FAULT_FALLBACK; - trace_xfs_filemap_pmd_fault(ip); + trace_xfs_filemap_huge_fault(ip); if (vmf->flags & FAULT_FLAG_WRITE) { sb_start_pagefault(inode->i_sb); @@ -1452,7 +1452,7 @@ xfs_filemap_pmd_fault( } xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - ret = dax_iomap_pmd_fault(vmf, &xfs_iomap_ops); + ret = dax_iomap_fault(vmf, &xfs_iomap_ops); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (vmf->flags & FAULT_FLAG_WRITE) @@ -1497,7 +1497,7 @@ xfs_filemap_pfn_mkwrite( static const struct vm_operations_struct xfs_file_vm_ops = { .fault = xfs_filemap_fault, - .pmd_fault = xfs_filemap_pmd_fault, + .huge_fault = xfs_filemap_huge_fault, .map_pages = filemap_map_pages, .page_mkwrite = xfs_filemap_page_mkwrite, .pfn_mkwrite = xfs_filemap_pfn_mkwrite, diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index fb7555e73a62..383ac227ce2c 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -687,7 +687,7 @@ DEFINE_INODE_EVENT(xfs_inode_clear_cowblocks_tag); DEFINE_INODE_EVENT(xfs_inode_free_cowblocks_invalid); DEFINE_INODE_EVENT(xfs_filemap_fault); -DEFINE_INODE_EVENT(xfs_filemap_pmd_fault); +DEFINE_INODE_EVENT(xfs_filemap_huge_fault); DEFINE_INODE_EVENT(xfs_filemap_page_mkwrite); DEFINE_INODE_EVENT(xfs_filemap_pfn_mkwrite); diff --git a/include/linux/dax.h b/include/linux/dax.h index eeb02421c848..cf9af225962b 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -70,17 +70,11 @@ static inline unsigned int dax_radix_order(void *entry) return PMD_SHIFT - PAGE_SHIFT; return 0; } -int dax_iomap_pmd_fault(struct vm_fault *vmf, const struct iomap_ops *ops); #else static inline unsigned int dax_radix_order(void *entry) { return 0; } -static inline int dax_iomap_pmd_fault(struct vm_fault *vmf, - const struct iomap_ops *ops) -{ - return VM_FAULT_FALLBACK; -} #endif int dax_pfn_mkwrite(struct vm_fault *vmf); diff --git a/include/linux/mm.h b/include/linux/mm.h index 3dd80ba6568a..035a688e5472 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -285,6 +285,11 @@ extern pgprot_t protection_map[16]; #define FAULT_FLAG_REMOTE 0x80 /* faulting for non current tsk/mm */ #define FAULT_FLAG_INSTRUCTION 0x100 /* The fault was during an instruction fetch */ +#define FAULT_FLAG_SIZE_MASK 0x7000 /* Support up to 8-level page tables */ +#define FAULT_FLAG_SIZE_PTE 0x0000 /* First level (eg 4k) */ +#define FAULT_FLAG_SIZE_PMD 0x1000 /* Second level (eg 2MB) */ +#define FAULT_FLAG_SIZE_PUD 0x2000 /* Third level (eg 1GB) */ + #define FAULT_FLAG_TRACE \ { FAULT_FLAG_WRITE, "WRITE" }, \ { FAULT_FLAG_MKWRITE, "MKWRITE" }, \ @@ -314,6 +319,9 @@ struct vm_fault { unsigned long address; /* Faulting virtual address */ pmd_t *pmd; /* Pointer to pmd entry matching * the 'address' */ + pud_t *pud; /* Pointer to pud entry matching + * the 'address' + */ pte_t orig_pte; /* Value of PTE at the time of fault */ struct page *cow_page; /* Page handler may use for COW fault */ @@ -351,7 +359,7 @@ struct vm_operations_struct { void (*close)(struct vm_area_struct * area); int (*mremap)(struct vm_area_struct * area); int (*fault)(struct vm_fault *vmf); - int (*pmd_fault)(struct vm_fault *vmf); + int (*huge_fault)(struct vm_fault *vmf); void (*map_pages)(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); diff --git a/mm/memory.c b/mm/memory.c index cf97d88158cd..e721e8eba570 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3466,8 +3466,8 @@ static int create_huge_pmd(struct vm_fault *vmf) { if (vma_is_anonymous(vmf->vma)) return do_huge_pmd_anonymous_page(vmf); - if (vmf->vma->vm_ops->pmd_fault) - return vmf->vma->vm_ops->pmd_fault(vmf); + if (vmf->vma->vm_ops->huge_fault) + return vmf->vma->vm_ops->huge_fault(vmf); return VM_FAULT_FALLBACK; } @@ -3475,8 +3475,8 @@ static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd) { if (vma_is_anonymous(vmf->vma)) return do_huge_pmd_wp_page(vmf, orig_pmd); - if (vmf->vma->vm_ops->pmd_fault) - return vmf->vma->vm_ops->pmd_fault(vmf); + if (vmf->vma->vm_ops->huge_fault) + return vmf->vma->vm_ops->huge_fault(vmf); /* COW handled on pte level: split pmd */ VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma); @@ -3606,6 +3606,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; pud_t *pud; + int ret; pgd = pgd_offset(mm, address); pud = pud_alloc(mm, pgd, address); @@ -3615,15 +3616,18 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, if (!vmf.pmd) return VM_FAULT_OOM; if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) { - int ret = create_huge_pmd(&vmf); + vmf.flags |= FAULT_FLAG_SIZE_PMD; + ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; + /* fall through path, remove PMD flag */ + vmf.flags &= ~FAULT_FLAG_SIZE_PMD; } else { pmd_t orig_pmd = *vmf.pmd; - int ret; barrier(); if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) { + vmf.flags |= FAULT_FLAG_SIZE_PMD; if (pmd_protnone(orig_pmd) && vma_is_accessible(vma)) return do_huge_pmd_numa_page(&vmf, orig_pmd); @@ -3632,6 +3636,8 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, ret = wp_huge_pmd(&vmf, orig_pmd); if (!(ret & VM_FAULT_FALLBACK)) return ret; + /* fall through path, remove PUD flag */ + vmf.flags &= ~FAULT_FLAG_SIZE_PUD; } else { huge_pmd_set_accessed(&vmf, orig_pmd); return 0; -- cgit v1.2.3 From a00cc7d9dd93d66a3fb83fc52aa57a4bec51c517 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 24 Feb 2017 14:57:02 -0800 Subject: mm, x86: add support for PUD-sized transparent hugepages The current transparent hugepage code only supports PMDs. This patch adds support for transparent use of PUDs with DAX. It does not include support for anonymous pages. x86 support code also added. Most of this patch simply parallels the work that was done for huge PMDs. The only major difference is how the new ->pud_entry method in mm_walk works. The ->pmd_entry method replaces the ->pte_entry method, whereas the ->pud_entry method works along with either ->pmd_entry or ->pte_entry. The pagewalk code takes care of locking the PUD before calling ->pud_walk, so handlers do not need to worry whether the PUD is stable. [dave.jiang@intel.com: fix SMP x86 32bit build for native_pud_clear()] Link: http://lkml.kernel.org/r/148719066814.31111.3239231168815337012.stgit@djiang5-desk3.ch.intel.com [dave.jiang@intel.com: native_pud_clear missing on i386 build] Link: http://lkml.kernel.org/r/148640375195.69754.3315433724330910314.stgit@djiang5-desk3.ch.intel.com Link: http://lkml.kernel.org/r/148545059381.17912.8602162635537598445.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Matthew Wilcox Signed-off-by: Dave Jiang Tested-by: Alexander Kapshuk Cc: Dave Hansen Cc: Vlastimil Babka Cc: Jan Kara Cc: Dan Williams Cc: Ross Zwisler Cc: Kirill A. Shutemov Cc: Nilesh Choudhury Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/Kconfig | 3 + arch/x86/Kconfig | 1 + arch/x86/include/asm/paravirt.h | 11 ++ arch/x86/include/asm/paravirt_types.h | 2 + arch/x86/include/asm/pgtable-2level.h | 17 +++ arch/x86/include/asm/pgtable-3level.h | 30 ++++ arch/x86/include/asm/pgtable.h | 140 +++++++++++++++++++ arch/x86/include/asm/pgtable_64.h | 15 ++ arch/x86/kernel/paravirt.c | 1 + arch/x86/mm/pgtable.c | 31 +++++ include/asm-generic/pgtable.h | 80 ++++++++++- include/asm-generic/tlb.h | 14 ++ include/linux/huge_mm.h | 83 +++++++++++- include/linux/mm.h | 30 +++- include/linux/mmu_notifier.h | 14 ++ include/linux/pfn_t.h | 12 ++ mm/gup.c | 7 + mm/huge_memory.c | 249 ++++++++++++++++++++++++++++++++++ mm/memory.c | 88 +++++++++++- mm/pagewalk.c | 20 ++- mm/pgtable-generic.c | 14 ++ 21 files changed, 844 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/arch/Kconfig b/arch/Kconfig index f761142976e5..d0012add6b19 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -571,6 +571,9 @@ config HAVE_IRQ_TIME_ACCOUNTING config HAVE_ARCH_TRANSPARENT_HUGEPAGE bool +config HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD + bool + config HAVE_ARCH_HUGE_VMAP bool diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 874c1238dffd..33007aa74111 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -109,6 +109,7 @@ config X86 select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE + select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD if X86_64 select HAVE_ARCH_VMAP_STACK if X86_64 select HAVE_ARCH_WITHIN_STACK_FRAMES select HAVE_CC_STACKPROTECTOR diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index f75fbfe550f2..0489884fdc44 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -475,6 +475,17 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, native_pmd_val(pmd)); } +static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, pud_t pud) +{ + if (sizeof(pudval_t) > sizeof(long)) + /* 5 arg words */ + pv_mmu_ops.set_pud_at(mm, addr, pudp, pud); + else + PVOP_VCALL4(pv_mmu_ops.set_pud_at, mm, addr, pudp, + native_pud_val(pud)); +} + static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) { pmdval_t val = native_pmd_val(pmd); diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index bb2de45a60f2..b060f962d581 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -249,6 +249,8 @@ struct pv_mmu_ops { void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval); void (*set_pmd_at)(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmdval); + void (*set_pud_at)(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, pud_t pudval); void (*pte_update)(struct mm_struct *mm, unsigned long addr, pte_t *ptep); diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h index fd74a11959de..a8b96e708c2b 100644 --- a/arch/x86/include/asm/pgtable-2level.h +++ b/arch/x86/include/asm/pgtable-2level.h @@ -21,6 +21,10 @@ static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) *pmdp = pmd; } +static inline void native_set_pud(pud_t *pudp, pud_t pud) +{ +} + static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) { native_set_pte(ptep, pte); @@ -31,6 +35,10 @@ static inline void native_pmd_clear(pmd_t *pmdp) native_set_pmd(pmdp, __pmd(0)); } +static inline void native_pud_clear(pud_t *pudp) +{ +} + static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *xp) { @@ -55,6 +63,15 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp) #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) #endif +#ifdef CONFIG_SMP +static inline pud_t native_pudp_get_and_clear(pud_t *xp) +{ + return __pud(xchg((pudval_t *)xp, 0)); +} +#else +#define native_pudp_get_and_clear(xp) native_local_pudp_get_and_clear(xp) +#endif + /* Bit manipulation helper on pte/pgoff entry */ static inline unsigned long pte_bitop(unsigned long value, unsigned int rightshift, unsigned long mask, unsigned int leftshift) diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index cdaa58c9b39e..8f50fb3f04e1 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -121,6 +121,12 @@ static inline void native_pmd_clear(pmd_t *pmd) *(tmp + 1) = 0; } +#ifndef CONFIG_SMP +static inline void native_pud_clear(pud_t *pudp) +{ +} +#endif + static inline void pud_clear(pud_t *pudp) { set_pud(pudp, __pud(0)); @@ -176,6 +182,30 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp) #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) #endif +#ifdef CONFIG_SMP +union split_pud { + struct { + u32 pud_low; + u32 pud_high; + }; + pud_t pud; +}; + +static inline pud_t native_pudp_get_and_clear(pud_t *pudp) +{ + union split_pud res, *orig = (union split_pud *)pudp; + + /* xchg acts as a barrier before setting of the high bits */ + res.pud_low = xchg(&orig->pud_low, 0); + res.pud_high = orig->pud_high; + orig->pud_high = 0; + + return res.pud; +} +#else +#define native_pudp_get_and_clear(xp) native_local_pudp_get_and_clear(xp) +#endif + /* Encode and de-code a swap entry */ #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5) #define __swp_type(x) (((x).val) & 0x1f) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 437feb436efa..1cfb36b8c024 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -46,6 +46,7 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page); #define set_pte(ptep, pte) native_set_pte(ptep, pte) #define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte) #define set_pmd_at(mm, addr, pmdp, pmd) native_set_pmd_at(mm, addr, pmdp, pmd) +#define set_pud_at(mm, addr, pudp, pud) native_set_pud_at(mm, addr, pudp, pud) #define set_pte_atomic(ptep, pte) \ native_set_pte_atomic(ptep, pte) @@ -128,6 +129,16 @@ static inline int pmd_young(pmd_t pmd) return pmd_flags(pmd) & _PAGE_ACCESSED; } +static inline int pud_dirty(pud_t pud) +{ + return pud_flags(pud) & _PAGE_DIRTY; +} + +static inline int pud_young(pud_t pud) +{ + return pud_flags(pud) & _PAGE_ACCESSED; +} + static inline int pte_write(pte_t pte) { return pte_flags(pte) & _PAGE_RW; @@ -181,6 +192,13 @@ static inline int pmd_trans_huge(pmd_t pmd) return (pmd_val(pmd) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE; } +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +static inline int pud_trans_huge(pud_t pud) +{ + return (pud_val(pud) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE; +} +#endif + #define has_transparent_hugepage has_transparent_hugepage static inline int has_transparent_hugepage(void) { @@ -192,6 +210,18 @@ static inline int pmd_devmap(pmd_t pmd) { return !!(pmd_val(pmd) & _PAGE_DEVMAP); } + +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +static inline int pud_devmap(pud_t pud) +{ + return !!(pud_val(pud) & _PAGE_DEVMAP); +} +#else +static inline int pud_devmap(pud_t pud) +{ + return 0; +} +#endif #endif #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -333,6 +363,65 @@ static inline pmd_t pmd_mknotpresent(pmd_t pmd) return pmd_clear_flags(pmd, _PAGE_PRESENT | _PAGE_PROTNONE); } +static inline pud_t pud_set_flags(pud_t pud, pudval_t set) +{ + pudval_t v = native_pud_val(pud); + + return __pud(v | set); +} + +static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear) +{ + pudval_t v = native_pud_val(pud); + + return __pud(v & ~clear); +} + +static inline pud_t pud_mkold(pud_t pud) +{ + return pud_clear_flags(pud, _PAGE_ACCESSED); +} + +static inline pud_t pud_mkclean(pud_t pud) +{ + return pud_clear_flags(pud, _PAGE_DIRTY); +} + +static inline pud_t pud_wrprotect(pud_t pud) +{ + return pud_clear_flags(pud, _PAGE_RW); +} + +static inline pud_t pud_mkdirty(pud_t pud) +{ + return pud_set_flags(pud, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); +} + +static inline pud_t pud_mkdevmap(pud_t pud) +{ + return pud_set_flags(pud, _PAGE_DEVMAP); +} + +static inline pud_t pud_mkhuge(pud_t pud) +{ + return pud_set_flags(pud, _PAGE_PSE); +} + +static inline pud_t pud_mkyoung(pud_t pud) +{ + return pud_set_flags(pud, _PAGE_ACCESSED); +} + +static inline pud_t pud_mkwrite(pud_t pud) +{ + return pud_set_flags(pud, _PAGE_RW); +} + +static inline pud_t pud_mknotpresent(pud_t pud) +{ + return pud_clear_flags(pud, _PAGE_PRESENT | _PAGE_PROTNONE); +} + #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY static inline int pte_soft_dirty(pte_t pte) { @@ -344,6 +433,11 @@ static inline int pmd_soft_dirty(pmd_t pmd) return pmd_flags(pmd) & _PAGE_SOFT_DIRTY; } +static inline int pud_soft_dirty(pud_t pud) +{ + return pud_flags(pud) & _PAGE_SOFT_DIRTY; +} + static inline pte_t pte_mksoft_dirty(pte_t pte) { return pte_set_flags(pte, _PAGE_SOFT_DIRTY); @@ -354,6 +448,11 @@ static inline pmd_t pmd_mksoft_dirty(pmd_t pmd) return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY); } +static inline pud_t pud_mksoft_dirty(pud_t pud) +{ + return pud_set_flags(pud, _PAGE_SOFT_DIRTY); +} + static inline pte_t pte_clear_soft_dirty(pte_t pte) { return pte_clear_flags(pte, _PAGE_SOFT_DIRTY); @@ -364,6 +463,11 @@ static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd) return pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY); } +static inline pud_t pud_clear_soft_dirty(pud_t pud) +{ + return pud_clear_flags(pud, _PAGE_SOFT_DIRTY); +} + #endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ /* @@ -392,6 +496,12 @@ static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) massage_pgprot(pgprot)); } +static inline pud_t pfn_pud(unsigned long page_nr, pgprot_t pgprot) +{ + return __pud(((phys_addr_t)page_nr << PAGE_SHIFT) | + massage_pgprot(pgprot)); +} + static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { pteval_t val = pte_val(pte); @@ -771,6 +881,14 @@ static inline pmd_t native_local_pmdp_get_and_clear(pmd_t *pmdp) return res; } +static inline pud_t native_local_pudp_get_and_clear(pud_t *pudp) +{ + pud_t res = *pudp; + + native_pud_clear(pudp); + return res; +} + static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep , pte_t pte) { @@ -783,6 +901,12 @@ static inline void native_set_pmd_at(struct mm_struct *mm, unsigned long addr, native_set_pmd(pmdp, pmd); } +static inline void native_set_pud_at(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, pud_t pud) +{ + native_set_pud(pudp, pud); +} + #ifndef CONFIG_PARAVIRT /* * Rules for using pte_update - it must be called after any PTE update which @@ -861,10 +985,15 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, extern int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty); +extern int pudp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pud_t *pudp, + pud_t entry, int dirty); #define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG extern int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp); +extern int pudp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pud_t *pudp); #define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH extern int pmdp_clear_flush_young(struct vm_area_struct *vma, @@ -884,6 +1013,13 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long return native_pmdp_get_and_clear(pmdp); } +#define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR +static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, + unsigned long addr, pud_t *pudp) +{ + return native_pudp_get_and_clear(pudp); +} + #define __HAVE_ARCH_PMDP_SET_WRPROTECT static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) @@ -932,6 +1068,10 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd) { } +static inline void update_mmu_cache_pud(struct vm_area_struct *vma, + unsigned long addr, pud_t *pud) +{ +} #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY static inline pte_t pte_swp_mksoft_dirty(pte_t pte) diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 62b775926045..73c7ccc38912 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -106,6 +106,21 @@ static inline void native_pud_clear(pud_t *pud) native_set_pud(pud, native_make_pud(0)); } +static inline pud_t native_pudp_get_and_clear(pud_t *xp) +{ +#ifdef CONFIG_SMP + return native_make_pud(xchg(&xp->pud, 0)); +#else + /* native_local_pudp_get_and_clear, + * but duplicated because of cyclic dependency + */ + pud_t ret = *xp; + + native_pud_clear(xp); + return ret; +#endif +} + static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) { *pgdp = pgd; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index a1bfba0f7234..4797e87b0fb6 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -425,6 +425,7 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = { .pmd_clear = native_pmd_clear, #endif .set_pud = native_set_pud, + .set_pud_at = native_set_pud_at, .pmd_val = PTE_IDENT, .make_pmd = PTE_IDENT, diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 3feec5af4e67..6cbdff26bb96 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -445,6 +445,26 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, return changed; } + +int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, + pud_t *pudp, pud_t entry, int dirty) +{ + int changed = !pud_same(*pudp, entry); + + VM_BUG_ON(address & ~HPAGE_PUD_MASK); + + if (changed && dirty) { + *pudp = entry; + /* + * We had a write-protection fault here and changed the pud + * to to more permissive. No need to flush the TLB for that, + * #PF is architecturally guaranteed to do that and in the + * worst-case we'll generate a spurious fault. + */ + } + + return changed; +} #endif int ptep_test_and_clear_young(struct vm_area_struct *vma, @@ -474,6 +494,17 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma, return ret; } +int pudp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pud_t *pudp) +{ + int ret = 0; + + if (pud_young(*pudp)) + ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, + (unsigned long *)pudp); + + return ret; +} #endif int ptep_clear_flush_young(struct vm_area_struct *vma, diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 18af2bcefe6a..a0aba0f9c57b 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -36,6 +36,9 @@ extern int ptep_set_access_flags(struct vm_area_struct *vma, extern int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty); +extern int pudp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pud_t *pudp, + pud_t entry, int dirty); #else static inline int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, @@ -44,6 +47,13 @@ static inline int pmdp_set_access_flags(struct vm_area_struct *vma, BUILD_BUG(); return 0; } +static inline int pudp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pud_t *pudp, + pud_t entry, int dirty) +{ + BUILD_BUG(); + return 0; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif @@ -121,8 +131,8 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, } #endif -#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR #ifdef CONFIG_TRANSPARENT_HUGEPAGE +#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long address, pmd_t *pmdp) @@ -131,20 +141,40 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, pmd_clear(pmdp); return pmd; } +#endif /* __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR */ +#ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR +static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, + unsigned long address, + pud_t *pudp) +{ + pud_t pud = *pudp; + + pud_clear(pudp); + return pud; +} +#endif /* __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR */ #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -#endif -#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL #ifdef CONFIG_TRANSPARENT_HUGEPAGE +#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL static inline pmd_t pmdp_huge_get_and_clear_full(struct mm_struct *mm, unsigned long address, pmd_t *pmdp, int full) { return pmdp_huge_get_and_clear(mm, address, pmdp); } -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif +#ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR_FULL +static inline pud_t pudp_huge_get_and_clear_full(struct mm_struct *mm, + unsigned long address, pud_t *pudp, + int full) +{ + return pudp_huge_get_and_clear(mm, address, pudp); +} +#endif +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long address, pte_t *ptep, @@ -181,6 +211,9 @@ extern pte_t ptep_clear_flush(struct vm_area_struct *vma, extern pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); +extern pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, + unsigned long address, + pud_t *pudp); #endif #ifndef __HAVE_ARCH_PTEP_SET_WRPROTECT @@ -208,6 +241,23 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif +#ifndef __HAVE_ARCH_PUDP_SET_WRPROTECT +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +static inline void pudp_set_wrprotect(struct mm_struct *mm, + unsigned long address, pud_t *pudp) +{ + pud_t old_pud = *pudp; + + set_pud_at(mm, address, pudp, pud_wrprotect(old_pud)); +} +#else +static inline void pudp_set_wrprotect(struct mm_struct *mm, + unsigned long address, pud_t *pudp) +{ + BUILD_BUG(); +} +#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ +#endif #ifndef pmdp_collapse_flush #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -273,12 +323,23 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) { return pmd_val(pmd_a) == pmd_val(pmd_b); } + +static inline int pud_same(pud_t pud_a, pud_t pud_b) +{ + return pud_val(pud_a) == pud_val(pud_b); +} #else /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) { BUILD_BUG(); return 0; } + +static inline int pud_same(pud_t pud_a, pud_t pud_b) +{ + BUILD_BUG(); + return 0; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif @@ -640,6 +701,15 @@ static inline int pmd_write(pmd_t pmd) #endif /* __HAVE_ARCH_PMD_WRITE */ #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#if !defined(CONFIG_TRANSPARENT_HUGEPAGE) || \ + (defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ + !defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)) +static inline int pud_trans_huge(pud_t pud) +{ + return 0; +} +#endif + #ifndef pmd_read_atomic static inline pmd_t pmd_read_atomic(pmd_t *pmdp) { @@ -785,8 +855,10 @@ static inline int pmd_clear_huge(pmd_t *pmd) * e.g. see arch/arc: flush_pmd_tlb_range */ #define flush_pmd_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) +#define flush_pud_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) #else #define flush_pmd_tlb_range(vma, addr, end) BUILD_BUG() +#define flush_pud_tlb_range(vma, addr, end) BUILD_BUG() #endif #endif diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 7eed8cf3130a..4329bc6ef04b 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -232,6 +232,20 @@ static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, __tlb_remove_pmd_tlb_entry(tlb, pmdp, address); \ } while (0) +/** + * tlb_remove_pud_tlb_entry - remember a pud mapping for later tlb + * invalidation. This is a nop so far, because only x86 needs it. + */ +#ifndef __tlb_remove_pud_tlb_entry +#define __tlb_remove_pud_tlb_entry(tlb, pudp, address) do {} while (0) +#endif + +#define tlb_remove_pud_tlb_entry(tlb, pudp, address) \ + do { \ + __tlb_adjust_range(tlb, address, HPAGE_PUD_SIZE); \ + __tlb_remove_pud_tlb_entry(tlb, pudp, address); \ + } while (0) + /* * For things like page tables caches (ie caching addresses "inside" the * page tables, like x86 does), for legacy reasons, flushing an diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index f0029e786205..a3762d49ba39 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -6,6 +6,18 @@ extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, struct vm_area_struct *vma); extern void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd); +extern int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pud_t *dst_pud, pud_t *src_pud, unsigned long addr, + struct vm_area_struct *vma); + +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +extern void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud); +#else +static inline void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud) +{ +} +#endif + extern int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd); extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, unsigned long addr, @@ -17,6 +29,9 @@ extern bool madvise_free_huge_pmd(struct mmu_gather *tlb, extern int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr); +extern int zap_huge_pud(struct mmu_gather *tlb, + struct vm_area_struct *vma, + pud_t *pud, unsigned long addr); extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, unsigned char *vec); @@ -26,8 +41,10 @@ extern bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, pgprot_t newprot, int prot_numa); -int vmf_insert_pfn_pmd(struct vm_area_struct *, unsigned long addr, pmd_t *, - pfn_t pfn, bool write); +int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd, pfn_t pfn, bool write); +int vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, + pud_t *pud, pfn_t pfn, bool write); enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_FLAG, TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, @@ -58,13 +75,14 @@ extern struct kobj_attribute shmem_enabled_attr; #define HPAGE_PMD_NR (1<vm_mm->mmap_sem), vma); + if (pud_trans_huge(*pud) || pud_devmap(*pud)) + return __pud_trans_huge_lock(pud, vma); + else + return NULL; +} static inline int hpage_nr_pages(struct page *page) { if (unlikely(PageTransHuge(page))) @@ -143,6 +183,11 @@ static inline int hpage_nr_pages(struct page *page) return 1; } +struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd, int flags); +struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, + pud_t *pud, int flags); + extern int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd); extern struct page *huge_zero_page; @@ -157,6 +202,11 @@ static inline bool is_huge_zero_pmd(pmd_t pmd) return is_huge_zero_page(pmd_page(pmd)); } +static inline bool is_huge_zero_pud(pud_t pud) +{ + return false; +} + struct page *mm_get_huge_zero_page(struct mm_struct *mm); void mm_put_huge_zero_page(struct mm_struct *mm); @@ -167,6 +217,10 @@ void mm_put_huge_zero_page(struct mm_struct *mm); #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) #define HPAGE_PMD_SIZE ({ BUILD_BUG(); 0; }) +#define HPAGE_PUD_SHIFT ({ BUILD_BUG(); 0; }) +#define HPAGE_PUD_MASK ({ BUILD_BUG(); 0; }) +#define HPAGE_PUD_SIZE ({ BUILD_BUG(); 0; }) + #define hpage_nr_pages(x) 1 #define transparent_hugepage_enabled(__vma) 0 @@ -195,6 +249,9 @@ static inline void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, static inline void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, bool freeze, struct page *page) {} +#define split_huge_pud(__vma, __pmd, __address) \ + do { } while (0) + static inline int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) { @@ -212,6 +269,11 @@ static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, { return NULL; } +static inline spinlock_t *pud_trans_huge_lock(pud_t *pud, + struct vm_area_struct *vma) +{ + return NULL; +} static inline int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd) { @@ -223,6 +285,11 @@ static inline bool is_huge_zero_page(struct page *page) return false; } +static inline bool is_huge_zero_pud(pud_t pud) +{ + return false; +} + static inline void mm_put_huge_zero_page(struct mm_struct *mm) { return; @@ -233,6 +300,12 @@ static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma, { return NULL; } + +static inline struct page *follow_devmap_pud(struct vm_area_struct *vma, + unsigned long addr, pud_t *pud, int flags) +{ + return NULL; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* _LINUX_HUGE_MM_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 035a688e5472..d8b75d7d6a9e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -424,6 +424,10 @@ static inline int pmd_devmap(pmd_t pmd) { return 0; } +static inline int pud_devmap(pud_t pud) +{ + return 0; +} #endif /* @@ -1199,6 +1203,10 @@ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma, /** * mm_walk - callbacks for walk_page_range + * @pud_entry: if set, called for each non-empty PUD (2nd-level) entry + * this handler should only handle pud_trans_huge() puds. + * the pmd_entry or pte_entry callbacks will be used for + * regular PUDs. * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry * this handler is required to be able to handle * pmd_trans_huge() pmds. They may simply choose to @@ -1218,6 +1226,8 @@ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma, * (see the comment on walk_page_range() for more details) */ struct mm_walk { + int (*pud_entry)(pud_t *pud, unsigned long addr, + unsigned long next, struct mm_walk *walk); int (*pmd_entry)(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk); int (*pte_entry)(pte_t *pte, unsigned long addr, @@ -1801,8 +1811,26 @@ static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd) return ptl; } -extern void __init pagecache_init(void); +/* + * No scalability reason to split PUD locks yet, but follow the same pattern + * as the PMD locks to make it easier if we decide to. The VM should not be + * considered ready to switch to split PUD locks yet; there may be places + * which need to be converted from page_table_lock. + */ +static inline spinlock_t *pud_lockptr(struct mm_struct *mm, pud_t *pud) +{ + return &mm->page_table_lock; +} + +static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud) +{ + spinlock_t *ptl = pud_lockptr(mm, pud); + + spin_lock(ptl); + return ptl; +} +extern void __init pagecache_init(void); extern void free_area_init(unsigned long * zones_size); extern void free_area_init_node(int nid, unsigned long * zones_size, unsigned long zone_start_pfn, unsigned long *zholes_size); diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index a1a210d59961..51891fb0d3ce 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -381,6 +381,19 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) ___pmd; \ }) +#define pudp_huge_clear_flush_notify(__vma, __haddr, __pud) \ +({ \ + unsigned long ___haddr = __haddr & HPAGE_PUD_MASK; \ + struct mm_struct *___mm = (__vma)->vm_mm; \ + pud_t ___pud; \ + \ + ___pud = pudp_huge_clear_flush(__vma, __haddr, __pud); \ + mmu_notifier_invalidate_range(___mm, ___haddr, \ + ___haddr + HPAGE_PUD_SIZE); \ + \ + ___pud; \ +}) + #define pmdp_huge_get_and_clear_notify(__mm, __haddr, __pmd) \ ({ \ unsigned long ___haddr = __haddr & HPAGE_PMD_MASK; \ @@ -475,6 +488,7 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) #define pmdp_clear_young_notify pmdp_test_and_clear_young #define ptep_clear_flush_notify ptep_clear_flush #define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush +#define pudp_huge_clear_flush_notify pudp_huge_clear_flush #define pmdp_huge_get_and_clear_notify pmdp_huge_get_and_clear #define set_pte_at_notify set_pte_at diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h index 033fc7bbcefa..a49b3259cad7 100644 --- a/include/linux/pfn_t.h +++ b/include/linux/pfn_t.h @@ -90,6 +90,13 @@ static inline pmd_t pfn_t_pmd(pfn_t pfn, pgprot_t pgprot) { return pfn_pmd(pfn_t_to_pfn(pfn), pgprot); } + +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +static inline pud_t pfn_t_pud(pfn_t pfn, pgprot_t pgprot) +{ + return pfn_pud(pfn_t_to_pfn(pfn), pgprot); +} +#endif #endif #ifdef __HAVE_ARCH_PTE_DEVMAP @@ -106,5 +113,10 @@ static inline bool pfn_t_devmap(pfn_t pfn) } pte_t pte_mkdevmap(pte_t pte); pmd_t pmd_mkdevmap(pmd_t pmd); +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ + defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) +pud_t pud_mkdevmap(pud_t pud); #endif +#endif /* __HAVE_ARCH_PTE_DEVMAP */ + #endif /* _LINUX_PFN_T_H_ */ diff --git a/mm/gup.c b/mm/gup.c index 40abe4c90383..1e67461b2733 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -253,6 +253,13 @@ struct page *follow_page_mask(struct vm_area_struct *vma, return page; return no_page_table(vma, flags); } + if (pud_devmap(*pud)) { + ptl = pud_lock(mm, pud); + page = follow_devmap_pud(vma, address, pud, flags); + spin_unlock(ptl); + if (page) + return page; + } if (unlikely(pud_bad(*pud))) return no_page_table(vma, flags); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f9ecc2aeadfc..85742ac5b32e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -757,6 +757,60 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd); +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma) +{ + if (likely(vma->vm_flags & VM_WRITE)) + pud = pud_mkwrite(pud); + return pud; +} + +static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, + pud_t *pud, pfn_t pfn, pgprot_t prot, bool write) +{ + struct mm_struct *mm = vma->vm_mm; + pud_t entry; + spinlock_t *ptl; + + ptl = pud_lock(mm, pud); + entry = pud_mkhuge(pfn_t_pud(pfn, prot)); + if (pfn_t_devmap(pfn)) + entry = pud_mkdevmap(entry); + if (write) { + entry = pud_mkyoung(pud_mkdirty(entry)); + entry = maybe_pud_mkwrite(entry, vma); + } + set_pud_at(mm, addr, pud, entry); + update_mmu_cache_pud(vma, addr, pud); + spin_unlock(ptl); +} + +int vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, + pud_t *pud, pfn_t pfn, bool write) +{ + pgprot_t pgprot = vma->vm_page_prot; + /* + * If we had pud_special, we could avoid all these restrictions, + * but we need to be consistent with PTEs and architectures that + * can't support a 'special' bit. + */ + BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); + BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == + (VM_PFNMAP|VM_MIXEDMAP)); + BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); + BUG_ON(!pfn_t_devmap(pfn)); + + if (addr < vma->vm_start || addr >= vma->vm_end) + return VM_FAULT_SIGBUS; + + track_pfn_insert(vma, &pgprot, pfn); + + insert_pfn_pud(vma, addr, pud, pfn, pgprot, write); + return VM_FAULT_NOPAGE; +} +EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud); +#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ + static void touch_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd) { @@ -887,6 +941,123 @@ out: return ret; } +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +static void touch_pud(struct vm_area_struct *vma, unsigned long addr, + pud_t *pud) +{ + pud_t _pud; + + /* + * We should set the dirty bit only for FOLL_WRITE but for now + * the dirty bit in the pud is meaningless. And if the dirty + * bit will become meaningful and we'll only set it with + * FOLL_WRITE, an atomic set_bit will be required on the pud to + * set the young bit, instead of the current set_pud_at. + */ + _pud = pud_mkyoung(pud_mkdirty(*pud)); + if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK, + pud, _pud, 1)) + update_mmu_cache_pud(vma, addr, pud); +} + +struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, + pud_t *pud, int flags) +{ + unsigned long pfn = pud_pfn(*pud); + struct mm_struct *mm = vma->vm_mm; + struct dev_pagemap *pgmap; + struct page *page; + + assert_spin_locked(pud_lockptr(mm, pud)); + + if (flags & FOLL_WRITE && !pud_write(*pud)) + return NULL; + + if (pud_present(*pud) && pud_devmap(*pud)) + /* pass */; + else + return NULL; + + if (flags & FOLL_TOUCH) + touch_pud(vma, addr, pud); + + /* + * device mapped pages can only be returned if the + * caller will manage the page reference count. + */ + if (!(flags & FOLL_GET)) + return ERR_PTR(-EEXIST); + + pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT; + pgmap = get_dev_pagemap(pfn, NULL); + if (!pgmap) + return ERR_PTR(-EFAULT); + page = pfn_to_page(pfn); + get_page(page); + put_dev_pagemap(pgmap); + + return page; +} + +int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pud_t *dst_pud, pud_t *src_pud, unsigned long addr, + struct vm_area_struct *vma) +{ + spinlock_t *dst_ptl, *src_ptl; + pud_t pud; + int ret; + + dst_ptl = pud_lock(dst_mm, dst_pud); + src_ptl = pud_lockptr(src_mm, src_pud); + spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); + + ret = -EAGAIN; + pud = *src_pud; + if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud))) + goto out_unlock; + + /* + * When page table lock is held, the huge zero pud should not be + * under splitting since we don't split the page itself, only pud to + * a page table. + */ + if (is_huge_zero_pud(pud)) { + /* No huge zero pud yet */ + } + + pudp_set_wrprotect(src_mm, addr, src_pud); + pud = pud_mkold(pud_wrprotect(pud)); + set_pud_at(dst_mm, addr, dst_pud, pud); + + ret = 0; +out_unlock: + spin_unlock(src_ptl); + spin_unlock(dst_ptl); + return ret; +} + +void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud) +{ + pud_t entry; + unsigned long haddr; + bool write = vmf->flags & FAULT_FLAG_WRITE; + + vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud); + if (unlikely(!pud_same(*vmf->pud, orig_pud))) + goto unlock; + + entry = pud_mkyoung(orig_pud); + if (write) + entry = pud_mkdirty(entry); + haddr = vmf->address & HPAGE_PUD_MASK; + if (pudp_set_access_flags(vmf->vma, haddr, vmf->pud, entry, write)) + update_mmu_cache_pud(vmf->vma, vmf->address, vmf->pud); + +unlock: + spin_unlock(vmf->ptl); +} +#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ + void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd) { pmd_t entry; @@ -1601,6 +1772,84 @@ spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) return NULL; } +/* + * Returns true if a given pud maps a thp, false otherwise. + * + * Note that if it returns true, this routine returns without unlocking page + * table lock. So callers must unlock it. + */ +spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma) +{ + spinlock_t *ptl; + + ptl = pud_lock(vma->vm_mm, pud); + if (likely(pud_trans_huge(*pud) || pud_devmap(*pud))) + return ptl; + spin_unlock(ptl); + return NULL; +} + +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, + pud_t *pud, unsigned long addr) +{ + pud_t orig_pud; + spinlock_t *ptl; + + ptl = __pud_trans_huge_lock(pud, vma); + if (!ptl) + return 0; + /* + * For architectures like ppc64 we look at deposited pgtable + * when calling pudp_huge_get_and_clear. So do the + * pgtable_trans_huge_withdraw after finishing pudp related + * operations. + */ + orig_pud = pudp_huge_get_and_clear_full(tlb->mm, addr, pud, + tlb->fullmm); + tlb_remove_pud_tlb_entry(tlb, pud, addr); + if (vma_is_dax(vma)) { + spin_unlock(ptl); + /* No zero page support yet */ + } else { + /* No support for anonymous PUD pages yet */ + BUG(); + } + return 1; +} + +static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud, + unsigned long haddr) +{ + VM_BUG_ON(haddr & ~HPAGE_PUD_MASK); + VM_BUG_ON_VMA(vma->vm_start > haddr, vma); + VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma); + VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud)); + + count_vm_event(THP_SPLIT_PMD); + + pudp_huge_clear_flush_notify(vma, haddr, pud); +} + +void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, + unsigned long address) +{ + spinlock_t *ptl; + struct mm_struct *mm = vma->vm_mm; + unsigned long haddr = address & HPAGE_PUD_MASK; + + mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PUD_SIZE); + ptl = pud_lock(mm, pud); + if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud))) + goto out; + __split_huge_pud_locked(vma, pud, haddr); + +out: + spin_unlock(ptl); + mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PUD_SIZE); +} +#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ + static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd) { diff --git a/mm/memory.c b/mm/memory.c index e721e8eba570..41e2a2d4b2a6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1001,7 +1001,7 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src next = pmd_addr_end(addr, end); if (pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) { int err; - VM_BUG_ON(next-addr != HPAGE_PMD_SIZE); + VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, vma); err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd, addr, vma); if (err == -ENOMEM) @@ -1032,6 +1032,18 @@ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src src_pud = pud_offset(src_pgd, addr); do { next = pud_addr_end(addr, end); + if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) { + int err; + + VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, vma); + err = copy_huge_pud(dst_mm, src_mm, + dst_pud, src_pud, addr, vma); + if (err == -ENOMEM) + return -ENOMEM; + if (!err) + continue; + /* fall through */ + } if (pud_none_or_clear_bad(src_pud)) continue; if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud, @@ -1263,9 +1275,19 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb, pud = pud_offset(pgd, addr); do { next = pud_addr_end(addr, end); + if (pud_trans_huge(*pud) || pud_devmap(*pud)) { + if (next - addr != HPAGE_PUD_SIZE) { + VM_BUG_ON_VMA(!rwsem_is_locked(&tlb->mm->mmap_sem), vma); + split_huge_pud(vma, pud, addr); + } else if (zap_huge_pud(tlb, vma, pud, addr)) + goto next; + /* fall through */ + } if (pud_none_or_clear_bad(pud)) continue; next = zap_pmd_range(tlb, vma, pud, addr, next, details); +next: + cond_resched(); } while (pud++, addr = next, addr != end); return addr; @@ -3490,6 +3512,30 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma) return vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE); } +static int create_huge_pud(struct vm_fault *vmf) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + /* No support for anonymous transparent PUD pages yet */ + if (vma_is_anonymous(vmf->vma)) + return VM_FAULT_FALLBACK; + if (vmf->vma->vm_ops->huge_fault) + return vmf->vma->vm_ops->huge_fault(vmf); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + return VM_FAULT_FALLBACK; +} + +static int wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + /* No support for anonymous transparent PUD pages yet */ + if (vma_is_anonymous(vmf->vma)) + return VM_FAULT_FALLBACK; + if (vmf->vma->vm_ops->huge_fault) + return vmf->vma->vm_ops->huge_fault(vmf); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + return VM_FAULT_FALLBACK; +} + /* * These routines also need to handle stuff like marking pages dirty * and/or accessed for architectures that don't do it in hardware (most @@ -3605,14 +3651,41 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, }; struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; - pud_t *pud; int ret; pgd = pgd_offset(mm, address); - pud = pud_alloc(mm, pgd, address); - if (!pud) + + vmf.pud = pud_alloc(mm, pgd, address); + if (!vmf.pud) return VM_FAULT_OOM; - vmf.pmd = pmd_alloc(mm, pud, address); + if (pud_none(*vmf.pud) && transparent_hugepage_enabled(vma)) { + vmf.flags |= FAULT_FLAG_SIZE_PUD; + ret = create_huge_pud(&vmf); + if (!(ret & VM_FAULT_FALLBACK)) + return ret; + } else { + pud_t orig_pud = *vmf.pud; + + barrier(); + if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) { + unsigned int dirty = flags & FAULT_FLAG_WRITE; + + vmf.flags |= FAULT_FLAG_SIZE_PUD; + + /* NUMA case for anonymous PUDs would go here */ + + if (dirty && !pud_write(orig_pud)) { + ret = wp_huge_pud(&vmf, orig_pud); + if (!(ret & VM_FAULT_FALLBACK)) + return ret; + } else { + huge_pud_set_accessed(&vmf, orig_pud); + return 0; + } + } + } + + vmf.pmd = pmd_alloc(mm, vmf.pud, address); if (!vmf.pmd) return VM_FAULT_OOM; if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) { @@ -3743,13 +3816,14 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) */ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) { + spinlock_t *ptl; pmd_t *new = pmd_alloc_one(mm, address); if (!new) return -ENOMEM; smp_wmb(); /* See comment in __pte_alloc */ - spin_lock(&mm->page_table_lock); + ptl = pud_lock(mm, pud); #ifndef __ARCH_HAS_4LEVEL_HACK if (!pud_present(*pud)) { mm_inc_nr_pmds(mm); @@ -3763,7 +3837,7 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) } else /* Another has populated it */ pmd_free(mm, new); #endif /* __ARCH_HAS_4LEVEL_HACK */ - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); return 0; } #endif /* __PAGETABLE_PMD_FOLDED */ diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 207244489a68..03761577ae86 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -78,14 +78,32 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end, pud = pud_offset(pgd, addr); do { + again: next = pud_addr_end(addr, end); - if (pud_none_or_clear_bad(pud)) { + if (pud_none(*pud) || !walk->vma) { if (walk->pte_hole) err = walk->pte_hole(addr, next, walk); if (err) break; continue; } + + if (walk->pud_entry) { + spinlock_t *ptl = pud_trans_huge_lock(pud, walk->vma); + + if (ptl) { + err = walk->pud_entry(pud, addr, next, walk); + spin_unlock(ptl); + if (err) + break; + continue; + } + } + + split_huge_pud(walk->vma, pud, addr); + if (pud_none(*pud)) + goto again; + if (walk->pmd_entry || walk->pte_entry) err = walk_pmd_range(pud, addr, next, walk); if (err) diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 71c5f9109f2a..4ed5908c65b0 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -123,6 +123,20 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return pmd; } + +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, + pud_t *pudp) +{ + pud_t pud; + + VM_BUG_ON(address & ~HPAGE_PUD_MASK); + VM_BUG_ON(!pud_trans_huge(*pudp) && !pud_devmap(*pudp)); + pud = pudp_huge_get_and_clear(vma->vm_mm, address, pudp); + flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE); + return pud; +} +#endif #endif #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT -- cgit v1.2.3 From c791ace1e747371658237f0d30234fef56c39669 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 24 Feb 2017 14:57:08 -0800 Subject: mm: replace FAULT_FLAG_SIZE with parameter to huge_fault Since the introduction of FAULT_FLAG_SIZE to the vm_fault flag, it has been somewhat painful with getting the flags set and removed at the correct locations. More than one kernel oops was introduced due to difficulties of getting the placement correctly. Remove the flag values and introduce an input parameter to huge_fault that indicates the size of the page entry. This makes the code easier to trace and should avoid the issues we see with the fault flags where removal of the flag was necessary in the fallback paths. Link: http://lkml.kernel.org/r/148615748258.43180.1690152053774975329.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Dave Jiang Tested-by: Dan Williams Reviewed-by: Jan Kara Cc: Matthew Wilcox Cc: Dave Hansen Cc: Vlastimil Babka Cc: Ross Zwisler Cc: Kirill A. Shutemov Cc: Nilesh Choudhury Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/dax/dax.c | 18 ++++++++++++------ fs/dax.c | 9 +++++---- fs/ext2/file.c | 2 +- fs/ext4/file.c | 12 +++++++++--- fs/xfs/xfs_file.c | 9 +++++---- include/linux/dax.h | 3 ++- include/linux/mm.h | 14 ++++++++------ mm/memory.c | 17 ++++------------- 8 files changed, 46 insertions(+), 38 deletions(-) (limited to 'include') diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c index b90bb301bda0..b75c77254fdb 100644 --- a/drivers/dax/dax.c +++ b/drivers/dax/dax.c @@ -538,7 +538,8 @@ static int __dax_dev_pud_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) } #endif /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ -static int dax_dev_fault(struct vm_fault *vmf) +static int dax_dev_huge_fault(struct vm_fault *vmf, + enum page_entry_size pe_size) { int rc; struct file *filp = vmf->vma->vm_file; @@ -550,14 +551,14 @@ static int dax_dev_fault(struct vm_fault *vmf) vmf->vma->vm_start, vmf->vma->vm_end); rcu_read_lock(); - switch (vmf->flags & FAULT_FLAG_SIZE_MASK) { - case FAULT_FLAG_SIZE_PTE: + switch (pe_size) { + case PE_SIZE_PTE: rc = __dax_dev_pte_fault(dax_dev, vmf); break; - case FAULT_FLAG_SIZE_PMD: + case PE_SIZE_PMD: rc = __dax_dev_pmd_fault(dax_dev, vmf); break; - case FAULT_FLAG_SIZE_PUD: + case PE_SIZE_PUD: rc = __dax_dev_pud_fault(dax_dev, vmf); break; default: @@ -568,9 +569,14 @@ static int dax_dev_fault(struct vm_fault *vmf) return rc; } +static int dax_dev_fault(struct vm_fault *vmf) +{ + return dax_dev_huge_fault(vmf, PE_SIZE_PTE); +} + static const struct vm_operations_struct dax_dev_vm_ops = { .fault = dax_dev_fault, - .huge_fault = dax_dev_fault, + .huge_fault = dax_dev_huge_fault, }; static int dax_mmap(struct file *filp, struct vm_area_struct *vma) diff --git a/fs/dax.c b/fs/dax.c index c3c29fbf64be..5ae8b71ebadc 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1452,12 +1452,13 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, struct iomap_ops *ops) * has done all the necessary locking for page fault to proceed * successfully. */ -int dax_iomap_fault(struct vm_fault *vmf, const struct iomap_ops *ops) +int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, + const struct iomap_ops *ops) { - switch (vmf->flags & FAULT_FLAG_SIZE_MASK) { - case FAULT_FLAG_SIZE_PTE: + switch (pe_size) { + case PE_SIZE_PTE: return dax_iomap_pte_fault(vmf, ops); - case FAULT_FLAG_SIZE_PMD: + case PE_SIZE_PMD: return dax_iomap_pmd_fault(vmf, ops); default: return VM_FAULT_FALLBACK; diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 68738832beda..b21891a6bfca 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -99,7 +99,7 @@ static int ext2_dax_fault(struct vm_fault *vmf) } down_read(&ei->dax_sem); - ret = dax_iomap_fault(vmf, &ext2_iomap_ops); + ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &ext2_iomap_ops); up_read(&ei->dax_sem); if (vmf->flags & FAULT_FLAG_WRITE) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 502d2d07d191..8210c1f43556 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -253,7 +253,8 @@ out: } #ifdef CONFIG_FS_DAX -static int ext4_dax_fault(struct vm_fault *vmf) +static int ext4_dax_huge_fault(struct vm_fault *vmf, + enum page_entry_size pe_size) { int result; struct inode *inode = file_inode(vmf->vma->vm_file); @@ -265,7 +266,7 @@ static int ext4_dax_fault(struct vm_fault *vmf) file_update_time(vmf->vma->vm_file); } down_read(&EXT4_I(inode)->i_mmap_sem); - result = dax_iomap_fault(vmf, &ext4_iomap_ops); + result = dax_iomap_fault(vmf, pe_size, &ext4_iomap_ops); up_read(&EXT4_I(inode)->i_mmap_sem); if (write) sb_end_pagefault(sb); @@ -273,6 +274,11 @@ static int ext4_dax_fault(struct vm_fault *vmf) return result; } +static int ext4_dax_fault(struct vm_fault *vmf) +{ + return ext4_dax_huge_fault(vmf, PE_SIZE_PTE); +} + /* * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_fault() * handler we check for races agaist truncate. Note that since we cycle through @@ -305,7 +311,7 @@ static int ext4_dax_pfn_mkwrite(struct vm_fault *vmf) static const struct vm_operations_struct ext4_dax_vm_ops = { .fault = ext4_dax_fault, - .huge_fault = ext4_dax_fault, + .huge_fault = ext4_dax_huge_fault, .page_mkwrite = ext4_dax_fault, .pfn_mkwrite = ext4_dax_pfn_mkwrite, }; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 990e03819370..a50eca676670 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1391,7 +1391,7 @@ xfs_filemap_page_mkwrite( xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) { - ret = dax_iomap_fault(vmf, &xfs_iomap_ops); + ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops); } else { ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops); ret = block_page_mkwrite_return(ret); @@ -1418,7 +1418,7 @@ xfs_filemap_fault( xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) - ret = dax_iomap_fault(vmf, &xfs_iomap_ops); + ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops); else ret = filemap_fault(vmf); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); @@ -1435,7 +1435,8 @@ xfs_filemap_fault( */ STATIC int xfs_filemap_huge_fault( - struct vm_fault *vmf) + struct vm_fault *vmf, + enum page_entry_size pe_size) { struct inode *inode = file_inode(vmf->vma->vm_file); struct xfs_inode *ip = XFS_I(inode); @@ -1452,7 +1453,7 @@ xfs_filemap_huge_fault( } xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - ret = dax_iomap_fault(vmf, &xfs_iomap_ops); + ret = dax_iomap_fault(vmf, pe_size, &xfs_iomap_ops); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (vmf->flags & FAULT_FLAG_WRITE) diff --git a/include/linux/dax.h b/include/linux/dax.h index cf9af225962b..d8a3dc042e1c 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -38,7 +38,8 @@ static inline void *dax_radix_locked_entry(sector_t sector, unsigned long flags) ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops); -int dax_iomap_fault(struct vm_fault *vmf, const struct iomap_ops *ops); +int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, + const struct iomap_ops *ops); int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_invalidate_mapping_entry_sync(struct address_space *mapping, diff --git a/include/linux/mm.h b/include/linux/mm.h index d8b75d7d6a9e..c65aa43b5712 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -285,11 +285,6 @@ extern pgprot_t protection_map[16]; #define FAULT_FLAG_REMOTE 0x80 /* faulting for non current tsk/mm */ #define FAULT_FLAG_INSTRUCTION 0x100 /* The fault was during an instruction fetch */ -#define FAULT_FLAG_SIZE_MASK 0x7000 /* Support up to 8-level page tables */ -#define FAULT_FLAG_SIZE_PTE 0x0000 /* First level (eg 4k) */ -#define FAULT_FLAG_SIZE_PMD 0x1000 /* Second level (eg 2MB) */ -#define FAULT_FLAG_SIZE_PUD 0x2000 /* Third level (eg 1GB) */ - #define FAULT_FLAG_TRACE \ { FAULT_FLAG_WRITE, "WRITE" }, \ { FAULT_FLAG_MKWRITE, "MKWRITE" }, \ @@ -349,6 +344,13 @@ struct vm_fault { */ }; +/* page entry size for vm->huge_fault() */ +enum page_entry_size { + PE_SIZE_PTE = 0, + PE_SIZE_PMD, + PE_SIZE_PUD, +}; + /* * These are the virtual MM functions - opening of an area, closing and * unmapping it (needed to keep files on disk up-to-date etc), pointer @@ -359,7 +361,7 @@ struct vm_operations_struct { void (*close)(struct vm_area_struct * area); int (*mremap)(struct vm_area_struct * area); int (*fault)(struct vm_fault *vmf); - int (*huge_fault)(struct vm_fault *vmf); + int (*huge_fault)(struct vm_fault *vmf, enum page_entry_size pe_size); void (*map_pages)(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); diff --git a/mm/memory.c b/mm/memory.c index 41e2a2d4b2a6..6040b74d02a2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3489,7 +3489,7 @@ static int create_huge_pmd(struct vm_fault *vmf) if (vma_is_anonymous(vmf->vma)) return do_huge_pmd_anonymous_page(vmf); if (vmf->vma->vm_ops->huge_fault) - return vmf->vma->vm_ops->huge_fault(vmf); + return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD); return VM_FAULT_FALLBACK; } @@ -3498,7 +3498,7 @@ static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd) if (vma_is_anonymous(vmf->vma)) return do_huge_pmd_wp_page(vmf, orig_pmd); if (vmf->vma->vm_ops->huge_fault) - return vmf->vma->vm_ops->huge_fault(vmf); + return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD); /* COW handled on pte level: split pmd */ VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma); @@ -3519,7 +3519,7 @@ static int create_huge_pud(struct vm_fault *vmf) if (vma_is_anonymous(vmf->vma)) return VM_FAULT_FALLBACK; if (vmf->vma->vm_ops->huge_fault) - return vmf->vma->vm_ops->huge_fault(vmf); + return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ return VM_FAULT_FALLBACK; } @@ -3531,7 +3531,7 @@ static int wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud) if (vma_is_anonymous(vmf->vma)) return VM_FAULT_FALLBACK; if (vmf->vma->vm_ops->huge_fault) - return vmf->vma->vm_ops->huge_fault(vmf); + return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ return VM_FAULT_FALLBACK; } @@ -3659,7 +3659,6 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, if (!vmf.pud) return VM_FAULT_OOM; if (pud_none(*vmf.pud) && transparent_hugepage_enabled(vma)) { - vmf.flags |= FAULT_FLAG_SIZE_PUD; ret = create_huge_pud(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; @@ -3670,8 +3669,6 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) { unsigned int dirty = flags & FAULT_FLAG_WRITE; - vmf.flags |= FAULT_FLAG_SIZE_PUD; - /* NUMA case for anonymous PUDs would go here */ if (dirty && !pud_write(orig_pud)) { @@ -3689,18 +3686,14 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, if (!vmf.pmd) return VM_FAULT_OOM; if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) { - vmf.flags |= FAULT_FLAG_SIZE_PMD; ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; - /* fall through path, remove PMD flag */ - vmf.flags &= ~FAULT_FLAG_SIZE_PMD; } else { pmd_t orig_pmd = *vmf.pmd; barrier(); if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) { - vmf.flags |= FAULT_FLAG_SIZE_PMD; if (pmd_protnone(orig_pmd) && vma_is_accessible(vma)) return do_huge_pmd_numa_page(&vmf, orig_pmd); @@ -3709,8 +3702,6 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, ret = wp_huge_pmd(&vmf, orig_pmd); if (!(ret & VM_FAULT_FALLBACK)) return ret; - /* fall through path, remove PUD flag */ - vmf.flags &= ~FAULT_FLAG_SIZE_PUD; } else { huge_pmd_set_accessed(&vmf, orig_pmd); return 0; -- cgit v1.2.3 From 9e5bcd610ffcedf5e485e78a72762810b25c7f25 Mon Sep 17 00:00:00 2001 From: Yisheng Xie Date: Fri, 24 Feb 2017 14:57:29 -0800 Subject: mm/migration: make isolate_movable_page() return int type Patch series "HWPOISON: soft offlining for non-lru movable page", v6. After Minchan's commit bda807d44454 ("mm: migrate: support non-lru movable page migration"), some type of non-lru page like zsmalloc and virtio-balloon page also support migration. Therefore, we can: 1) soft offlining no-lru movable pages, which means when memory corrected errors occur on a non-lru movable page, we can stop to use it by migrating data onto another page and disable the original (maybe half-broken) one. 2) enable memory hotplug for non-lru movable pages, i.e. we may offline blocks, which include such pages, by using non-lru page migration. This patchset is heavily dependent on non-lru movable page migration. This patch (of 4): Change the return type of isolate_movable_page() from bool to int. It will return 0 when isolate movable page successfully, and return -EBUSY when it isolates failed. There is no functional change within this patch but prepare for later patch. [xieyisheng1@huawei.com: v6] Link: http://lkml.kernel.org/r/1486108770-630-2-git-send-email-xieyisheng1@huawei.com Link: http://lkml.kernel.org/r/1485867981-16037-2-git-send-email-ysxie@foxmail.com Signed-off-by: Yisheng Xie Suggested-by: Michal Hocko Acked-by: Minchan Kim Cc: Andi Kleen Cc: Hanjun Guo Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Mel Gorman Cc: Naoya Horiguchi Cc: Reza Arbab Cc: Taku Izumi Cc: Vitaly Kuznetsov Cc: Vlastimil Babka Cc: Xishi Qiu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 2 +- mm/compaction.c | 2 +- mm/migrate.c | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index ae8d475a9385..43d5deb0c4cc 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -37,7 +37,7 @@ extern int migrate_page(struct address_space *, struct page *, struct page *, enum migrate_mode); extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, unsigned long private, enum migrate_mode mode, int reason); -extern bool isolate_movable_page(struct page *page, isolate_mode_t mode); +extern int isolate_movable_page(struct page *page, isolate_mode_t mode); extern void putback_movable_page(struct page *page); extern int migrate_prep(void); diff --git a/mm/compaction.c b/mm/compaction.c index 0aa2757399ee..0fdfde016ee2 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -802,7 +802,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, locked = false; } - if (isolate_movable_page(page, isolate_mode)) + if (!isolate_movable_page(page, isolate_mode)) goto isolate_success; } diff --git a/mm/migrate.c b/mm/migrate.c index 87f4d0f81819..6807174e0715 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -74,7 +74,7 @@ int migrate_prep_local(void) return 0; } -bool isolate_movable_page(struct page *page, isolate_mode_t mode) +int isolate_movable_page(struct page *page, isolate_mode_t mode) { struct address_space *mapping; @@ -125,14 +125,14 @@ bool isolate_movable_page(struct page *page, isolate_mode_t mode) __SetPageIsolated(page); unlock_page(page); - return true; + return 0; out_no_isolated: unlock_page(page); out_putpage: put_page(page); out: - return false; + return -EBUSY; } /* It should be called on page which is PG_movable */ -- cgit v1.2.3 From cbae0170e5329c79c0a36a81fedd2800146aca12 Mon Sep 17 00:00:00 2001 From: Yisheng Xie Date: Fri, 24 Feb 2017 14:57:32 -0800 Subject: mm/migration: make isolate_movable_page always defined Define isolate_movable_page as a static inline function when CONFIG_MIGRATION is not enable. It should return -EBUSY here which means failed to isolate movable pages. This patch do not have any functional change but prepare for later patch. Link: http://lkml.kernel.org/r/1485867981-16037-3-git-send-email-ysxie@foxmail.com Signed-off-by: Yisheng Xie Acked-by: Minchan Kim Suggested-by: Michal Hocko Cc: Naoya Horiguchi Cc: Vlastimil Babka Cc: Andi Kleen Cc: Hanjun Guo Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Mel Gorman Cc: Reza Arbab Cc: Taku Izumi Cc: Vitaly Kuznetsov Cc: Xishi Qiu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 43d5deb0c4cc..fa76b516fa47 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -56,6 +56,8 @@ static inline int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, unsigned long private, enum migrate_mode mode, int reason) { return -ENOSYS; } +static inline int isolate_movable_page(struct page *page, isolate_mode_t mode) + { return -EBUSY; } static inline int migrate_prep(void) { return -ENOSYS; } static inline int migrate_prep_local(void) { return -ENOSYS; } -- cgit v1.2.3 From ace71a19cec5eb430207c3269d8a2683f0574306 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 24 Feb 2017 14:57:45 -0800 Subject: mm: introduce page_vma_mapped_walk() Introduce a new interface to check if a page is mapped into a vma. It aims to address shortcomings of page_check_address{,_transhuge}. Existing interface is not able to handle PTE-mapped THPs: it only finds the first PTE. The rest lefted unnoticed. page_vma_mapped_walk() iterates over all possible mapping of the page in the vma. Link: http://lkml.kernel.org/r/20170129173858.45174-3-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Hillf Danton Cc: Hugh Dickins Cc: Johannes Weiner Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Srikar Dronamraju Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 26 +++++++ mm/Makefile | 6 +- mm/huge_memory.c | 9 ++- mm/page_vma_mapped.c | 188 +++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 224 insertions(+), 5 deletions(-) create mode 100644 mm/page_vma_mapped.c (limited to 'include') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 15321fb1df6b..b76343610653 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -9,6 +9,7 @@ #include #include #include +#include /* * The anon_vma heads a list of private "related" vmas, to scan if @@ -232,6 +233,31 @@ static inline bool page_check_address_transhuge(struct page *page, } #endif +/* Avoid racy checks */ +#define PVMW_SYNC (1 << 0) +/* Look for migarion entries rather than present PTEs */ +#define PVMW_MIGRATION (1 << 1) + +struct page_vma_mapped_walk { + struct page *page; + struct vm_area_struct *vma; + unsigned long address; + pmd_t *pmd; + pte_t *pte; + spinlock_t *ptl; + unsigned int flags; +}; + +static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw) +{ + if (pvmw->pte) + pte_unmap(pvmw->pte); + if (pvmw->ptl) + spin_unlock(pvmw->ptl); +} + +bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw); + /* * Used by swapoff to help locate where page is expected in vma. */ diff --git a/mm/Makefile b/mm/Makefile index 433eaf9a876e..aa0aa17cb413 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -23,8 +23,10 @@ KCOV_INSTRUMENT_vmstat.o := n mmu-y := nommu.o mmu-$(CONFIG_MMU) := gup.o highmem.o memory.o mincore.o \ - mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ - vmalloc.o pagewalk.o pgtable-generic.o + mlock.o mmap.o mprotect.o mremap.o msync.o \ + page_vma_mapped.o pagewalk.o pgtable-generic.o \ + rmap.o vmalloc.o + ifdef CONFIG_CROSS_MEMORY_ATTACH mmu-$(CONFIG_MMU) += process_vm_access.o diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 85742ac5b32e..a7bac4f2b78a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2129,9 +2129,12 @@ static void freeze_page(struct page *page) static void unfreeze_page(struct page *page) { int i; - - for (i = 0; i < HPAGE_PMD_NR; i++) - remove_migration_ptes(page + i, page + i, true); + if (PageTransHuge(page)) { + remove_migration_ptes(page, page, true); + } else { + for (i = 0; i < HPAGE_PMD_NR; i++) + remove_migration_ptes(page + i, page + i, true); + } } static void __split_huge_page_tail(struct page *head, int tail, diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c new file mode 100644 index 000000000000..dc1a54826cf2 --- /dev/null +++ b/mm/page_vma_mapped.c @@ -0,0 +1,188 @@ +#include +#include +#include +#include +#include + +#include "internal.h" + +static inline bool check_pmd(struct page_vma_mapped_walk *pvmw) +{ + pmd_t pmde; + /* + * Make sure we don't re-load pmd between present and !trans_huge check. + * We need a consistent view. + */ + pmde = READ_ONCE(*pvmw->pmd); + return pmd_present(pmde) && !pmd_trans_huge(pmde); +} + +static inline bool not_found(struct page_vma_mapped_walk *pvmw) +{ + page_vma_mapped_walk_done(pvmw); + return false; +} + +static bool map_pte(struct page_vma_mapped_walk *pvmw) +{ + pvmw->pte = pte_offset_map(pvmw->pmd, pvmw->address); + if (!(pvmw->flags & PVMW_SYNC)) { + if (pvmw->flags & PVMW_MIGRATION) { + if (!is_swap_pte(*pvmw->pte)) + return false; + } else { + if (!pte_present(*pvmw->pte)) + return false; + } + } + pvmw->ptl = pte_lockptr(pvmw->vma->vm_mm, pvmw->pmd); + spin_lock(pvmw->ptl); + return true; +} + +static bool check_pte(struct page_vma_mapped_walk *pvmw) +{ + if (pvmw->flags & PVMW_MIGRATION) { +#ifdef CONFIG_MIGRATION + swp_entry_t entry; + if (!is_swap_pte(*pvmw->pte)) + return false; + entry = pte_to_swp_entry(*pvmw->pte); + if (!is_migration_entry(entry)) + return false; + if (migration_entry_to_page(entry) - pvmw->page >= + hpage_nr_pages(pvmw->page)) { + return false; + } + if (migration_entry_to_page(entry) < pvmw->page) + return false; +#else + WARN_ON_ONCE(1); +#endif + } else { + if (!pte_present(*pvmw->pte)) + return false; + + /* THP can be referenced by any subpage */ + if (pte_page(*pvmw->pte) - pvmw->page >= + hpage_nr_pages(pvmw->page)) { + return false; + } + if (pte_page(*pvmw->pte) < pvmw->page) + return false; + } + + return true; +} + +/** + * page_vma_mapped_walk - check if @pvmw->page is mapped in @pvmw->vma at + * @pvmw->address + * @pvmw: pointer to struct page_vma_mapped_walk. page, vma, address and flags + * must be set. pmd, pte and ptl must be NULL. + * + * Returns true if the page is mapped in the vma. @pvmw->pmd and @pvmw->pte point + * to relevant page table entries. @pvmw->ptl is locked. @pvmw->address is + * adjusted if needed (for PTE-mapped THPs). + * + * If @pvmw->pmd is set but @pvmw->pte is not, you have found PMD-mapped page + * (usually THP). For PTE-mapped THP, you should run page_vma_mapped_walk() in + * a loop to find all PTEs that map the THP. + * + * For HugeTLB pages, @pvmw->pte is set to the relevant page table entry + * regardless of which page table level the page is mapped at. @pvmw->pmd is + * NULL. + * + * Retruns false if there are no more page table entries for the page in + * the vma. @pvmw->ptl is unlocked and @pvmw->pte is unmapped. + * + * If you need to stop the walk before page_vma_mapped_walk() returned false, + * use page_vma_mapped_walk_done(). It will do the housekeeping. + */ +bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) +{ + struct mm_struct *mm = pvmw->vma->vm_mm; + struct page *page = pvmw->page; + pgd_t *pgd; + pud_t *pud; + + /* The only possible pmd mapping has been handled on last iteration */ + if (pvmw->pmd && !pvmw->pte) + return not_found(pvmw); + + /* Only for THP, seek to next pte entry makes sense */ + if (pvmw->pte) { + if (!PageTransHuge(pvmw->page) || PageHuge(pvmw->page)) + return not_found(pvmw); + goto next_pte; + } + + if (unlikely(PageHuge(pvmw->page))) { + /* when pud is not present, pte will be NULL */ + pvmw->pte = huge_pte_offset(mm, pvmw->address); + if (!pvmw->pte) + return false; + + pvmw->ptl = huge_pte_lockptr(page_hstate(page), mm, pvmw->pte); + spin_lock(pvmw->ptl); + if (!check_pte(pvmw)) + return not_found(pvmw); + return true; + } +restart: + pgd = pgd_offset(mm, pvmw->address); + if (!pgd_present(*pgd)) + return false; + pud = pud_offset(pgd, pvmw->address); + if (!pud_present(*pud)) + return false; + pvmw->pmd = pmd_offset(pud, pvmw->address); + if (pmd_trans_huge(*pvmw->pmd)) { + pvmw->ptl = pmd_lock(mm, pvmw->pmd); + if (!pmd_present(*pvmw->pmd)) + return not_found(pvmw); + if (likely(pmd_trans_huge(*pvmw->pmd))) { + if (pvmw->flags & PVMW_MIGRATION) + return not_found(pvmw); + if (pmd_page(*pvmw->pmd) != page) + return not_found(pvmw); + return true; + } else { + /* THP pmd was split under us: handle on pte level */ + spin_unlock(pvmw->ptl); + pvmw->ptl = NULL; + } + } else { + if (!check_pmd(pvmw)) + return false; + } + if (!map_pte(pvmw)) + goto next_pte; + while (1) { + if (check_pte(pvmw)) + return true; +next_pte: do { + pvmw->address += PAGE_SIZE; + if (pvmw->address >= + __vma_address(pvmw->page, pvmw->vma) + + hpage_nr_pages(pvmw->page) * PAGE_SIZE) + return not_found(pvmw); + /* Did we cross page table boundary? */ + if (pvmw->address % PMD_SIZE == 0) { + pte_unmap(pvmw->pte); + if (pvmw->ptl) { + spin_unlock(pvmw->ptl); + pvmw->ptl = NULL; + } + goto restart; + } else { + pvmw->pte++; + } + } while (pte_none(*pvmw->pte)); + + if (!pvmw->ptl) { + pvmw->ptl = pte_lockptr(mm, pvmw->pmd); + spin_lock(pvmw->ptl); + } + } +} -- cgit v1.2.3 From d53a8b49a626fdfce4390710da6d04b4314db25f Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 24 Feb 2017 14:58:13 -0800 Subject: mm: drop page_check_address{,_transhuge} All users are gone. Let's drop them. Link: http://lkml.kernel.org/r/20170129173858.45174-12-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Hillf Danton Cc: Hugh Dickins Cc: Johannes Weiner Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Srikar Dronamraju Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 36 -------------- mm/rmap.c | 138 --------------------------------------------------- 2 files changed, 174 deletions(-) (limited to 'include') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index b76343610653..8c89e902df3e 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -197,42 +197,6 @@ int page_referenced(struct page *, int is_locked, int try_to_unmap(struct page *, enum ttu_flags flags); -/* - * Used by uprobes to replace a userspace page safely - */ -pte_t *__page_check_address(struct page *, struct mm_struct *, - unsigned long, spinlock_t **, int); - -static inline pte_t *page_check_address(struct page *page, struct mm_struct *mm, - unsigned long address, - spinlock_t **ptlp, int sync) -{ - pte_t *ptep; - - __cond_lock(*ptlp, ptep = __page_check_address(page, mm, address, - ptlp, sync)); - return ptep; -} - -/* - * Used by idle page tracking to check if a page was referenced via page - * tables. - */ -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -bool page_check_address_transhuge(struct page *page, struct mm_struct *mm, - unsigned long address, pmd_t **pmdp, - pte_t **ptep, spinlock_t **ptlp); -#else -static inline bool page_check_address_transhuge(struct page *page, - struct mm_struct *mm, unsigned long address, - pmd_t **pmdp, pte_t **ptep, spinlock_t **ptlp) -{ - *ptep = page_check_address(page, mm, address, ptlp, 0); - *pmdp = NULL; - return !!*ptep; -} -#endif - /* Avoid racy checks */ #define PVMW_SYNC (1 << 0) /* Look for migarion entries rather than present PTEs */ diff --git a/mm/rmap.c b/mm/rmap.c index 80525820aada..8774791e2809 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -708,144 +708,6 @@ out: return pmd; } -/* - * Check that @page is mapped at @address into @mm. - * - * If @sync is false, page_check_address may perform a racy check to avoid - * the page table lock when the pte is not present (helpful when reclaiming - * highly shared pages). - * - * On success returns with pte mapped and locked. - */ -pte_t *__page_check_address(struct page *page, struct mm_struct *mm, - unsigned long address, spinlock_t **ptlp, int sync) -{ - pmd_t *pmd; - pte_t *pte; - spinlock_t *ptl; - - if (unlikely(PageHuge(page))) { - /* when pud is not present, pte will be NULL */ - pte = huge_pte_offset(mm, address); - if (!pte) - return NULL; - - ptl = huge_pte_lockptr(page_hstate(page), mm, pte); - goto check; - } - - pmd = mm_find_pmd(mm, address); - if (!pmd) - return NULL; - - pte = pte_offset_map(pmd, address); - /* Make a quick check before getting the lock */ - if (!sync && !pte_present(*pte)) { - pte_unmap(pte); - return NULL; - } - - ptl = pte_lockptr(mm, pmd); -check: - spin_lock(ptl); - if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) { - *ptlp = ptl; - return pte; - } - pte_unmap_unlock(pte, ptl); - return NULL; -} - -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -/* - * Check that @page is mapped at @address into @mm. In contrast to - * page_check_address(), this function can handle transparent huge pages. - * - * On success returns true with pte mapped and locked. For PMD-mapped - * transparent huge pages *@ptep is set to NULL. - */ -bool page_check_address_transhuge(struct page *page, struct mm_struct *mm, - unsigned long address, pmd_t **pmdp, - pte_t **ptep, spinlock_t **ptlp) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - spinlock_t *ptl; - - if (unlikely(PageHuge(page))) { - /* when pud is not present, pte will be NULL */ - pte = huge_pte_offset(mm, address); - if (!pte) - return false; - - ptl = huge_pte_lockptr(page_hstate(page), mm, pte); - pmd = NULL; - goto check_pte; - } - - pgd = pgd_offset(mm, address); - if (!pgd_present(*pgd)) - return false; - pud = pud_offset(pgd, address); - if (!pud_present(*pud)) - return false; - pmd = pmd_offset(pud, address); - - if (pmd_trans_huge(*pmd)) { - ptl = pmd_lock(mm, pmd); - if (!pmd_present(*pmd)) - goto unlock_pmd; - if (unlikely(!pmd_trans_huge(*pmd))) { - spin_unlock(ptl); - goto map_pte; - } - - if (pmd_page(*pmd) != page) - goto unlock_pmd; - - pte = NULL; - goto found; -unlock_pmd: - spin_unlock(ptl); - return false; - } else { - pmd_t pmde = *pmd; - - barrier(); - if (!pmd_present(pmde) || pmd_trans_huge(pmde)) - return false; - } -map_pte: - pte = pte_offset_map(pmd, address); - if (!pte_present(*pte)) { - pte_unmap(pte); - return false; - } - - ptl = pte_lockptr(mm, pmd); -check_pte: - spin_lock(ptl); - - if (!pte_present(*pte)) { - pte_unmap_unlock(pte, ptl); - return false; - } - - /* THP can be referenced by any subpage */ - if (pte_pfn(*pte) - page_to_pfn(page) >= hpage_nr_pages(page)) { - pte_unmap_unlock(pte, ptl); - return false; - } -found: - *ptep = pte; - *pmdp = pmd; - *ptlp = ptl; - return true; -} -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - struct page_referenced_arg { int mapcount; int referenced; -- cgit v1.2.3 From 897ab3e0c49e24b62e2d54d165c7afec6bbca65b Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Fri, 24 Feb 2017 14:58:22 -0800 Subject: userfaultfd: non-cooperative: add event for memory unmaps When a non-cooperative userfaultfd monitor copies pages in the background, it may encounter regions that were already unmapped. Addition of UFFD_EVENT_UNMAP allows the uffd monitor to track precisely changes in the virtual memory layout. Since there might be different uffd contexts for the affected VMAs, we first should create a temporary representation for the unmap event for each uffd context and then notify them one by one to the appropriate userfault file descriptors. The event notification occurs after the mmap_sem has been released. [arnd@arndb.de: fix nommu build] Link: http://lkml.kernel.org/r/20170203165141.3665284-1-arnd@arndb.de [mhocko@suse.com: fix nommu build] Link: http://lkml.kernel.org/r/20170202091503.GA22823@dhcp22.suse.cz Link: http://lkml.kernel.org/r/1485542673-24387-3-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Signed-off-by: Michal Hocko Signed-off-by: Arnd Bergmann Acked-by: Hillf Danton Cc: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Mike Kravetz Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/kernel/vdso.c | 2 +- arch/tile/mm/elf.c | 2 +- arch/x86/entry/vdso/vma.c | 2 +- arch/x86/mm/mpx.c | 4 +-- fs/aio.c | 2 +- fs/proc/vmcore.c | 4 +-- fs/userfaultfd.c | 65 ++++++++++++++++++++++++++++++++++++++++ include/linux/mm.h | 14 +++++---- include/linux/userfaultfd_k.h | 18 +++++++++++ include/uapi/linux/userfaultfd.h | 3 ++ ipc/shm.c | 8 ++--- mm/mmap.c | 46 ++++++++++++++++++---------- mm/mremap.c | 23 ++++++++------ mm/nommu.c | 7 +++-- mm/util.c | 5 +++- 15 files changed, 160 insertions(+), 45 deletions(-) (limited to 'include') diff --git a/arch/mips/kernel/vdso.c b/arch/mips/kernel/vdso.c index f9dbfb14af33..093517e85a6c 100644 --- a/arch/mips/kernel/vdso.c +++ b/arch/mips/kernel/vdso.c @@ -111,7 +111,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) base = mmap_region(NULL, STACK_TOP, PAGE_SIZE, VM_READ|VM_WRITE|VM_EXEC| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, - 0); + 0, NULL); if (IS_ERR_VALUE(base)) { ret = base; goto out; diff --git a/arch/tile/mm/elf.c b/arch/tile/mm/elf.c index 6225cc998db1..889901824400 100644 --- a/arch/tile/mm/elf.c +++ b/arch/tile/mm/elf.c @@ -143,7 +143,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, unsigned long addr = MEM_USER_INTRPT; addr = mmap_region(NULL, addr, INTRPT_SIZE, VM_READ|VM_EXEC| - VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, 0); + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, 0, NULL); if (addr > (unsigned long) -PAGE_SIZE) retval = (int) addr; } diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 10820f6cefbf..572cee3fccff 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -186,7 +186,7 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr) if (IS_ERR(vma)) { ret = PTR_ERR(vma); - do_munmap(mm, text_start, image->size); + do_munmap(mm, text_start, image->size, NULL); } else { current->mm->context.vdso = (void __user *)text_start; current->mm->context.vdso_image = image; diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index aad4ac386f98..c98079684bdb 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -51,7 +51,7 @@ static unsigned long mpx_mmap(unsigned long len) down_write(&mm->mmap_sem); addr = do_mmap(NULL, 0, len, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, VM_MPX, 0, &populate); + MAP_ANONYMOUS | MAP_PRIVATE, VM_MPX, 0, &populate, NULL); up_write(&mm->mmap_sem); if (populate) mm_populate(addr, populate); @@ -893,7 +893,7 @@ static int unmap_entire_bt(struct mm_struct *mm, * avoid recursion, do_munmap() will check whether it comes * from one bounds table through VM_MPX flag. */ - return do_munmap(mm, bt_addr, mpx_bt_size_bytes(mm)); + return do_munmap(mm, bt_addr, mpx_bt_size_bytes(mm), NULL); } static int try_unmap_single_bt(struct mm_struct *mm, diff --git a/fs/aio.c b/fs/aio.c index 873b4ca82ccb..7e2ab9c8e39c 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -512,7 +512,7 @@ static int aio_setup_ring(struct kioctx *ctx) ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size, PROT_READ | PROT_WRITE, - MAP_SHARED, 0, &unused); + MAP_SHARED, 0, &unused, NULL); up_write(&mm->mmap_sem); if (IS_ERR((void *)ctx->mmap_base)) { ctx->mmap_size = 0; diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index f52d8e857ff7..885d445afa0d 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -388,7 +388,7 @@ static int remap_oldmem_pfn_checked(struct vm_area_struct *vma, } return 0; fail: - do_munmap(vma->vm_mm, from, len); + do_munmap(vma->vm_mm, from, len, NULL); return -EAGAIN; } @@ -481,7 +481,7 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) return 0; fail: - do_munmap(vma->vm_mm, vma->vm_start, len); + do_munmap(vma->vm_mm, vma->vm_start, len, NULL); return -EAGAIN; } #else diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 8fe601b4875e..4c78458ea78d 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -71,6 +71,13 @@ struct userfaultfd_fork_ctx { struct list_head list; }; +struct userfaultfd_unmap_ctx { + struct userfaultfd_ctx *ctx; + unsigned long start; + unsigned long end; + struct list_head list; +}; + struct userfaultfd_wait_queue { struct uffd_msg msg; wait_queue_t wq; @@ -709,6 +716,64 @@ void userfaultfd_remove(struct vm_area_struct *vma, down_read(&mm->mmap_sem); } +static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps, + unsigned long start, unsigned long end) +{ + struct userfaultfd_unmap_ctx *unmap_ctx; + + list_for_each_entry(unmap_ctx, unmaps, list) + if (unmap_ctx->ctx == ctx && unmap_ctx->start == start && + unmap_ctx->end == end) + return true; + + return false; +} + +int userfaultfd_unmap_prep(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct list_head *unmaps) +{ + for ( ; vma && vma->vm_start < end; vma = vma->vm_next) { + struct userfaultfd_unmap_ctx *unmap_ctx; + struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx; + + if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_UNMAP) || + has_unmap_ctx(ctx, unmaps, start, end)) + continue; + + unmap_ctx = kzalloc(sizeof(*unmap_ctx), GFP_KERNEL); + if (!unmap_ctx) + return -ENOMEM; + + userfaultfd_ctx_get(ctx); + unmap_ctx->ctx = ctx; + unmap_ctx->start = start; + unmap_ctx->end = end; + list_add_tail(&unmap_ctx->list, unmaps); + } + + return 0; +} + +void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf) +{ + struct userfaultfd_unmap_ctx *ctx, *n; + struct userfaultfd_wait_queue ewq; + + list_for_each_entry_safe(ctx, n, uf, list) { + msg_init(&ewq.msg); + + ewq.msg.event = UFFD_EVENT_UNMAP; + ewq.msg.arg.remove.start = ctx->start; + ewq.msg.arg.remove.end = ctx->end; + + userfaultfd_event_wait_completion(ctx->ctx, &ewq); + + list_del(&ctx->list); + kfree(ctx); + } +} + static int userfaultfd_release(struct inode *inode, struct file *file) { struct userfaultfd_ctx *ctx = file->private_data; diff --git a/include/linux/mm.h b/include/linux/mm.h index c65aa43b5712..c6fcba1d1ae5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2090,18 +2090,22 @@ extern int install_special_mapping(struct mm_struct *mm, extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); extern unsigned long mmap_region(struct file *file, unsigned long addr, - unsigned long len, vm_flags_t vm_flags, unsigned long pgoff); + unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, + struct list_head *uf); extern unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, - vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate); -extern int do_munmap(struct mm_struct *, unsigned long, size_t); + vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate, + struct list_head *uf); +extern int do_munmap(struct mm_struct *, unsigned long, size_t, + struct list_head *uf); static inline unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, - unsigned long pgoff, unsigned long *populate) + unsigned long pgoff, unsigned long *populate, + struct list_head *uf) { - return do_mmap(file, addr, len, prot, flags, 0, pgoff, populate); + return do_mmap(file, addr, len, prot, flags, 0, pgoff, populate, uf); } #ifdef CONFIG_MMU diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 2521542f6c07..a40be5d0661b 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -66,6 +66,12 @@ extern void userfaultfd_remove(struct vm_area_struct *vma, unsigned long start, unsigned long end); +extern int userfaultfd_unmap_prep(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct list_head *uf); +extern void userfaultfd_unmap_complete(struct mm_struct *mm, + struct list_head *uf); + #else /* CONFIG_USERFAULTFD */ /* mm helpers */ @@ -118,6 +124,18 @@ static inline void userfaultfd_remove(struct vm_area_struct *vma, unsigned long end) { } + +static inline int userfaultfd_unmap_prep(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct list_head *uf) +{ + return 0; +} + +static inline void userfaultfd_unmap_complete(struct mm_struct *mm, + struct list_head *uf) +{ +} #endif /* CONFIG_USERFAULTFD */ #endif /* _LINUX_USERFAULTFD_K_H */ diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index b742c40c2880..3b059530dac9 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -21,6 +21,7 @@ #define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK | \ UFFD_FEATURE_EVENT_REMAP | \ UFFD_FEATURE_EVENT_REMOVE | \ + UFFD_FEATURE_EVENT_UNMAP | \ UFFD_FEATURE_MISSING_HUGETLBFS | \ UFFD_FEATURE_MISSING_SHMEM) #define UFFD_API_IOCTLS \ @@ -110,6 +111,7 @@ struct uffd_msg { #define UFFD_EVENT_FORK 0x13 #define UFFD_EVENT_REMAP 0x14 #define UFFD_EVENT_REMOVE 0x15 +#define UFFD_EVENT_UNMAP 0x16 /* flags for UFFD_EVENT_PAGEFAULT */ #define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ @@ -158,6 +160,7 @@ struct uffdio_api { #define UFFD_FEATURE_EVENT_REMOVE (1<<3) #define UFFD_FEATURE_MISSING_HUGETLBFS (1<<4) #define UFFD_FEATURE_MISSING_SHMEM (1<<5) +#define UFFD_FEATURE_EVENT_UNMAP (1<<6) __u64 features; __u64 ioctls; diff --git a/ipc/shm.c b/ipc/shm.c index 7f6537b84ef5..d7805acb44fd 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -1222,7 +1222,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr, goto invalid; } - addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate); + addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate, NULL); *raddr = addr; err = 0; if (IS_ERR_VALUE(addr)) @@ -1329,7 +1329,7 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr) */ file = vma->vm_file; size = i_size_read(file_inode(vma->vm_file)); - do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); + do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start, NULL); /* * We discovered the size of the shm segment, so * break out of here and fall through to the next @@ -1356,7 +1356,7 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr) if ((vma->vm_ops == &shm_vm_ops) && ((vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) && (vma->vm_file == file)) - do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); + do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start, NULL); vma = next; } @@ -1365,7 +1365,7 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr) * given */ if (vma && vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) { - do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); + do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start, NULL); retval = 0; } diff --git a/mm/mmap.c b/mm/mmap.c index 13d16a2b7623..1cec28d20583 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -176,7 +176,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) return next; } -static int do_brk(unsigned long addr, unsigned long len); +static int do_brk(unsigned long addr, unsigned long len, struct list_head *uf); SYSCALL_DEFINE1(brk, unsigned long, brk) { @@ -185,6 +185,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) struct mm_struct *mm = current->mm; unsigned long min_brk; bool populate; + LIST_HEAD(uf); if (down_write_killable(&mm->mmap_sem)) return -EINTR; @@ -222,7 +223,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) /* Always allow shrinking brk. */ if (brk <= mm->brk) { - if (!do_munmap(mm, newbrk, oldbrk-newbrk)) + if (!do_munmap(mm, newbrk, oldbrk-newbrk, &uf)) goto set_brk; goto out; } @@ -232,13 +233,14 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) goto out; /* Ok, looks good - let it rip. */ - if (do_brk(oldbrk, newbrk-oldbrk) < 0) + if (do_brk(oldbrk, newbrk-oldbrk, &uf) < 0) goto out; set_brk: mm->brk = brk; populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0; up_write(&mm->mmap_sem); + userfaultfd_unmap_complete(mm, &uf); if (populate) mm_populate(oldbrk, newbrk - oldbrk); return brk; @@ -1304,7 +1306,8 @@ static inline int mlock_future_check(struct mm_struct *mm, unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, vm_flags_t vm_flags, - unsigned long pgoff, unsigned long *populate) + unsigned long pgoff, unsigned long *populate, + struct list_head *uf) { struct mm_struct *mm = current->mm; int pkey = 0; @@ -1447,7 +1450,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, vm_flags |= VM_NORESERVE; } - addr = mmap_region(file, addr, len, vm_flags, pgoff); + addr = mmap_region(file, addr, len, vm_flags, pgoff, uf); if (!IS_ERR_VALUE(addr) && ((vm_flags & VM_LOCKED) || (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE)) @@ -1583,7 +1586,8 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags) } unsigned long mmap_region(struct file *file, unsigned long addr, - unsigned long len, vm_flags_t vm_flags, unsigned long pgoff) + unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, + struct list_head *uf) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; @@ -1609,7 +1613,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, /* Clear old maps */ while (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { - if (do_munmap(mm, addr, len)) + if (do_munmap(mm, addr, len, uf)) return -ENOMEM; } @@ -2579,7 +2583,8 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, * work. This now handles partial unmappings. * Jeremy Fitzhardinge */ -int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) +int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, + struct list_head *uf) { unsigned long end; struct vm_area_struct *vma, *prev, *last; @@ -2603,6 +2608,13 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) if (vma->vm_start >= end) return 0; + if (uf) { + int error = userfaultfd_unmap_prep(vma, start, end, uf); + + if (error) + return error; + } + /* * If we need to split any vma, do it now to save pain later. * @@ -2668,12 +2680,14 @@ int vm_munmap(unsigned long start, size_t len) { int ret; struct mm_struct *mm = current->mm; + LIST_HEAD(uf); if (down_write_killable(&mm->mmap_sem)) return -EINTR; - ret = do_munmap(mm, start, len); + ret = do_munmap(mm, start, len, &uf); up_write(&mm->mmap_sem); + userfaultfd_unmap_complete(mm, &uf); return ret; } EXPORT_SYMBOL(vm_munmap); @@ -2773,7 +2787,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, file = get_file(vma->vm_file); ret = do_mmap_pgoff(vma->vm_file, start, size, - prot, flags, pgoff, &populate); + prot, flags, pgoff, &populate, NULL); fput(file); out: up_write(&mm->mmap_sem); @@ -2799,7 +2813,7 @@ static inline void verify_mm_writelocked(struct mm_struct *mm) * anonymous maps. eventually we may be able to do some * brk-specific accounting here. */ -static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) +static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags, struct list_head *uf) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; @@ -2838,7 +2852,7 @@ static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long */ while (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { - if (do_munmap(mm, addr, len)) + if (do_munmap(mm, addr, len, uf)) return -ENOMEM; } @@ -2885,9 +2899,9 @@ out: return 0; } -static int do_brk(unsigned long addr, unsigned long len) +static int do_brk(unsigned long addr, unsigned long len, struct list_head *uf) { - return do_brk_flags(addr, len, 0); + return do_brk_flags(addr, len, 0, uf); } int vm_brk_flags(unsigned long addr, unsigned long len, unsigned long flags) @@ -2895,13 +2909,15 @@ int vm_brk_flags(unsigned long addr, unsigned long len, unsigned long flags) struct mm_struct *mm = current->mm; int ret; bool populate; + LIST_HEAD(uf); if (down_write_killable(&mm->mmap_sem)) return -EINTR; - ret = do_brk_flags(addr, len, flags); + ret = do_brk_flags(addr, len, flags, &uf); populate = ((mm->def_flags & VM_LOCKED) != 0); up_write(&mm->mmap_sem); + userfaultfd_unmap_complete(mm, &uf); if (populate && !ret) mm_populate(addr, len); return ret; diff --git a/mm/mremap.c b/mm/mremap.c index 8779928d6a70..8233b0105c82 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -252,7 +252,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma, static unsigned long move_vma(struct vm_area_struct *vma, unsigned long old_addr, unsigned long old_len, unsigned long new_len, unsigned long new_addr, - bool *locked, struct vm_userfaultfd_ctx *uf) + bool *locked, struct vm_userfaultfd_ctx *uf, + struct list_head *uf_unmap) { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *new_vma; @@ -341,7 +342,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (unlikely(vma->vm_flags & VM_PFNMAP)) untrack_pfn_moved(vma); - if (do_munmap(mm, old_addr, old_len) < 0) { + if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) { /* OOM: unable to split vma, just get accounts right */ vm_unacct_memory(excess >> PAGE_SHIFT); excess = 0; @@ -417,7 +418,8 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, static unsigned long mremap_to(unsigned long addr, unsigned long old_len, unsigned long new_addr, unsigned long new_len, bool *locked, - struct vm_userfaultfd_ctx *uf) + struct vm_userfaultfd_ctx *uf, + struct list_head *uf_unmap) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; @@ -435,12 +437,12 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, if (addr + old_len > new_addr && new_addr + new_len > addr) goto out; - ret = do_munmap(mm, new_addr, new_len); + ret = do_munmap(mm, new_addr, new_len, NULL); if (ret) goto out; if (old_len >= new_len) { - ret = do_munmap(mm, addr+new_len, old_len - new_len); + ret = do_munmap(mm, addr+new_len, old_len - new_len, uf_unmap); if (ret && old_len != new_len) goto out; old_len = new_len; @@ -462,7 +464,8 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, if (offset_in_page(ret)) goto out1; - ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, uf); + ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, uf, + uf_unmap); if (!(offset_in_page(ret))) goto out; out1: @@ -502,6 +505,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, unsigned long charged = 0; bool locked = false; struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX; + LIST_HEAD(uf_unmap); if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) return ret; @@ -528,7 +532,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, if (flags & MREMAP_FIXED) { ret = mremap_to(addr, old_len, new_addr, new_len, - &locked, &uf); + &locked, &uf, &uf_unmap); goto out; } @@ -538,7 +542,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, * do_munmap does all the needed commit accounting */ if (old_len >= new_len) { - ret = do_munmap(mm, addr+new_len, old_len - new_len); + ret = do_munmap(mm, addr+new_len, old_len - new_len, &uf_unmap); if (ret && old_len != new_len) goto out; ret = addr; @@ -598,7 +602,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, } ret = move_vma(vma, addr, old_len, new_len, new_addr, - &locked, &uf); + &locked, &uf, &uf_unmap); } out: if (offset_in_page(ret)) { @@ -609,5 +613,6 @@ out: if (locked && new_len > old_len) mm_populate(new_addr + old_len, new_len - old_len); mremap_userfaultfd_complete(&uf, addr, new_addr, old_len); + userfaultfd_unmap_complete(mm, &uf_unmap); return ret; } diff --git a/mm/nommu.c b/mm/nommu.c index 215c62296028..fe9f4fa4a7a7 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1205,7 +1205,8 @@ unsigned long do_mmap(struct file *file, unsigned long flags, vm_flags_t vm_flags, unsigned long pgoff, - unsigned long *populate) + unsigned long *populate, + struct list_head *uf) { struct vm_area_struct *vma; struct vm_region *region; @@ -1577,7 +1578,7 @@ static int shrink_vma(struct mm_struct *mm, * - under NOMMU conditions the chunk to be unmapped must be backed by a single * VMA, though it need not cover the whole VMA */ -int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) +int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf) { struct vm_area_struct *vma; unsigned long end; @@ -1643,7 +1644,7 @@ int vm_munmap(unsigned long addr, size_t len) int ret; down_write(&mm->mmap_sem); - ret = do_munmap(mm, addr, len); + ret = do_munmap(mm, addr, len, NULL); up_write(&mm->mmap_sem); return ret; } diff --git a/mm/util.c b/mm/util.c index 3cb2164f4099..b8f538863b5a 100644 --- a/mm/util.c +++ b/mm/util.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -297,14 +298,16 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, unsigned long ret; struct mm_struct *mm = current->mm; unsigned long populate; + LIST_HEAD(uf); ret = security_mmap_file(file, prot, flag); if (!ret) { if (down_write_killable(&mm->mmap_sem)) return -EINTR; ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff, - &populate); + &populate, &uf); up_write(&mm->mmap_sem); + userfaultfd_unmap_complete(mm, &uf); if (populate) mm_populate(ret, populate); } -- cgit v1.2.3 From ca49ca7114553587736fe78319e22f073b631380 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Fri, 24 Feb 2017 14:58:25 -0800 Subject: userfaultfd: non-cooperative: add event for exit() notification Allow userfaultfd monitor track termination of the processes that have memory backed by the uffd. [rppt@linux.vnet.ibm.com: add comment] Link: http://lkml.kernel.org/r/20170202135448.GB19804@rapoport-lnxLink: http://lkml.kernel.org/r/1485542673-24387-4-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Acked-by: Hillf Danton Cc: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Mike Kravetz Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 28 ++++++++++++++++++++++++++++ include/linux/userfaultfd_k.h | 7 +++++++ include/uapi/linux/userfaultfd.h | 5 ++++- kernel/exit.c | 2 ++ 4 files changed, 41 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 4c78458ea78d..b676575f2268 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -774,6 +774,34 @@ void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf) } } +void userfaultfd_exit(struct mm_struct *mm) +{ + struct vm_area_struct *vma = mm->mmap; + + /* + * We can do the vma walk without locking because the caller + * (exit_mm) knows it now has exclusive access + */ + while (vma) { + struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx; + + if (ctx && (ctx->features & UFFD_FEATURE_EVENT_EXIT)) { + struct userfaultfd_wait_queue ewq; + + userfaultfd_ctx_get(ctx); + + msg_init(&ewq.msg); + ewq.msg.event = UFFD_EVENT_EXIT; + + userfaultfd_event_wait_completion(ctx, &ewq); + + ctx->features &= ~UFFD_FEATURE_EVENT_EXIT; + } + + vma = vma->vm_next; + } +} + static int userfaultfd_release(struct inode *inode, struct file *file) { struct userfaultfd_ctx *ctx = file->private_data; diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index a40be5d0661b..0468548acebf 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -72,6 +72,8 @@ extern int userfaultfd_unmap_prep(struct vm_area_struct *vma, extern void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf); +extern void userfaultfd_exit(struct mm_struct *mm); + #else /* CONFIG_USERFAULTFD */ /* mm helpers */ @@ -136,6 +138,11 @@ static inline void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf) { } + +static inline void userfaultfd_exit(struct mm_struct *mm) +{ +} + #endif /* CONFIG_USERFAULTFD */ #endif /* _LINUX_USERFAULTFD_K_H */ diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 3b059530dac9..c055947c5c98 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -18,7 +18,8 @@ * means the userland is reading). */ #define UFFD_API ((__u64)0xAA) -#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK | \ +#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_EXIT | \ + UFFD_FEATURE_EVENT_FORK | \ UFFD_FEATURE_EVENT_REMAP | \ UFFD_FEATURE_EVENT_REMOVE | \ UFFD_FEATURE_EVENT_UNMAP | \ @@ -112,6 +113,7 @@ struct uffd_msg { #define UFFD_EVENT_REMAP 0x14 #define UFFD_EVENT_REMOVE 0x15 #define UFFD_EVENT_UNMAP 0x16 +#define UFFD_EVENT_EXIT 0x17 /* flags for UFFD_EVENT_PAGEFAULT */ #define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ @@ -161,6 +163,7 @@ struct uffdio_api { #define UFFD_FEATURE_MISSING_HUGETLBFS (1<<4) #define UFFD_FEATURE_MISSING_SHMEM (1<<5) #define UFFD_FEATURE_EVENT_UNMAP (1<<6) +#define UFFD_FEATURE_EVENT_EXIT (1<<7) __u64 features; __u64 ioctls; diff --git a/kernel/exit.c b/kernel/exit.c index 9960accbf2ab..90b09ca35c84 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -547,6 +548,7 @@ static void exit_mm(void) enter_lazy_tlb(mm, current); task_unlock(current); mm_update_next_owner(mm); + userfaultfd_exit(mm); mmput(mm); if (test_thread_flag(TIF_MEMDIE)) exit_oom_victim(); -- cgit v1.2.3 From ca96b625341027f611c3e61351a70311077ebcf5 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Fri, 24 Feb 2017 14:58:37 -0800 Subject: mm: alloc_contig_range: allow to specify GFP mask Currently alloc_contig_range assumes that the compaction should be done with the default GFP_KERNEL flags. This is probably right for all current uses of this interface, but may change as CMA is used in more use-cases (including being the default DMA memory allocator on some platforms). Change the function prototype, to allow for passing through the GFP mask set by upper layers. Also respect global restrictions by applying memalloc_noio_flags to the passed in flags. Link: http://lkml.kernel.org/r/20170127172328.18574-1-l.stach@pengutronix.de Signed-off-by: Lucas Stach Acked-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Radim Krcmar Cc: Catalin Marinas Cc: Will Deacon Cc: Chris Zankel Cc: Ralf Baechle Cc: Paolo Bonzini Cc: Alexander Graf Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 2 +- mm/cma.c | 3 ++- mm/hugetlb.c | 3 ++- mm/page_alloc.c | 5 +++-- 4 files changed, 8 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 0fe0b6295ab5..db373b9d3223 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -541,7 +541,7 @@ static inline bool pm_suspended_storage(void) #if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) /* The below functions must be run on a range from a single zone. */ extern int alloc_contig_range(unsigned long start, unsigned long end, - unsigned migratetype); + unsigned migratetype, gfp_t gfp_mask); extern void free_contig_range(unsigned long pfn, unsigned nr_pages); #endif diff --git a/mm/cma.c b/mm/cma.c index 94b3460cd608..c6aed23ca6df 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -402,7 +402,8 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align) pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit); mutex_lock(&cma_mutex); - ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA); + ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA, + GFP_KERNEL); mutex_unlock(&cma_mutex); if (ret == 0) { page = pfn_to_page(pfn); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 167fd0722c15..2e0e8159ce8e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1052,7 +1052,8 @@ static int __alloc_gigantic_page(unsigned long start_pfn, unsigned long nr_pages) { unsigned long end_pfn = start_pfn + nr_pages; - return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE); + return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE, + GFP_KERNEL); } static bool pfn_range_valid_gigantic(struct zone *z, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2d34cdb70f1d..8a0f33624335 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7399,6 +7399,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks * in range must have the same migratetype and it must * be either of the two. + * @gfp_mask: GFP mask to use during compaction * * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES * aligned, however it's the caller's responsibility to guarantee that @@ -7412,7 +7413,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, * need to be freed with free_contig_range(). */ int alloc_contig_range(unsigned long start, unsigned long end, - unsigned migratetype) + unsigned migratetype, gfp_t gfp_mask) { unsigned long outer_start, outer_end; unsigned int order; @@ -7424,7 +7425,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, .zone = page_zone(pfn_to_page(start)), .mode = MIGRATE_SYNC, .ignore_skip_hint = true, - .gfp_mask = GFP_KERNEL, + .gfp_mask = memalloc_noio_flags(gfp_mask), }; INIT_LIST_HEAD(&cc.migratepages); -- cgit v1.2.3 From e2f466e32f56c8b3ee0d1a40cc39e1c779dfd598 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Fri, 24 Feb 2017 14:58:41 -0800 Subject: mm: cma_alloc: allow to specify GFP mask Most users of this interface just want to use it with the default GFP_KERNEL flags, but for cases where DMA memory is allocated it may be called from a different context. No functional change yet, just passing through the flag to the underlying alloc_contig_range function. Link: http://lkml.kernel.org/r/20170127172328.18574-2-l.stach@pengutronix.de Signed-off-by: Lucas Stach Acked-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Radim Krcmar Cc: Catalin Marinas Cc: Will Deacon Cc: Chris Zankel Cc: Ralf Baechle Cc: Paolo Bonzini Cc: Alexander Graf Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/kvm/book3s_hv_builtin.c | 3 ++- drivers/base/dma-contiguous.c | 2 +- include/linux/cma.h | 3 ++- mm/cma.c | 5 +++-- mm/cma_debug.c | 2 +- 5 files changed, 9 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index c42a7e63b39e..4d6c64b3041c 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -56,7 +56,8 @@ struct page *kvm_alloc_hpt_cma(unsigned long nr_pages) { VM_BUG_ON(order_base_2(nr_pages) < KVM_CMA_CHUNK_ORDER - PAGE_SHIFT); - return cma_alloc(kvm_cma, nr_pages, order_base_2(HPT_ALIGN_PAGES)); + return cma_alloc(kvm_cma, nr_pages, order_base_2(HPT_ALIGN_PAGES), + GFP_KERNEL); } EXPORT_SYMBOL_GPL(kvm_alloc_hpt_cma); diff --git a/drivers/base/dma-contiguous.c b/drivers/base/dma-contiguous.c index e167a1e1bccb..d1a9cbabc627 100644 --- a/drivers/base/dma-contiguous.c +++ b/drivers/base/dma-contiguous.c @@ -193,7 +193,7 @@ struct page *dma_alloc_from_contiguous(struct device *dev, size_t count, if (align > CONFIG_CMA_ALIGNMENT) align = CONFIG_CMA_ALIGNMENT; - return cma_alloc(dev_get_cma_area(dev), count, align); + return cma_alloc(dev_get_cma_area(dev), count, align, GFP_KERNEL); } /** diff --git a/include/linux/cma.h b/include/linux/cma.h index 6f0a91b37f68..03f32d0bd1d8 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -29,6 +29,7 @@ extern int __init cma_declare_contiguous(phys_addr_t base, extern int cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, unsigned int order_per_bit, struct cma **res_cma); -extern struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align); +extern struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, + gfp_t gfp_mask); extern bool cma_release(struct cma *cma, const struct page *pages, unsigned int count); #endif diff --git a/mm/cma.c b/mm/cma.c index c6aed23ca6df..2906ae5a83ff 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -357,7 +357,8 @@ err: * This function allocates part of contiguous memory on specific * contiguous memory area. */ -struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align) +struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, + gfp_t gfp_mask) { unsigned long mask, offset; unsigned long pfn = -1; @@ -403,7 +404,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align) pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit); mutex_lock(&cma_mutex); ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA, - GFP_KERNEL); + gfp_mask); mutex_unlock(&cma_mutex); if (ret == 0) { page = pfn_to_page(pfn); diff --git a/mm/cma_debug.c b/mm/cma_debug.c index f8e4b60db167..ffc0c3d0ae64 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c @@ -138,7 +138,7 @@ static int cma_alloc_mem(struct cma *cma, int count) if (!mem) return -ENOMEM; - p = cma_alloc(cma, count, 0); + p = cma_alloc(cma, count, 0, GFP_KERNEL); if (!p) { kfree(mem); return -ENOMEM; -- cgit v1.2.3 From 712c604dcdf8186295e2af694adf52c6842ad100 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Fri, 24 Feb 2017 14:58:44 -0800 Subject: mm: wire up GFP flag passing in dma_alloc_from_contiguous The callers of the DMA alloc functions already provide the proper context GFP flags. Make sure to pass them through to the CMA allocator, to make the CMA compaction context aware. Link: http://lkml.kernel.org/r/20170127172328.18574-3-l.stach@pengutronix.de Signed-off-by: Lucas Stach Acked-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Radim Krcmar Cc: Catalin Marinas Cc: Will Deacon Cc: Chris Zankel Cc: Ralf Baechle Cc: Paolo Bonzini Cc: Alexander Graf Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/dma-mapping.c | 16 +++++++++------- arch/arm64/mm/dma-mapping.c | 4 ++-- arch/mips/mm/dma-default.c | 4 ++-- arch/x86/kernel/pci-dma.c | 3 ++- arch/xtensa/kernel/pci-dma.c | 3 ++- drivers/base/dma-contiguous.c | 5 +++-- drivers/iommu/amd_iommu.c | 2 +- drivers/iommu/intel-iommu.c | 2 +- include/linux/dma-contiguous.h | 4 ++-- 9 files changed, 24 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index 82d3e79ec82b..6ffdf17e0d5c 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -349,7 +349,7 @@ static void __dma_free_buffer(struct page *page, size_t size) static void *__alloc_from_contiguous(struct device *dev, size_t size, pgprot_t prot, struct page **ret_page, const void *caller, bool want_vaddr, - int coherent_flag); + int coherent_flag, gfp_t gfp); static void *__alloc_remap_buffer(struct device *dev, size_t size, gfp_t gfp, pgprot_t prot, struct page **ret_page, @@ -420,7 +420,8 @@ static int __init atomic_pool_init(void) */ if (dev_get_cma_area(NULL)) ptr = __alloc_from_contiguous(NULL, atomic_pool_size, prot, - &page, atomic_pool_init, true, NORMAL); + &page, atomic_pool_init, true, NORMAL, + GFP_KERNEL); else ptr = __alloc_remap_buffer(NULL, atomic_pool_size, gfp, prot, &page, atomic_pool_init, true); @@ -594,14 +595,14 @@ static int __free_from_pool(void *start, size_t size) static void *__alloc_from_contiguous(struct device *dev, size_t size, pgprot_t prot, struct page **ret_page, const void *caller, bool want_vaddr, - int coherent_flag) + int coherent_flag, gfp_t gfp) { unsigned long order = get_order(size); size_t count = size >> PAGE_SHIFT; struct page *page; void *ptr = NULL; - page = dma_alloc_from_contiguous(dev, count, order); + page = dma_alloc_from_contiguous(dev, count, order, gfp); if (!page) return NULL; @@ -655,7 +656,7 @@ static inline pgprot_t __get_dma_pgprot(unsigned long attrs, pgprot_t prot) #define __get_dma_pgprot(attrs, prot) __pgprot(0) #define __alloc_remap_buffer(dev, size, gfp, prot, ret, c, wv) NULL #define __alloc_from_pool(size, ret_page) NULL -#define __alloc_from_contiguous(dev, size, prot, ret, c, wv, coherent_flag) NULL +#define __alloc_from_contiguous(dev, size, prot, ret, c, wv, coherent_flag, gfp) NULL #define __free_from_pool(cpu_addr, size) do { } while (0) #define __free_from_contiguous(dev, page, cpu_addr, size, wv) do { } while (0) #define __dma_free_remap(cpu_addr, size) do { } while (0) @@ -697,7 +698,8 @@ static void *cma_allocator_alloc(struct arm_dma_alloc_args *args, { return __alloc_from_contiguous(args->dev, args->size, args->prot, ret_page, args->caller, - args->want_vaddr, args->coherent_flag); + args->want_vaddr, args->coherent_flag, + args->gfp); } static void cma_allocator_free(struct arm_dma_free_args *args) @@ -1312,7 +1314,7 @@ static struct page **__iommu_alloc_buffer(struct device *dev, size_t size, unsigned long order = get_order(size); struct page *page; - page = dma_alloc_from_contiguous(dev, count, order); + page = dma_alloc_from_contiguous(dev, count, order, gfp); if (!page) goto error; diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index 351f7595cb3e..aff1d0afeb1e 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -107,7 +107,7 @@ static void *__dma_alloc_coherent(struct device *dev, size_t size, void *addr; page = dma_alloc_from_contiguous(dev, size >> PAGE_SHIFT, - get_order(size)); + get_order(size), flags); if (!page) return NULL; @@ -390,7 +390,7 @@ static int __init atomic_pool_init(void) if (dev_get_cma_area(NULL)) page = dma_alloc_from_contiguous(NULL, nr_pages, - pool_size_order); + pool_size_order, GFP_KERNEL); else page = alloc_pages(GFP_DMA, pool_size_order); diff --git a/arch/mips/mm/dma-default.c b/arch/mips/mm/dma-default.c index a39c36af97ad..1895a692efd4 100644 --- a/arch/mips/mm/dma-default.c +++ b/arch/mips/mm/dma-default.c @@ -148,8 +148,8 @@ static void *mips_dma_alloc_coherent(struct device *dev, size_t size, gfp = massage_gfp_flags(dev, gfp); if (IS_ENABLED(CONFIG_DMA_CMA) && gfpflags_allow_blocking(gfp)) - page = dma_alloc_from_contiguous(dev, - count, get_order(size)); + page = dma_alloc_from_contiguous(dev, count, get_order(size), + gfp); if (!page) page = alloc_pages(gfp, get_order(size)); diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index d30c37750765..d5c223c9cf11 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -91,7 +91,8 @@ again: page = NULL; /* CMA can be used only in the context which permits sleeping */ if (gfpflags_allow_blocking(flag)) { - page = dma_alloc_from_contiguous(dev, count, get_order(size)); + page = dma_alloc_from_contiguous(dev, count, get_order(size), + flag); if (page && page_to_phys(page) + size > dma_mask) { dma_release_from_contiguous(dev, page, count); page = NULL; diff --git a/arch/xtensa/kernel/pci-dma.c b/arch/xtensa/kernel/pci-dma.c index 70e362e6038e..34c1f9fa6acc 100644 --- a/arch/xtensa/kernel/pci-dma.c +++ b/arch/xtensa/kernel/pci-dma.c @@ -158,7 +158,8 @@ static void *xtensa_dma_alloc(struct device *dev, size_t size, flag |= GFP_DMA; if (gfpflags_allow_blocking(flag)) - page = dma_alloc_from_contiguous(dev, count, get_order(size)); + page = dma_alloc_from_contiguous(dev, count, get_order(size), + flag); if (!page) page = alloc_pages(flag, get_order(size)); diff --git a/drivers/base/dma-contiguous.c b/drivers/base/dma-contiguous.c index d1a9cbabc627..b55804cac4c4 100644 --- a/drivers/base/dma-contiguous.c +++ b/drivers/base/dma-contiguous.c @@ -181,6 +181,7 @@ int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, * @dev: Pointer to device for which the allocation is performed. * @count: Requested number of pages. * @align: Requested alignment of pages (in PAGE_SIZE order). + * @gfp_mask: GFP flags to use for this allocation. * * This function allocates memory buffer for specified device. It uses * device specific contiguous memory area if available or the default @@ -188,12 +189,12 @@ int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, * function. */ struct page *dma_alloc_from_contiguous(struct device *dev, size_t count, - unsigned int align) + unsigned int align, gfp_t gfp_mask) { if (align > CONFIG_CMA_ALIGNMENT) align = CONFIG_CMA_ALIGNMENT; - return cma_alloc(dev_get_cma_area(dev), count, align, GFP_KERNEL); + return cma_alloc(dev_get_cma_area(dev), count, align, gfp_mask); } /** diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 1b5b8c5361c5..09bd3b290bb8 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -2672,7 +2672,7 @@ static void *alloc_coherent(struct device *dev, size_t size, return NULL; page = dma_alloc_from_contiguous(dev, size >> PAGE_SHIFT, - get_order(size)); + get_order(size), flag); if (!page) return NULL; } diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index f5e02f8e7371..a8f7ae0eb7a4 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -3829,7 +3829,7 @@ static void *intel_alloc_coherent(struct device *dev, size_t size, if (gfpflags_allow_blocking(flags)) { unsigned int count = size >> PAGE_SHIFT; - page = dma_alloc_from_contiguous(dev, count, order); + page = dma_alloc_from_contiguous(dev, count, order, flags); if (page && iommu_no_mapping(dev) && page_to_phys(page) + size > dev->coherent_dma_mask) { dma_release_from_contiguous(dev, page, count); diff --git a/include/linux/dma-contiguous.h b/include/linux/dma-contiguous.h index fec734df1524..b67bf6ac907d 100644 --- a/include/linux/dma-contiguous.h +++ b/include/linux/dma-contiguous.h @@ -112,7 +112,7 @@ static inline int dma_declare_contiguous(struct device *dev, phys_addr_t size, } struct page *dma_alloc_from_contiguous(struct device *dev, size_t count, - unsigned int order); + unsigned int order, gfp_t gfp_mask); bool dma_release_from_contiguous(struct device *dev, struct page *pages, int count); @@ -145,7 +145,7 @@ int dma_declare_contiguous(struct device *dev, phys_addr_t size, static inline struct page *dma_alloc_from_contiguous(struct device *dev, size_t count, - unsigned int order) + unsigned int order, gfp_t gfp_mask) { return NULL; } -- cgit v1.2.3 From def5efe0376501ef7bd6b53ed061512c142e59aa Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Fri, 24 Feb 2017 14:58:47 -0800 Subject: mm, madvise: fail with ENOMEM when splitting vma will hit max_map_count If madvise(2) advice will result in the underlying vma being split and the number of areas mapped by the process will exceed /proc/sys/vm/max_map_count as a result, return ENOMEM instead of EAGAIN. EAGAIN is returned by madvise(2) when a kernel resource, such as slab, is temporarily unavailable. It indicates that userspace should retry the advice in the near future. This is important for advice such as MADV_DONTNEED which is often used by malloc implementations to free memory back to the system: we really do want to free memory back when madvise(2) returns EAGAIN because slab allocations (for vmas, anon_vmas, or mempolicies) cannot be allocated. Encountering /proc/sys/vm/max_map_count is not a temporary failure, however, so return ENOMEM to indicate this is a more serious issue. A followup patch to the man page will specify this behavior. Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1701241431120.42507@chino.kir.corp.google.com Signed-off-by: David Rientjes Cc: Jonathan Corbet Cc: Johannes Weiner Cc: Jerome Marchand Cc: "Kirill A. Shutemov" Cc: Michael Kerrisk Cc: Anshuman Khandual Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 4 ++-- Documentation/vm/ksm.txt | 4 ++++ include/linux/mm.h | 6 ++++-- mm/madvise.c | 51 +++++++++++++++++++++++++++++++++++++-------- mm/mmap.c | 8 +++---- 5 files changed, 56 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 95ccbe6d79ce..b4ad97f10b8e 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -376,8 +376,8 @@ max_map_count: This file contains the maximum number of memory map areas a process may have. Memory map areas are used as a side-effect of calling -malloc, directly by mmap and mprotect, and also when loading shared -libraries. +malloc, directly by mmap, mprotect, and madvise, and also when loading +shared libraries. While most applications need less than a thousand maps, certain programs, particularly malloc debuggers, may consume lots of them, diff --git a/Documentation/vm/ksm.txt b/Documentation/vm/ksm.txt index 0c64a81d6808..6b0ca7feb135 100644 --- a/Documentation/vm/ksm.txt +++ b/Documentation/vm/ksm.txt @@ -38,6 +38,10 @@ the range for whenever the KSM daemon is started; even if the range cannot contain any pages which KSM could actually merge; even if MADV_UNMERGEABLE is applied to a range which was never MADV_MERGEABLE. +If a region of memory must be split into at least one new MADV_MERGEABLE +or MADV_UNMERGEABLE region, the madvise may return ENOMEM if the process +will exceed vm.max_map_count (see Documentation/sysctl/vm.txt). + Like other madvise calls, they are intended for use on mapped areas of the user address space: they will report ENOMEM if the specified range includes unmapped gaps (though working on the intervening mapped areas), diff --git a/include/linux/mm.h b/include/linux/mm.h index c6fcba1d1ae5..ed233b002e33 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2041,8 +2041,10 @@ extern struct vm_area_struct *vma_merge(struct mm_struct *, unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, struct mempolicy *, struct vm_userfaultfd_ctx); extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); -extern int split_vma(struct mm_struct *, - struct vm_area_struct *, unsigned long addr, int new_below); +extern int __split_vma(struct mm_struct *, struct vm_area_struct *, + unsigned long addr, int new_below); +extern int split_vma(struct mm_struct *, struct vm_area_struct *, + unsigned long addr, int new_below); extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *, struct rb_node **, struct rb_node *); diff --git a/mm/madvise.c b/mm/madvise.c index 0012071a6e50..11fc65f81ecd 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -92,14 +92,28 @@ static long madvise_behavior(struct vm_area_struct *vma, case MADV_MERGEABLE: case MADV_UNMERGEABLE: error = ksm_madvise(vma, start, end, behavior, &new_flags); - if (error) + if (error) { + /* + * madvise() returns EAGAIN if kernel resources, such as + * slab, are temporarily unavailable. + */ + if (error == -ENOMEM) + error = -EAGAIN; goto out; + } break; case MADV_HUGEPAGE: case MADV_NOHUGEPAGE: error = hugepage_madvise(vma, &new_flags, behavior); - if (error) + if (error) { + /* + * madvise() returns EAGAIN if kernel resources, such as + * slab, are temporarily unavailable. + */ + if (error == -ENOMEM) + error = -EAGAIN; goto out; + } break; } @@ -120,15 +134,37 @@ static long madvise_behavior(struct vm_area_struct *vma, *prev = vma; if (start != vma->vm_start) { - error = split_vma(mm, vma, start, 1); - if (error) + if (unlikely(mm->map_count >= sysctl_max_map_count)) { + error = -ENOMEM; goto out; + } + error = __split_vma(mm, vma, start, 1); + if (error) { + /* + * madvise() returns EAGAIN if kernel resources, such as + * slab, are temporarily unavailable. + */ + if (error == -ENOMEM) + error = -EAGAIN; + goto out; + } } if (end != vma->vm_end) { - error = split_vma(mm, vma, end, 0); - if (error) + if (unlikely(mm->map_count >= sysctl_max_map_count)) { + error = -ENOMEM; + goto out; + } + error = __split_vma(mm, vma, end, 0); + if (error) { + /* + * madvise() returns EAGAIN if kernel resources, such as + * slab, are temporarily unavailable. + */ + if (error == -ENOMEM) + error = -EAGAIN; goto out; + } } success: @@ -136,10 +172,7 @@ success: * vm_flags is protected by the mmap_sem held in write mode. */ vma->vm_flags = new_flags; - out: - if (error == -ENOMEM) - error = -EAGAIN; return error; } diff --git a/mm/mmap.c b/mm/mmap.c index 1cec28d20583..499b988b1639 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2499,11 +2499,11 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, } /* - * __split_vma() bypasses sysctl_max_map_count checking. We use this on the - * munmap path where it doesn't make sense to fail. + * __split_vma() bypasses sysctl_max_map_count checking. We use this where it + * has already been checked or doesn't make sense to fail. */ -static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, int new_below) +int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, int new_below) { struct vm_area_struct *new; int err; -- cgit v1.2.3 From 288bc54949fc2625a4fd811a188fb200cc498946 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 24 Feb 2017 14:59:16 -0800 Subject: mm/autonuma: let architecture override how the write bit should be stashed in a protnone pte. Patch series "Numabalancing preserve write fix", v2. This patch series address an issue w.r.t THP migration and autonuma preserve write feature. migrate_misplaced_transhuge_page() cannot deal with concurrent modification of the page. It does a page copy without following the migration pte sequence. IIUC, this was done to keep the migration simpler and at the time of implemenation we didn't had THP page cache which would have required a more elaborate migration scheme. That means thp autonuma migration expect the protnone with saved write to be done such that both kernel and user cannot update the page content. This patch series enables archs like ppc64 to do that. We are good with the hash translation mode with the current code, because we never create a hardware page table entry for a protnone pte. This patch (of 2): Autonuma preserves the write permission across numa fault to avoid taking a writefault after a numa fault (Commit: b191f9b106ea " mm: numa: preserve PTE write permissions across a NUMA hinting fault"). Architecture can implement protnone in different ways and some may choose to implement that by clearing Read/ Write/Exec bit of pte. Setting the write bit on such pte can result in wrong behaviour. Fix this up by allowing arch to override how to save the write bit on a protnone pte. [aneesh.kumar@linux.vnet.ibm.com: don't mark pte saved write in case of dirty_accountable] Link: http://lkml.kernel.org/r/1487942884-16517-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com [aneesh.kumar@linux.vnet.ibm.com: v3] Link: http://lkml.kernel.org/r/1487498625-10891-2-git-send-email-aneesh.kumar@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/1487050314-3892-2-git-send-email-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Acked-by: Michael Neuling Cc: Rik van Riel Cc: Mel Gorman Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/pgtable.h | 16 ++++++++++++++++ mm/huge_memory.c | 6 +++--- mm/memory.c | 2 +- mm/mprotect.c | 2 +- 4 files changed, 21 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index a0aba0f9c57b..6265411eb2ed 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -225,6 +225,22 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres } #endif +#ifndef pte_savedwrite +#define pte_savedwrite pte_write +#endif + +#ifndef pte_mk_savedwrite +#define pte_mk_savedwrite pte_mkwrite +#endif + +#ifndef pmd_savedwrite +#define pmd_savedwrite pmd_write +#endif + +#ifndef pmd_mk_savedwrite +#define pmd_mk_savedwrite pmd_mkwrite +#endif + #ifndef __HAVE_ARCH_PMDP_SET_WRPROTECT #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline void pmdp_set_wrprotect(struct mm_struct *mm, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index efddd02141a8..92c2ee2dfcf8 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1426,7 +1426,7 @@ int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) } /* See similar comment in do_numa_page for explanation */ - if (!pmd_write(pmd)) + if (!pmd_savedwrite(pmd)) flags |= TNF_NO_GROUP; /* @@ -1489,7 +1489,7 @@ int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) goto out; clear_pmdnuma: BUG_ON(!PageLocked(page)); - was_writable = pmd_write(pmd); + was_writable = pmd_savedwrite(pmd); pmd = pmd_modify(pmd, vma->vm_page_prot); pmd = pmd_mkyoung(pmd); if (was_writable) @@ -1744,7 +1744,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd); entry = pmd_modify(entry, newprot); if (preserve_write) - entry = pmd_mkwrite(entry); + entry = pmd_mk_savedwrite(entry); ret = HPAGE_PMD_NR; set_pmd_at(mm, addr, pmd, entry); BUG_ON(vma_is_anonymous(vma) && !preserve_write && diff --git a/mm/memory.c b/mm/memory.c index 0c759ba122e1..14fc0b40f0bb 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3401,7 +3401,7 @@ static int do_numa_page(struct vm_fault *vmf) int target_nid; bool migrated = false; pte_t pte; - bool was_writable = pte_write(vmf->orig_pte); + bool was_writable = pte_savedwrite(vmf->orig_pte); int flags = 0; /* diff --git a/mm/mprotect.c b/mm/mprotect.c index a45b4dc6a7f5..848e946b08e5 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -99,7 +99,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, ptent = ptep_modify_prot_start(mm, addr, pte); ptent = pte_modify(ptent, newprot); if (preserve_write) - ptent = pte_mkwrite(ptent); + ptent = pte_mk_savedwrite(ptent); /* Avoid taking write faults for known dirty pages */ if (dirty_accountable && pte_dirty(ptent) && -- cgit v1.2.3 From 595cd8f256d24face93b2722927ec9c980419c26 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 24 Feb 2017 14:59:19 -0800 Subject: mm/ksm: handle protnone saved writes when making page write protect Without this KSM will consider the page write protected, but a numa fault can later mark the page writable. This can result in memory corruption. Link: http://lkml.kernel.org/r/1487498625-10891-3-git-send-email-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/pgtable.h | 8 ++++++++ mm/ksm.c | 9 +++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 6265411eb2ed..f4ca23b158b3 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -233,6 +233,10 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres #define pte_mk_savedwrite pte_mkwrite #endif +#ifndef pte_clear_savedwrite +#define pte_clear_savedwrite pte_wrprotect +#endif + #ifndef pmd_savedwrite #define pmd_savedwrite pmd_write #endif @@ -241,6 +245,10 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres #define pmd_mk_savedwrite pmd_mkwrite #endif +#ifndef pmd_clear_savedwrite +#define pmd_clear_savedwrite pmd_wrprotect +#endif + #ifndef __HAVE_ARCH_PMDP_SET_WRPROTECT #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline void pmdp_set_wrprotect(struct mm_struct *mm, diff --git a/mm/ksm.c b/mm/ksm.c index 8960f6ecbc12..cf211c01ceac 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -880,7 +880,8 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?")) goto out_unlock; - if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte)) { + if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) || + (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte))) { pte_t entry; swapped = PageSwapCache(page); @@ -905,7 +906,11 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, } if (pte_dirty(entry)) set_page_dirty(page); - entry = pte_mkclean(pte_wrprotect(entry)); + + if (pte_protnone(entry)) + entry = pte_mkclean(pte_clear_savedwrite(entry)); + else + entry = pte_mkclean(pte_wrprotect(entry)); set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry); } *orig_pte = *pvmw.pte; -- cgit v1.2.3 From 3a4f8a0b3ffa733ffbb327685e83b63383127cf6 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 24 Feb 2017 14:59:36 -0800 Subject: mm: remove shmem_mapping() shmem_zero_setup() duplicates Remove the prototypes for shmem_mapping() and shmem_zero_setup() from linux/mm.h, since they are already provided in linux/shmem_fs.h. But shmem_fs.h must then provide the inline stub for shmem_mapping() when CONFIG_SHMEM is not set, and a few more cfiles now need to #include it. Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1702081658250.1549@eggly.anvils Signed-off-by: Hugh Dickins Cc: Johannes Weiner Cc: Michal Simek Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/microblaze/pci/pci-common.c | 1 + arch/powerpc/kernel/pci-common.c | 1 + include/linux/mm.h | 10 ---------- include/linux/shmem_fs.h | 7 +++++++ mm/madvise.c | 1 + mm/memcontrol.c | 1 + mm/mincore.c | 1 + mm/truncate.c | 1 + mm/workingset.c | 1 + 9 files changed, 14 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/arch/microblaze/pci/pci-common.c b/arch/microblaze/pci/pci-common.c index 7f696f97f9dd..13bc93242c0c 100644 --- a/arch/microblaze/pci/pci-common.c +++ b/arch/microblaze/pci/pci-common.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index 74bec5498972..a3f5334f5d8c 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/mm.h b/include/linux/mm.h index ed233b002e33..0d65dd72c0f4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1168,16 +1168,6 @@ extern void pagefault_out_of_memory(void); extern void show_free_areas(unsigned int flags, nodemask_t *nodemask); -int shmem_zero_setup(struct vm_area_struct *); -#ifdef CONFIG_SHMEM -bool shmem_mapping(struct address_space *mapping); -#else -static inline bool shmem_mapping(struct address_space *mapping) -{ - return false; -} -#endif - extern bool can_do_mlock(void); extern int user_shm_lock(size_t, struct user_struct *); extern void user_shm_unlock(size_t, struct user_struct *); diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index fdaac9d4d46d..a7d6bd2a918f 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -57,7 +57,14 @@ extern int shmem_zero_setup(struct vm_area_struct *); extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); extern int shmem_lock(struct file *file, int lock, struct user_struct *user); +#ifdef CONFIG_SHMEM extern bool shmem_mapping(struct address_space *mapping); +#else +static inline bool shmem_mapping(struct address_space *mapping) +{ + return false; +} +#endif /* CONFIG_SHMEM */ extern void shmem_unlock_mapping(struct address_space *mapping); extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); diff --git a/mm/madvise.c b/mm/madvise.c index 11fc65f81ecd..dc5927c812d3 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1fd6affcdde7..45867e439d31 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/mincore.c b/mm/mincore.c index ddb872da3f5b..c5687c45c326 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include diff --git a/mm/truncate.c b/mm/truncate.c index dd7b24e083c5..f2db67465495 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -20,6 +20,7 @@ #include #include /* grr. try_to_release_page, do_invalidatepage */ +#include #include #include #include "internal.h" diff --git a/mm/workingset.c b/mm/workingset.c index a67f5796b995..79ed5364375d 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -6,6 +6,7 @@ #include #include +#include #include #include #include -- cgit v1.2.3 From dc18d706a4367454ad1fc51e06148d54e8ecfaa0 Mon Sep 17 00:00:00 2001 From: Nathan Fontenot Date: Fri, 24 Feb 2017 15:00:02 -0800 Subject: memory-hotplug: use dev_online for memhp_auto_online Commit 31bc3858ea3e ("add automatic onlining policy for the newly added memory") provides the capability to have added memory automatically onlined during add, but this appears to be slightly broken. The current implementation uses walk_memory_range() to call online_memory_block, which uses memory_block_change_state() to online the memory. Instead, we should be calling device_online() for the memory block in online_memory_block(). This would online the memory (the memory bus online routine memory_subsys_online() called from device_online calls memory_block_change_state()) and properly update the device struct offline flag. As a result of the current implementation, attempting to remove a memory block after adding it using auto online fails. This is because doing a remove, for instance echo offline > /sys/devices/system/memory/memoryXXX/state uses device_offline() which checks the dev->offline flag. Link: http://lkml.kernel.org/r/20170222220744.8119.19687.stgit@ltcalpine2-lp14.aus.stglabs.ibm.com Signed-off-by: Nathan Fontenot Cc: Michael Ellerman Cc: Michael Roth Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 2 +- include/linux/memory.h | 3 --- mm/memory_hotplug.c | 2 +- 3 files changed, 2 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index fa26ffd25fa6..cc4f1d0cbffe 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -249,7 +249,7 @@ memory_block_action(unsigned long phys_index, unsigned long action, int online_t return ret; } -int memory_block_change_state(struct memory_block *mem, +static int memory_block_change_state(struct memory_block *mem, unsigned long to_state, unsigned long from_state_req) { int ret = 0; diff --git a/include/linux/memory.h b/include/linux/memory.h index 093607f90b91..b723a686fc10 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -109,9 +109,6 @@ extern void unregister_memory_notifier(struct notifier_block *nb); extern int register_memory_isolate_notifier(struct notifier_block *nb); extern void unregister_memory_isolate_notifier(struct notifier_block *nb); extern int register_new_memory(int, struct mem_section *); -extern int memory_block_change_state(struct memory_block *mem, - unsigned long to_state, - unsigned long from_state_req); #ifdef CONFIG_MEMORY_HOTREMOVE extern int unregister_memory_section(struct mem_section *); #endif diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c35dd1976574..1d3ed58f92ab 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1337,7 +1337,7 @@ int zone_for_memory(int nid, u64 start, u64 size, int zone_default, static int online_memory_block(struct memory_block *mem, void *arg) { - return memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE); + return device_online(&mem->dev); } /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ -- cgit v1.2.3 From f9fa1d919c696e90c887d8742198023e7639d139 Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Fri, 24 Feb 2017 15:00:05 -0800 Subject: kasan: drain quarantine of memcg slab objects Per memcg slab accounting and kasan have a problem with kmem_cache destruction. - kmem_cache_create() allocates a kmem_cache, which is used for allocations from processes running in root (top) memcg. - Processes running in non root memcg and allocating with either __GFP_ACCOUNT or from a SLAB_ACCOUNT cache use a per memcg kmem_cache. - Kasan catches use-after-free by having kfree() and kmem_cache_free() defer freeing of objects. Objects are placed in a quarantine. - kmem_cache_destroy() destroys root and non root kmem_caches. It takes care to drain the quarantine of objects from the root memcg's kmem_cache, but ignores objects associated with non root memcg. This causes leaks because quarantined per memcg objects refer to per memcg kmem cache being destroyed. To see the problem: 1) create a slab cache with kmem_cache_create(,,,SLAB_ACCOUNT,) 2) from non root memcg, allocate and free a few objects from cache 3) dispose of the cache with kmem_cache_destroy() kmem_cache_destroy() will trigger a "Slab cache still has objects" warning indicating that the per memcg kmem_cache structure was leaked. Fix the leak by draining kasan quarantined objects allocated from non root memcg. Racing memcg deletion is tricky, but handled. kmem_cache_destroy() => shutdown_memcg_caches() => __shutdown_memcg_cache() => shutdown_cache() flushes per memcg quarantined objects, even if that memcg has been rmdir'd and gone through memcg_deactivate_kmem_caches(). This leak only affects destroyed SLAB_ACCOUNT kmem caches when kasan is enabled. So I don't think it's worth patching stable kernels. Link: http://lkml.kernel.org/r/1482257462-36948-1-git-send-email-gthelen@google.com Signed-off-by: Greg Thelen Reviewed-by: Vladimir Davydov Acked-by: Andrey Ryabinin Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 4 ++-- mm/kasan/kasan.c | 2 +- mm/kasan/quarantine.c | 1 + mm/slab_common.c | 4 +++- 4 files changed, 7 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 820c0ad54a01..c908b25bf5a5 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -52,7 +52,7 @@ void kasan_free_pages(struct page *page, unsigned int order); void kasan_cache_create(struct kmem_cache *cache, size_t *size, unsigned long *flags); void kasan_cache_shrink(struct kmem_cache *cache); -void kasan_cache_destroy(struct kmem_cache *cache); +void kasan_cache_shutdown(struct kmem_cache *cache); void kasan_poison_slab(struct page *page); void kasan_unpoison_object_data(struct kmem_cache *cache, void *object); @@ -98,7 +98,7 @@ static inline void kasan_cache_create(struct kmem_cache *cache, size_t *size, unsigned long *flags) {} static inline void kasan_cache_shrink(struct kmem_cache *cache) {} -static inline void kasan_cache_destroy(struct kmem_cache *cache) {} +static inline void kasan_cache_shutdown(struct kmem_cache *cache) {} static inline void kasan_poison_slab(struct page *page) {} static inline void kasan_unpoison_object_data(struct kmem_cache *cache, diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index b2a0cff2bb35..25f0e6521f36 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -435,7 +435,7 @@ void kasan_cache_shrink(struct kmem_cache *cache) quarantine_remove_cache(cache); } -void kasan_cache_destroy(struct kmem_cache *cache) +void kasan_cache_shutdown(struct kmem_cache *cache) { quarantine_remove_cache(cache); } diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c index dae929c02bbb..6f1ed1630873 100644 --- a/mm/kasan/quarantine.c +++ b/mm/kasan/quarantine.c @@ -274,6 +274,7 @@ static void per_cpu_remove_cache(void *arg) qlist_free_all(&to_free, cache); } +/* Free all quarantined objects belonging to cache. */ void quarantine_remove_cache(struct kmem_cache *cache) { unsigned long flags, i; diff --git a/mm/slab_common.c b/mm/slab_common.c index 23ff74e61838..09d0e849b07f 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -528,6 +528,9 @@ static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work) static int shutdown_cache(struct kmem_cache *s) { + /* free asan quarantined objects */ + kasan_cache_shutdown(s); + if (__kmem_cache_shutdown(s) != 0) return -EBUSY; @@ -816,7 +819,6 @@ void kmem_cache_destroy(struct kmem_cache *s) get_online_cpus(); get_online_mems(); - kasan_cache_destroy(s); mutex_lock(&slab_mutex); s->refcount--; -- cgit v1.2.3 From 796f571b0c5cf3efd2f652779770fa7bbbc2bb03 Mon Sep 17 00:00:00 2001 From: Lafcadio Wluiki Date: Fri, 24 Feb 2017 15:00:23 -0800 Subject: procfs: use an enum for possible hidepid values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, the hidepid parameter was checked by comparing literal integers 0, 1, 2. Let's add a proper enum for this, to make the checking more expressive: 0 → HIDEPID_OFF 1 → HIDEPID_NO_ACCESS 2 → HIDEPID_INVISIBLE This changes the internal labelling only, the userspace-facing interface remains unmodified, and still works with literal integers 0, 1, 2. No functional changes. Link: http://lkml.kernel.org/r/1484572984-13388-2-git-send-email-djalal@gmail.com Signed-off-by: Lafcadio Wluiki Signed-off-by: Djalal Harouni Acked-by: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 8 ++++---- fs/proc/inode.c | 2 +- fs/proc/root.c | 3 ++- include/linux/pid_namespace.h | 6 ++++++ 4 files changed, 13 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/fs/proc/base.c b/fs/proc/base.c index 4ecb5edc3c61..b8f06273353e 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -697,11 +697,11 @@ static int proc_pid_permission(struct inode *inode, int mask) task = get_proc_task(inode); if (!task) return -ESRCH; - has_perms = has_pid_permissions(pid, task, 1); + has_perms = has_pid_permissions(pid, task, HIDEPID_NO_ACCESS); put_task_struct(task); if (!has_perms) { - if (pid->hide_pid == 2) { + if (pid->hide_pid == HIDEPID_INVISIBLE) { /* * Let's make getdents(), stat(), and open() * consistent with each other. If a process @@ -1737,7 +1737,7 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) stat->gid = GLOBAL_ROOT_GID; task = pid_task(proc_pid(inode), PIDTYPE_PID); if (task) { - if (!has_pid_permissions(pid, task, 2)) { + if (!has_pid_permissions(pid, task, HIDEPID_INVISIBLE)) { rcu_read_unlock(); /* * This doesn't prevent learning whether PID exists, @@ -3168,7 +3168,7 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx) int len; cond_resched(); - if (!has_pid_permissions(ns, iter.task, 2)) + if (!has_pid_permissions(ns, iter.task, HIDEPID_INVISIBLE)) continue; len = snprintf(name, sizeof(name), "%d", iter.tgid); diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 7ad9ed7958af..2cc7a8030275 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -107,7 +107,7 @@ static int proc_show_options(struct seq_file *seq, struct dentry *root) if (!gid_eq(pid->pid_gid, GLOBAL_ROOT_GID)) seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, pid->pid_gid)); - if (pid->hide_pid != 0) + if (pid->hide_pid != HIDEPID_OFF) seq_printf(seq, ",hidepid=%u", pid->hide_pid); return 0; diff --git a/fs/proc/root.c b/fs/proc/root.c index 1988440b2049..b90da888b81a 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -58,7 +58,8 @@ int proc_parse_options(char *options, struct pid_namespace *pid) case Opt_hidepid: if (match_int(&args[0], &option)) return 0; - if (option < 0 || option > 2) { + if (option < HIDEPID_OFF || + option > HIDEPID_INVISIBLE) { pr_err("proc: hidepid value must be between 0 and 2.\n"); return 0; } diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 34cce96741bc..c2a989dee876 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -21,6 +21,12 @@ struct pidmap { struct fs_pin; +enum { /* definitions for pid_namespace's hide_pid field */ + HIDEPID_OFF = 0, + HIDEPID_NO_ACCESS = 1, + HIDEPID_INVISIBLE = 2, +}; + struct pid_namespace { struct kref kref; struct pidmap pidmap[PIDMAP_ENTRIES]; -- cgit v1.2.3 From 1ca5eebb894a3625b2a543c7b550aa4ae33ba3cc Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Fri, 24 Feb 2017 15:00:26 -0800 Subject: uapi: mqueue.h: add missing linux/types.h include Commit 63159f5dcccb ("uapi: Use __kernel_long_t in struct mq_attr") changed the types from long to __kernel_long_t, but didn't add a linux/types.h include. Code that tries to include this header directly breaks: /usr/include/linux/mqueue.h:26:2: error: unknown type name '__kernel_long_t' __kernel_long_t mq_flags; /* message queue flags */ This also upsets configure tests for this header: checking linux/mqueue.h usability... no checking linux/mqueue.h presence... yes configure: WARNING: linux/mqueue.h: present but cannot be compiled configure: WARNING: linux/mqueue.h: check for missing prerequisite headers? configure: WARNING: linux/mqueue.h: see the Autoconf documentation configure: WARNING: linux/mqueue.h: section "Present But Cannot Be Compiled" configure: WARNING: linux/mqueue.h: proceeding with the compiler's result checking for linux/mqueue.h... no Link: http://lkml.kernel.org/r/20170119194644.4403-1-vapier@gentoo.org Signed-off-by: Mike Frysinger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/mqueue.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/mqueue.h b/include/uapi/linux/mqueue.h index d0a2b8e89813..bbd5116ea739 100644 --- a/include/uapi/linux/mqueue.h +++ b/include/uapi/linux/mqueue.h @@ -18,6 +18,8 @@ #ifndef _LINUX_MQUEUE_H #define _LINUX_MQUEUE_H +#include + #define MQ_PRIO_MAX 32768 /* per-uid limit of kernel memory used by mqueue, in bytes */ #define MQ_BYTES_MAX 819200 -- cgit v1.2.3 From 16f4e3195156f2f06e90d00a36bc48d7a513f253 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 24 Feb 2017 15:00:29 -0800 Subject: include/linux/iopoll.h: include instead of The timer APIs this header needs are ktime_get(), ktime_add_us(), and ktime_compare(). So, including seems enough. This commit will cut unnecessary header file parsing. Link: http://lkml.kernel.org/r/1481679225-10885-1-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/iopoll.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/iopoll.h b/include/linux/iopoll.h index 1c30014ed176..d29e1e21bf3f 100644 --- a/include/linux/iopoll.h +++ b/include/linux/iopoll.h @@ -17,7 +17,7 @@ #include #include -#include +#include #include #include #include -- cgit v1.2.3 From a3f0825e7e37d99a02a8a1b1599687667ee50d04 Mon Sep 17 00:00:00 2001 From: Gideon Israel Dsouza Date: Fri, 24 Feb 2017 15:00:32 -0800 Subject: compiler-gcc.h: add a new macro to wrap gcc attribute Add __mode(x) into compiler-gcc.h as part of a cleanup task I've taken up, to replace gcc specific attributes with macros. The next patch is a cleanup of the m68k subsystem and it requires a new macro to wrap __attribute__ ((mode (...))) Link: http://lkml.kernel.org/r/1485540901-1988-2-git-send-email-gidisrael@gmail.com Signed-off-by: Gideon Israel Dsouza Cc: Greg Ungerer Cc: Geert Uytterhoeven Cc: Paul Gortmaker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compiler-gcc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index fddd1a5eb322..811f7a915658 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -122,6 +122,7 @@ #define __attribute_const__ __attribute__((__const__)) #define __maybe_unused __attribute__((unused)) #define __always_unused __attribute__((unused)) +#define __mode(x) __attribute__((mode(x))) /* gcc version specific checks */ -- cgit v1.2.3 From 85caa95b9f19bb3a26d7e025d1134760b69e0c40 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 24 Feb 2017 15:00:38 -0800 Subject: bug: switch data corruption check to __must_check The CHECK_DATA_CORRUPTION() macro was designed to have callers do something meaningful/protective on failure. However, using "return false" in the macro too strictly limits the design patterns of callers. Instead, let callers handle the logic test directly, but make sure that the result IS checked by forcing __must_check (which appears to not be able to be used directly on macro expressions). Link: http://lkml.kernel.org/r/20170206204547.GA125312@beast Signed-off-by: Kees Cook Suggested-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bug.h | 12 +++++++----- lib/list_debug.c | 45 ++++++++++++++++++++++++--------------------- 2 files changed, 31 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/linux/bug.h b/include/linux/bug.h index baff2e8fc8a8..5828489309bb 100644 --- a/include/linux/bug.h +++ b/include/linux/bug.h @@ -124,18 +124,20 @@ static inline enum bug_trap_type report_bug(unsigned long bug_addr, /* * Since detected data corruption should stop operation on the affected - * structures, this returns false if the corruption condition is found. + * structures. Return value must be checked and sanely acted on by caller. */ +static inline __must_check bool check_data_corruption(bool v) { return v; } #define CHECK_DATA_CORRUPTION(condition, fmt, ...) \ - do { \ - if (unlikely(condition)) { \ + check_data_corruption(({ \ + bool corruption = unlikely(condition); \ + if (corruption) { \ if (IS_ENABLED(CONFIG_BUG_ON_DATA_CORRUPTION)) { \ pr_err(fmt, ##__VA_ARGS__); \ BUG(); \ } else \ WARN(1, fmt, ##__VA_ARGS__); \ - return false; \ } \ - } while (0) + corruption; \ + })) #endif /* _LINUX_BUG_H */ diff --git a/lib/list_debug.c b/lib/list_debug.c index 7f7bfa55eb6d..a34db8d27667 100644 --- a/lib/list_debug.c +++ b/lib/list_debug.c @@ -20,15 +20,16 @@ bool __list_add_valid(struct list_head *new, struct list_head *prev, struct list_head *next) { - CHECK_DATA_CORRUPTION(next->prev != prev, - "list_add corruption. next->prev should be prev (%p), but was %p. (next=%p).\n", - prev, next->prev, next); - CHECK_DATA_CORRUPTION(prev->next != next, - "list_add corruption. prev->next should be next (%p), but was %p. (prev=%p).\n", - next, prev->next, prev); - CHECK_DATA_CORRUPTION(new == prev || new == next, - "list_add double add: new=%p, prev=%p, next=%p.\n", - new, prev, next); + if (CHECK_DATA_CORRUPTION(next->prev != prev, + "list_add corruption. next->prev should be prev (%p), but was %p. (next=%p).\n", + prev, next->prev, next) || + CHECK_DATA_CORRUPTION(prev->next != next, + "list_add corruption. prev->next should be next (%p), but was %p. (prev=%p).\n", + next, prev->next, prev) || + CHECK_DATA_CORRUPTION(new == prev || new == next, + "list_add double add: new=%p, prev=%p, next=%p.\n", + new, prev, next)) + return false; return true; } @@ -41,18 +42,20 @@ bool __list_del_entry_valid(struct list_head *entry) prev = entry->prev; next = entry->next; - CHECK_DATA_CORRUPTION(next == LIST_POISON1, - "list_del corruption, %p->next is LIST_POISON1 (%p)\n", - entry, LIST_POISON1); - CHECK_DATA_CORRUPTION(prev == LIST_POISON2, - "list_del corruption, %p->prev is LIST_POISON2 (%p)\n", - entry, LIST_POISON2); - CHECK_DATA_CORRUPTION(prev->next != entry, - "list_del corruption. prev->next should be %p, but was %p\n", - entry, prev->next); - CHECK_DATA_CORRUPTION(next->prev != entry, - "list_del corruption. next->prev should be %p, but was %p\n", - entry, next->prev); + if (CHECK_DATA_CORRUPTION(next == LIST_POISON1, + "list_del corruption, %p->next is LIST_POISON1 (%p)\n", + entry, LIST_POISON1) || + CHECK_DATA_CORRUPTION(prev == LIST_POISON2, + "list_del corruption, %p->prev is LIST_POISON2 (%p)\n", + entry, LIST_POISON2) || + CHECK_DATA_CORRUPTION(prev->next != entry, + "list_del corruption. prev->next should be %p, but was %p\n", + entry, prev->next) || + CHECK_DATA_CORRUPTION(next->prev != entry, + "list_del corruption. next->prev should be %p, but was %p\n", + entry, next->prev)) + return false; + return true; } -- cgit v1.2.3 From 4f5901f5a6724f4ed1641e4a94978c5039a1f8c4 Mon Sep 17 00:00:00 2001 From: Niklas Söderlund Date: Fri, 24 Feb 2017 15:01:01 -0800 Subject: linux/kernel.h: fix DIV_ROUND_CLOSEST to support negative divisors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While working on a thermal driver I encounter a scenario where the divisor could be negative, instead of adding local code to handle this I though I first try to add support for this in DIV_ROUND_CLOSEST. Add support to DIV_ROUND_CLOSEST for negative divisors if both dividend and divisor variable types are signed. This should not alter current behavior for users of the macro as previously negative divisors where not supported. Before: DIV_ROUND_CLOSEST( 59, 4) = 15 DIV_ROUND_CLOSEST( 59, -4) = -14 DIV_ROUND_CLOSEST( -59, 4) = -15 DIV_ROUND_CLOSEST( -59, -4) = 14 After: DIV_ROUND_CLOSEST( 59, 4) = 15 DIV_ROUND_CLOSEST( 59, -4) = -15 DIV_ROUND_CLOSEST( -59, 4) = -15 DIV_ROUND_CLOSEST( -59, -4) = 15 [akpm@linux-foundation.org: fix comment, per Guenter] Link: http://lkml.kernel.org/r/20161222102217.29011-1-niklas.soderlund+renesas@ragnatech.se Signed-off-by: Niklas Söderlund Reviewed-by: Guenter Roeck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index cb09238f6d32..4c26dc3a8295 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -100,16 +100,18 @@ ) /* - * Divide positive or negative dividend by positive divisor and round - * to closest integer. Result is undefined for negative divisors and - * for negative dividends if the divisor variable type is unsigned. + * Divide positive or negative dividend by positive or negative divisor + * and round to closest integer. Result is undefined for negative + * divisors if he dividend variable type is unsigned and for negative + * dividends if the divisor variable type is unsigned. */ #define DIV_ROUND_CLOSEST(x, divisor)( \ { \ typeof(x) __x = x; \ typeof(divisor) __d = divisor; \ (((typeof(x))-1) > 0 || \ - ((typeof(divisor))-1) > 0 || (__x) > 0) ? \ + ((typeof(divisor))-1) > 0 || \ + (((__x) > 0) == ((__d) > 0))) ? \ (((__x) + ((__d) / 2)) / (__d)) : \ (((__x) - ((__d) / 2)) / (__d)); \ } \ -- cgit v1.2.3 From f231aebfc4cae2f6ed27a46a31e2630909513d77 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 24 Feb 2017 15:01:04 -0800 Subject: rbtree: use designated initializers Prepare to mark sensitive kernel structures for randomization by making sure they're using designated initializers. These were identified during allyesconfig builds of x86, arm, and arm64, with most initializer fixes extracted from grsecurity. Link: http://lkml.kernel.org/r/20161217010253.GA140470@beast Signed-off-by: Kees Cook Acked-by: Peter Zijlstra (Intel) Cc: David Howells Cc: Jie Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rbtree_augmented.h | 4 +++- lib/rbtree.c | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h index d076183e49be..9702b6e183bc 100644 --- a/include/linux/rbtree_augmented.h +++ b/include/linux/rbtree_augmented.h @@ -90,7 +90,9 @@ rbname ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \ old->rbaugmented = rbcompute(old); \ } \ rbstatic const struct rb_augment_callbacks rbname = { \ - rbname ## _propagate, rbname ## _copy, rbname ## _rotate \ + .propagate = rbname ## _propagate, \ + .copy = rbname ## _copy, \ + .rotate = rbname ## _rotate \ }; diff --git a/lib/rbtree.c b/lib/rbtree.c index 1f8b112a7c35..4ba2828a67c0 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -427,7 +427,9 @@ static inline void dummy_copy(struct rb_node *old, struct rb_node *new) {} static inline void dummy_rotate(struct rb_node *old, struct rb_node *new) {} static const struct rb_augment_callbacks dummy_callbacks = { - dummy_propagate, dummy_copy, dummy_rotate + .propagate = dummy_propagate, + .copy = dummy_copy, + .rotate = dummy_rotate }; void rb_insert_color(struct rb_node *node, struct rb_root *root) -- cgit v1.2.3 From 4e1a33b105ddf201f66dcc44490c6086a25eca0b Mon Sep 17 00:00:00 2001 From: Sven Schmidt <4sschmid@informatik.uni-hamburg.de> Date: Fri, 24 Feb 2017 15:01:12 -0800 Subject: lib: update LZ4 compressor module Patch series "Update LZ4 compressor module", v7. This patchset updates the LZ4 compression module to a version based on LZ4 v1.7.3 allowing to use the fast compression algorithm aka LZ4 fast which provides an "acceleration" parameter as a tradeoff between high compression ratio and high compression speed. We want to use LZ4 fast in order to support compression in lustre and (mostly, based on that) investigate data reduction techniques in behalf of storage systems. Also, it will be useful for other users of LZ4 compression, as with LZ4 fast it is possible to enable applications to use fast and/or high compression depending on the usecase. For instance, ZRAM is offering a LZ4 backend and could benefit from an updated LZ4 in the kernel. LZ4 homepage: http://www.lz4.org/ LZ4 source repository: https://github.com/lz4/lz4 Source version: 1.7.3 Benchmark (taken from [1], Core i5-4300U @1.9GHz): ----------------|--------------|----------------|---------- Compressor | Compression | Decompression | Ratio ----------------|--------------|----------------|---------- memcpy | 4200 MB/s | 4200 MB/s | 1.000 LZ4 fast 50 | 1080 MB/s | 2650 MB/s | 1.375 LZ4 fast 17 | 680 MB/s | 2220 MB/s | 1.607 LZ4 fast 5 | 475 MB/s | 1920 MB/s | 1.886 LZ4 default | 385 MB/s | 1850 MB/s | 2.101 [1] http://fastcompression.blogspot.de/2015/04/sampling-or-faster-lz4.html [PATCH 1/5] lib: Update LZ4 compressor module [PATCH 2/5] lib/decompress_unlz4: Change module to work with new LZ4 module version [PATCH 3/5] crypto: Change LZ4 modules to work with new LZ4 module version [PATCH 4/5] fs/pstore: fs/squashfs: Change usage of LZ4 to work with new LZ4 version [PATCH 5/5] lib/lz4: Remove back-compat wrappers This patch (of 5): Update the LZ4 kernel module to LZ4 v1.7.3 by Yann Collet. The kernel module is inspired by the previous work by Chanho Min. The updated LZ4 module will not break existing code since the patchset contains appropriate changes. API changes: New method LZ4_compress_fast which differs from the variant available in kernel by the new acceleration parameter, allowing to trade compression ratio for more compression speed and vice versa. LZ4_decompress_fast is the respective decompression method, featuring a very fast decoder (multiple GB/s per core), able to reach RAM speed in multi-core systems. The decompressor allows to decompress data compressed with LZ4 fast as well as the LZ4 HC (high compression) algorithm. Also the useful functions LZ4_decompress_safe_partial and LZ4_compress_destsize were added. The latter reverses the logic by trying to compress as much data as possible from source to dest while the former aims to decompress partial blocks of data. A bunch of streaming functions were also added which allow compressig/decompressing data in multiple steps (so called "streaming mode"). The methods lz4_compress and lz4_decompress_unknownoutputsize are now known as LZ4_compress_default respectivley LZ4_decompress_safe. The old methods will be removed since there's no callers left in the code. [arnd@arndb.de: fix KERNEL_LZ4 support] Link: http://lkml.kernel.org/r/20170208211946.2839649-1-arnd@arndb.de [akpm@linux-foundation.org: simplify] [akpm@linux-foundation.org: fix the simplification] [4sschmid@informatik.uni-hamburg.de: fix performance regressions] Link: http://lkml.kernel.org/r/1486898178-17125-2-git-send-email-4sschmid@informatik.uni-hamburg.de [4sschmid@informatik.uni-hamburg.de: v8] Link: http://lkml.kernel.org/r/1487182598-15351-2-git-send-email-4sschmid@informatik.uni-hamburg.de Link: http://lkml.kernel.org/r/1486321748-19085-2-git-send-email-4sschmid@informatik.uni-hamburg.de Signed-off-by: Sven Schmidt <4sschmid@informatik.uni-hamburg.de> Signed-off-by: Arnd Bergmann Cc: Bongkyu Kim Cc: Rui Salvaterra Cc: Sergey Senozhatsky Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: David S. Miller Cc: Anton Vorontsov Cc: Colin Cross Cc: Kees Cook Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/lz4.h | 762 +++++++++++++++++++++++++++--- lib/lz4/Makefile | 2 + lib/lz4/lz4_compress.c | 1161 +++++++++++++++++++++++++++++++++------------- lib/lz4/lz4_decompress.c | 705 ++++++++++++++++++---------- lib/lz4/lz4defs.h | 338 ++++++++------ lib/lz4/lz4hc_compress.c | 867 ++++++++++++++++++++++------------ 6 files changed, 2758 insertions(+), 1077 deletions(-) (limited to 'include') diff --git a/include/linux/lz4.h b/include/linux/lz4.h index 6b784c59f321..1b0f8ca0f9c8 100644 --- a/include/linux/lz4.h +++ b/include/linux/lz4.h @@ -1,87 +1,717 @@ -#ifndef __LZ4_H__ -#define __LZ4_H__ -/* - * LZ4 Kernel Interface +/* LZ4 Kernel Interface * * Copyright (C) 2013, LG Electronics, Kyungsik Lee + * Copyright (C) 2016, Sven Schmidt <4sschmid@informatik.uni-hamburg.de> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. + * + * This file is based on the original header file + * for LZ4 - Fast LZ compression algorithm. + * + * LZ4 - Fast LZ compression algorithm + * Copyright (C) 2011-2016, Yann Collet. + * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * You can contact the author at : + * - LZ4 homepage : http://www.lz4.org + * - LZ4 source repository : https://github.com/lz4/lz4 */ -#define LZ4_MEM_COMPRESS (16384) -#define LZ4HC_MEM_COMPRESS (262144 + (2 * sizeof(unsigned char *))) +#ifndef __LZ4_H__ +#define __LZ4_H__ + +#include +#include /* memset, memcpy */ + +/*-************************************************************************ + * CONSTANTS + **************************************************************************/ /* - * lz4_compressbound() - * Provides the maximum size that LZ4 may output in a "worst case" scenario - * (input data not compressible) + * LZ4_MEMORY_USAGE : + * Memory usage formula : N->2^N Bytes + * (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.) + * Increasing memory usage improves compression ratio + * Reduced memory usage can improve speed, due to cache effect + * Default value is 14, for 16KB, which nicely fits into Intel x86 L1 cache */ -static inline size_t lz4_compressbound(size_t isize) +#define LZ4_MEMORY_USAGE 14 + +#define LZ4_MAX_INPUT_SIZE 0x7E000000 /* 2 113 929 216 bytes */ +#define LZ4_COMPRESSBOUND(isize) (\ + (unsigned int)(isize) > (unsigned int)LZ4_MAX_INPUT_SIZE \ + ? 0 \ + : (isize) + ((isize)/255) + 16) + +#define LZ4_ACCELERATION_DEFAULT 1 +#define LZ4_HASHLOG (LZ4_MEMORY_USAGE-2) +#define LZ4_HASHTABLESIZE (1 << LZ4_MEMORY_USAGE) +#define LZ4_HASH_SIZE_U32 (1 << LZ4_HASHLOG) + +#define LZ4HC_MIN_CLEVEL 3 +#define LZ4HC_DEFAULT_CLEVEL 9 +#define LZ4HC_MAX_CLEVEL 16 + +#define LZ4HC_DICTIONARY_LOGSIZE 16 +#define LZ4HC_MAXD (1<= LZ4_compressBound(inputSize). + * It also runs faster, so it's a recommended setting. + * If the function cannot compress 'source' into a more limited 'dest' budget, + * compression stops *immediately*, and the function result is zero. + * As a consequence, 'dest' content is not valid. + * + * Return: Number of bytes written into buffer 'dest' + * (necessarily <= maxOutputSize) or 0 if compression fails + */ +int LZ4_compress_default(const char *source, char *dest, int inputSize, + int maxOutputSize, void *wrkmem); + +/** + * LZ4_compress_fast() - As LZ4_compress_default providing an acceleration param + * @source: source address of the original data + * @dest: output buffer address of the compressed data + * @inputSize: size of the input data. Max supported value is LZ4_MAX_INPUT_SIZE + * @maxOutputSize: full or partial size of buffer 'dest' + * which must be already allocated + * @acceleration: acceleration factor + * @wrkmem: address of the working memory. + * This requires 'workmem' of LZ4_MEM_COMPRESS. + * + * Same as LZ4_compress_default(), but allows to select an "acceleration" + * factor. The larger the acceleration value, the faster the algorithm, + * but also the lesser the compression. It's a trade-off. It can be fine tuned, + * with each successive value providing roughly +~3% to speed. + * An acceleration value of "1" is the same as regular LZ4_compress_default() + * Values <= 0 will be replaced by LZ4_ACCELERATION_DEFAULT, which is 1. + * + * Return: Number of bytes written into buffer 'dest' + * (necessarily <= maxOutputSize) or 0 if compression fails + */ +int LZ4_compress_fast(const char *source, char *dest, int inputSize, + int maxOutputSize, int acceleration, void *wrkmem); + +/** + * LZ4_compress_destSize() - Compress as much data as possible + * from source to dest + * @source: source address of the original data + * @dest: output buffer address of the compressed data + * @sourceSizePtr: will be modified to indicate how many bytes where read + * from 'source' to fill 'dest'. New value is necessarily <= old value. + * @targetDestSize: Size of buffer 'dest' which must be already allocated + * @wrkmem: address of the working memory. + * This requires 'workmem' of LZ4_MEM_COMPRESS. + * + * Reverse the logic, by compressing as much data as possible + * from 'source' buffer into already allocated buffer 'dest' + * of size 'targetDestSize'. + * This function either compresses the entire 'source' content into 'dest' + * if it's large enough, or fill 'dest' buffer completely with as much data as + * possible from 'source'. + * + * Return: Number of bytes written into 'dest' (necessarily <= targetDestSize) + * or 0 if compression fails + */ +int LZ4_compress_destSize(const char *source, char *dest, int *sourceSizePtr, + int targetDestSize, void *wrkmem); /* - * lz4_decompress() - * src : source address of the compressed data - * src_len : is the input size, whcih is returned after decompress done - * dest : output buffer address of the decompressed data - * actual_dest_len: is the size of uncompressed data, supposing it's known - * return : Success if return 0 - * Error if return (< 0) - * note : Destination buffer must be already allocated. - * slightly faster than lz4_decompress_unknownoutputsize() + * lz4_compress() - For backward compatibility, see LZ4_compress_default + * @src: source address of the original data + * @src_len: size of the original data + * @dst: output buffer address of the compressed data. This requires 'dst' + * of size LZ4_COMPRESSBOUND + * @dst_len: is the output size, which is returned after compress done + * @workmem: address of the working memory. + * + * Return: Success if return 0, Error if return < 0 */ -int lz4_decompress(const unsigned char *src, size_t *src_len, - unsigned char *dest, size_t actual_dest_len); +int lz4_compress(const unsigned char *src, size_t src_len, unsigned char *dst, + size_t *dst_len, void *wrkmem); + +/*-************************************************************************ + * Decompression Functions + **************************************************************************/ + +/** + * LZ4_decompress_fast() - Decompresses data from 'source' into 'dest' + * @source: source address of the compressed data + * @dest: output buffer address of the uncompressed data + * which must be already allocated with 'originalSize' bytes + * @originalSize: is the original and therefore uncompressed size + * + * Decompresses data from 'source' into 'dest'. + * This function fully respect memory boundaries for properly formed + * compressed data. + * It is a bit faster than LZ4_decompress_safe(). + * However, it does not provide any protection against intentionally + * modified data stream (malicious input). + * Use this function in trusted environment only + * (data to decode comes from a trusted source). + * + * Return: number of bytes read from the source buffer + * or a negative result if decompression fails. + */ +int LZ4_decompress_fast(const char *source, char *dest, int originalSize); + +/** + * LZ4_decompress_safe() - Decompression protected against buffer overflow + * @source: source address of the compressed data + * @dest: output buffer address of the uncompressed data + * which must be already allocated + * @compressedSize: is the precise full size of the compressed block + * @maxDecompressedSize: is the size of 'dest' buffer + * + * Decompresses data fom 'source' into 'dest'. + * If the source stream is detected malformed, the function will + * stop decoding and return a negative result. + * This function is protected against buffer overflow exploits, + * including malicious data packets. It never writes outside output buffer, + * nor reads outside input buffer. + * + * Return: number of bytes decompressed into destination buffer + * (necessarily <= maxDecompressedSize) + * or a negative result in case of error + */ +int LZ4_decompress_safe(const char *source, char *dest, int compressedSize, + int maxDecompressedSize); + +/** + * LZ4_decompress_safe_partial() - Decompress a block of size 'compressedSize' + * at position 'source' into buffer 'dest' + * @source: source address of the compressed data + * @dest: output buffer address of the decompressed data which must be + * already allocated + * @compressedSize: is the precise full size of the compressed block. + * @targetOutputSize: the decompression operation will try + * to stop as soon as 'targetOutputSize' has been reached + * @maxDecompressedSize: is the size of destination buffer + * + * This function decompresses a compressed block of size 'compressedSize' + * at position 'source' into destination buffer 'dest' + * of size 'maxDecompressedSize'. + * The function tries to stop decompressing operation as soon as + * 'targetOutputSize' has been reached, reducing decompression time. + * This function never writes outside of output buffer, + * and never reads outside of input buffer. + * It is therefore protected against malicious data packets. + * + * Return: the number of bytes decoded in the destination buffer + * (necessarily <= maxDecompressedSize) + * or a negative result in case of error + * + */ +int LZ4_decompress_safe_partial(const char *source, char *dest, + int compressedSize, int targetOutputSize, int maxDecompressedSize); /* - * lz4_decompress_unknownoutputsize() - * src : source address of the compressed data - * src_len : is the input size, therefore the compressed size - * dest : output buffer address of the decompressed data - * dest_len: is the max size of the destination buffer, which is - * returned with actual size of decompressed data after - * decompress done - * return : Success if return 0 - * Error if return (< 0) - * note : Destination buffer must be already allocated. + * lz4_decompress_unknownoutputsize() - For backwards compatibility, + * see LZ4_decompress_safe + * @src: source address of the compressed data + * @src_len: is the input size, therefore the compressed size + * @dest: output buffer address of the decompressed data + * which must be already allocated + * @dest_len: is the max size of the destination buffer, which is + * returned with actual size of decompressed data after decompress done + * + * Return: Success if return 0, Error if return (< 0) */ int lz4_decompress_unknownoutputsize(const unsigned char *src, size_t src_len, - unsigned char *dest, size_t *dest_len); + unsigned char *dest, size_t *dest_len); + +/** + * lz4_decompress() - For backwards cocmpatibility, see LZ4_decompress_fast + * @src: source address of the compressed data + * @src_len: is the input size, which is returned after decompress done + * @dest: output buffer address of the decompressed data, + * which must be already allocated + * @actual_dest_len: is the size of uncompressed data, supposing it's known + * + * Return: Success if return 0, Error if return (< 0) + */ +int lz4_decompress(const unsigned char *src, size_t *src_len, + unsigned char *dest, size_t actual_dest_len); + +/*-************************************************************************ + * LZ4 HC Compression + **************************************************************************/ + +/** + * LZ4_compress_HC() - Compress data from `src` into `dst`, using HC algorithm + * @src: source address of the original data + * @dst: output buffer address of the compressed data + * @srcSize: size of the input data. Max supported value is LZ4_MAX_INPUT_SIZE + * @dstCapacity: full or partial size of buffer 'dst', + * which must be already allocated + * @compressionLevel: Recommended values are between 4 and 9, although any + * value between 1 and LZ4HC_MAX_CLEVEL will work. + * Values >LZ4HC_MAX_CLEVEL behave the same as 16. + * @wrkmem: address of the working memory. + * This requires 'wrkmem' of size LZ4HC_MEM_COMPRESS. + * + * Compress data from 'src' into 'dst', using the more powerful + * but slower "HC" algorithm. Compression is guaranteed to succeed if + * `dstCapacity >= LZ4_compressBound(srcSize) + * + * Return : the number of bytes written into 'dst' or 0 if compression fails. + */ +int LZ4_compress_HC(const char *src, char *dst, int srcSize, int dstCapacity, + int compressionLevel, void *wrkmem); + +/** + * lz4hc_compress() - For backwards compatibility, see LZ4_compress_HC + * @src: source address of the original data + * @src_len: size of the original data + * @dst: output buffer address of the compressed data. This requires 'dst' + * of size LZ4_COMPRESSBOUND. + * @dst_len: is the output size, which is returned after compress done + * @wrkmem: address of the working memory. + * This requires 'workmem' of size LZ4HC_MEM_COMPRESS. + * + * Return : Success if return 0, Error if return (< 0) + */ +int lz4hc_compress(const unsigned char *src, size_t src_len, unsigned char *dst, + size_t *dst_len, void *wrkmem); + +/** + * LZ4_resetStreamHC() - Init an allocated 'LZ4_streamHC_t' structure + * @streamHCPtr: pointer to the 'LZ4_streamHC_t' structure + * @compressionLevel: Recommended values are between 4 and 9, although any + * value between 1 and LZ4HC_MAX_CLEVEL will work. + * Values >LZ4HC_MAX_CLEVEL behave the same as 16. + * + * An LZ4_streamHC_t structure can be allocated once + * and re-used multiple times. + * Use this function to init an allocated `LZ4_streamHC_t` structure + * and start a new compression. + */ +void LZ4_resetStreamHC(LZ4_streamHC_t *streamHCPtr, int compressionLevel); + +/** + * LZ4_loadDictHC() - Load a static dictionary into LZ4_streamHC + * @streamHCPtr: pointer to the LZ4HC_stream_t + * @dictionary: dictionary to load + * @dictSize: size of dictionary + * + * Use this function to load a static dictionary into LZ4HC_stream. + * Any previous data will be forgotten, only 'dictionary' + * will remain in memory. + * Loading a size of 0 is allowed. + * + * Return : dictionary size, in bytes (necessarily <= 64 KB) + */ +int LZ4_loadDictHC(LZ4_streamHC_t *streamHCPtr, const char *dictionary, + int dictSize); + +/** + * LZ4_compress_HC_continue() - Compress 'src' using data from previously + * compressed blocks as a dictionary using the HC algorithm + * @streamHCPtr: Pointer to the previous 'LZ4_streamHC_t' structure + * @src: source address of the original data + * @dst: output buffer address of the compressed data, + * which must be already allocated + * @srcSize: size of the input data. Max supported value is LZ4_MAX_INPUT_SIZE + * @maxDstSize: full or partial size of buffer 'dest' + * which must be already allocated + * + * These functions compress data in successive blocks of any size, using + * previous blocks as dictionary. One key assumption is that previous + * blocks (up to 64 KB) remain read-accessible while + * compressing next blocks. There is an exception for ring buffers, + * which can be smaller than 64 KB. + * Ring buffers scenario is automatically detected and handled by + * LZ4_compress_HC_continue(). + * Before starting compression, state must be properly initialized, + * using LZ4_resetStreamHC(). + * A first "fictional block" can then be designated as + * initial dictionary, using LZ4_loadDictHC() (Optional). + * Then, use LZ4_compress_HC_continue() + * to compress each successive block. Previous memory blocks + * (including initial dictionary when present) must remain accessible + * and unmodified during compression. + * 'dst' buffer should be sized to handle worst case scenarios, using + * LZ4_compressBound(), to ensure operation success. + * If, for any reason, previous data blocks can't be preserved unmodified + * in memory during next compression block, + * you must save it to a safer memory space, using LZ4_saveDictHC(). + * Return value of LZ4_saveDictHC() is the size of dictionary + * effectively saved into 'safeBuffer'. + * + * Return: Number of bytes written into buffer 'dst' or 0 if compression fails + */ +int LZ4_compress_HC_continue(LZ4_streamHC_t *streamHCPtr, const char *src, + char *dst, int srcSize, int maxDstSize); + +/** + * LZ4_saveDictHC() - Save static dictionary from LZ4HC_stream + * @streamHCPtr: pointer to the 'LZ4HC_stream_t' structure + * @safeBuffer: buffer to save dictionary to, must be already allocated + * @maxDictSize: size of 'safeBuffer' + * + * If previously compressed data block is not guaranteed + * to remain available at its memory location, + * save it into a safer place (char *safeBuffer). + * Note : you don't need to call LZ4_loadDictHC() afterwards, + * dictionary is immediately usable, you can therefore call + * LZ4_compress_HC_continue(). + * + * Return : saved dictionary size in bytes (necessarily <= maxDictSize), + * or 0 if error. + */ +int LZ4_saveDictHC(LZ4_streamHC_t *streamHCPtr, char *safeBuffer, + int maxDictSize); + +/*-********************************************* + * Streaming Compression Functions + ***********************************************/ + +/** + * LZ4_resetStream() - Init an allocated 'LZ4_stream_t' structure + * @LZ4_stream: pointer to the 'LZ4_stream_t' structure + * + * An LZ4_stream_t structure can be allocated once + * and re-used multiple times. + * Use this function to init an allocated `LZ4_stream_t` structure + * and start a new compression. + */ +void LZ4_resetStream(LZ4_stream_t *LZ4_stream); + +/** + * LZ4_loadDict() - Load a static dictionary into LZ4_stream + * @streamPtr: pointer to the LZ4_stream_t + * @dictionary: dictionary to load + * @dictSize: size of dictionary + * + * Use this function to load a static dictionary into LZ4_stream. + * Any previous data will be forgotten, only 'dictionary' + * will remain in memory. + * Loading a size of 0 is allowed. + * + * Return : dictionary size, in bytes (necessarily <= 64 KB) + */ +int LZ4_loadDict(LZ4_stream_t *streamPtr, const char *dictionary, + int dictSize); + +/** + * LZ4_saveDict() - Save static dictionary from LZ4_stream + * @streamPtr: pointer to the 'LZ4_stream_t' structure + * @safeBuffer: buffer to save dictionary to, must be already allocated + * @dictSize: size of 'safeBuffer' + * + * If previously compressed data block is not guaranteed + * to remain available at its memory location, + * save it into a safer place (char *safeBuffer). + * Note : you don't need to call LZ4_loadDict() afterwards, + * dictionary is immediately usable, you can therefore call + * LZ4_compress_fast_continue(). + * + * Return : saved dictionary size in bytes (necessarily <= dictSize), + * or 0 if error. + */ +int LZ4_saveDict(LZ4_stream_t *streamPtr, char *safeBuffer, int dictSize); + +/** + * LZ4_compress_fast_continue() - Compress 'src' using data from previously + * compressed blocks as a dictionary + * @streamPtr: Pointer to the previous 'LZ4_stream_t' structure + * @src: source address of the original data + * @dst: output buffer address of the compressed data, + * which must be already allocated + * @srcSize: size of the input data. Max supported value is LZ4_MAX_INPUT_SIZE + * @maxDstSize: full or partial size of buffer 'dest' + * which must be already allocated + * @acceleration: acceleration factor + * + * Compress buffer content 'src', using data from previously compressed blocks + * as dictionary to improve compression ratio. + * Important : Previous data blocks are assumed to still + * be present and unmodified ! + * If maxDstSize >= LZ4_compressBound(srcSize), + * compression is guaranteed to succeed, and runs faster. + * + * Return: Number of bytes written into buffer 'dst' or 0 if compression fails + */ +int LZ4_compress_fast_continue(LZ4_stream_t *streamPtr, const char *src, + char *dst, int srcSize, int maxDstSize, int acceleration); + +/** + * LZ4_setStreamDecode() - Instruct where to find dictionary + * @LZ4_streamDecode: the 'LZ4_streamDecode_t' structure + * @dictionary: dictionary to use + * @dictSize: size of dictionary + * + * Use this function to instruct where to find the dictionary. + * Setting a size of 0 is allowed (same effect as reset). + * + * Return: 1 if OK, 0 if error + */ +int LZ4_setStreamDecode(LZ4_streamDecode_t *LZ4_streamDecode, + const char *dictionary, int dictSize); + +/** + * LZ4_decompress_fast_continue() - Decompress blocks in streaming mode + * @LZ4_streamDecode: the 'LZ4_streamDecode_t' structure + * @source: source address of the compressed data + * @dest: output buffer address of the uncompressed data + * which must be already allocated + * @compressedSize: is the precise full size of the compressed block + * @maxDecompressedSize: is the size of 'dest' buffer + * + * These decoding function allows decompression of multiple blocks + * in "streaming" mode. + * Previously decoded blocks *must* remain available at the memory position + * where they were decoded (up to 64 KB) + * In the case of a ring buffers, decoding buffer must be either : + * - Exactly same size as encoding buffer, with same update rule + * (block boundaries at same positions) In which case, + * the decoding & encoding ring buffer can have any size, + * including very small ones ( < 64 KB). + * - Larger than encoding buffer, by a minimum of maxBlockSize more bytes. + * maxBlockSize is implementation dependent. + * It's the maximum size you intend to compress into a single block. + * In which case, encoding and decoding buffers do not need + * to be synchronized, and encoding ring buffer can have any size, + * including small ones ( < 64 KB). + * - _At least_ 64 KB + 8 bytes + maxBlockSize. + * In which case, encoding and decoding buffers do not need to be + * synchronized, and encoding ring buffer can have any size, + * including larger than decoding buffer. W + * Whenever these conditions are not possible, save the last 64KB of decoded + * data into a safe buffer, and indicate where it is saved + * using LZ4_setStreamDecode() + * + * Return: number of bytes decompressed into destination buffer + * (necessarily <= maxDecompressedSize) + * or a negative result in case of error + */ +int LZ4_decompress_safe_continue(LZ4_streamDecode_t *LZ4_streamDecode, + const char *source, char *dest, int compressedSize, + int maxDecompressedSize); + +/** + * LZ4_decompress_fast_continue() - Decompress blocks in streaming mode + * @LZ4_streamDecode: the 'LZ4_streamDecode_t' structure + * @source: source address of the compressed data + * @dest: output buffer address of the uncompressed data + * which must be already allocated with 'originalSize' bytes + * @originalSize: is the original and therefore uncompressed size + * + * These decoding function allows decompression of multiple blocks + * in "streaming" mode. + * Previously decoded blocks *must* remain available at the memory position + * where they were decoded (up to 64 KB) + * In the case of a ring buffers, decoding buffer must be either : + * - Exactly same size as encoding buffer, with same update rule + * (block boundaries at same positions) In which case, + * the decoding & encoding ring buffer can have any size, + * including very small ones ( < 64 KB). + * - Larger than encoding buffer, by a minimum of maxBlockSize more bytes. + * maxBlockSize is implementation dependent. + * It's the maximum size you intend to compress into a single block. + * In which case, encoding and decoding buffers do not need + * to be synchronized, and encoding ring buffer can have any size, + * including small ones ( < 64 KB). + * - _At least_ 64 KB + 8 bytes + maxBlockSize. + * In which case, encoding and decoding buffers do not need to be + * synchronized, and encoding ring buffer can have any size, + * including larger than decoding buffer. W + * Whenever these conditions are not possible, save the last 64KB of decoded + * data into a safe buffer, and indicate where it is saved + * using LZ4_setStreamDecode() + * + * Return: number of bytes decompressed into destination buffer + * (necessarily <= maxDecompressedSize) + * or a negative result in case of error + */ +int LZ4_decompress_fast_continue(LZ4_streamDecode_t *LZ4_streamDecode, + const char *source, char *dest, int originalSize); + +/** + * LZ4_decompress_safe_usingDict() - Same as LZ4_setStreamDecode() + * followed by LZ4_decompress_safe_continue() + * @source: source address of the compressed data + * @dest: output buffer address of the uncompressed data + * which must be already allocated + * @compressedSize: is the precise full size of the compressed block + * @maxDecompressedSize: is the size of 'dest' buffer + * @dictStart: pointer to the start of the dictionary in memory + * @dictSize: size of dictionary + * + * These decoding function works the same as + * a combination of LZ4_setStreamDecode() followed by + * LZ4_decompress_safe_continue() + * It is stand-alone, and don'tn eed a LZ4_streamDecode_t structure. + * + * Return: number of bytes decompressed into destination buffer + * (necessarily <= maxDecompressedSize) + * or a negative result in case of error + */ +int LZ4_decompress_safe_usingDict(const char *source, char *dest, + int compressedSize, int maxDecompressedSize, const char *dictStart, + int dictSize); + +/** + * LZ4_decompress_fast_usingDict() - Same as LZ4_setStreamDecode() + * followed by LZ4_decompress_fast_continue() + * @source: source address of the compressed data + * @dest: output buffer address of the uncompressed data + * which must be already allocated with 'originalSize' bytes + * @originalSize: is the original and therefore uncompressed size + * @dictStart: pointer to the start of the dictionary in memory + * @dictSize: size of dictionary + * + * These decoding function works the same as + * a combination of LZ4_setStreamDecode() followed by + * LZ4_decompress_safe_continue() + * It is stand-alone, and don'tn eed a LZ4_streamDecode_t structure. + * + * Return: number of bytes decompressed into destination buffer + * (necessarily <= maxDecompressedSize) + * or a negative result in case of error + */ +int LZ4_decompress_fast_usingDict(const char *source, char *dest, + int originalSize, const char *dictStart, int dictSize); + #endif diff --git a/lib/lz4/Makefile b/lib/lz4/Makefile index 8085d04e9309..f7b113271d13 100644 --- a/lib/lz4/Makefile +++ b/lib/lz4/Makefile @@ -1,3 +1,5 @@ +ccflags-y += -O3 + obj-$(CONFIG_LZ4_COMPRESS) += lz4_compress.o obj-$(CONFIG_LZ4HC_COMPRESS) += lz4hc_compress.o obj-$(CONFIG_LZ4_DECOMPRESS) += lz4_decompress.o diff --git a/lib/lz4/lz4_compress.c b/lib/lz4/lz4_compress.c index 28321d8f75ef..53f313f05185 100644 --- a/lib/lz4/lz4_compress.c +++ b/lib/lz4/lz4_compress.c @@ -1,19 +1,16 @@ /* * LZ4 - Fast LZ compression algorithm - * Copyright (C) 2011-2012, Yann Collet. - * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) - + * Copyright (C) 2011 - 2016, Yann Collet. + * BSD 2 - Clause License (http://www.opensource.org/licenses/bsd - license.php) * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. - * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -25,417 +22,939 @@ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * * You can contact the author at : - * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html - * - LZ4 source repository : http://code.google.com/p/lz4/ + * - LZ4 homepage : http://www.lz4.org + * - LZ4 source repository : https://github.com/lz4/lz4 * - * Changed for kernel use by: - * Chanho Min + * Changed for kernel usage by: + * Sven Schmidt <4sschmid@informatik.uni-hamburg.de> */ +/*-************************************ + * Dependencies + **************************************/ +#include +#include "lz4defs.h" #include #include -#include #include -#include "lz4defs.h" -/* - * LZ4_compressCtx : - * ----------------- - * Compress 'isize' bytes from 'source' into an output buffer 'dest' of - * maximum size 'maxOutputSize'. * If it cannot achieve it, compression - * will stop, and result of the function will be zero. - * return : the number of bytes written in buffer 'dest', or 0 if the - * compression fails - */ -static inline int lz4_compressctx(void *ctx, - const char *source, - char *dest, - int isize, - int maxoutputsize) +static const int LZ4_minLength = (MFLIMIT + 1); +static const int LZ4_64Klimit = ((64 * KB) + (MFLIMIT - 1)); + +/*-****************************** + * Compression functions + ********************************/ +static FORCE_INLINE U32 LZ4_hash4( + U32 sequence, + tableType_t const tableType) { - HTYPE *hashtable = (HTYPE *)ctx; - const u8 *ip = (u8 *)source; -#if LZ4_ARCH64 - const BYTE * const base = ip; + if (tableType == byU16) + return ((sequence * 2654435761U) + >> ((MINMATCH * 8) - (LZ4_HASHLOG + 1))); + else + return ((sequence * 2654435761U) + >> ((MINMATCH * 8) - LZ4_HASHLOG)); +} + +static FORCE_INLINE U32 LZ4_hash5( + U64 sequence, + tableType_t const tableType) +{ + const U32 hashLog = (tableType == byU16) + ? LZ4_HASHLOG + 1 + : LZ4_HASHLOG; + +#if LZ4_LITTLE_ENDIAN + static const U64 prime5bytes = 889523592379ULL; + + return (U32)(((sequence << 24) * prime5bytes) >> (64 - hashLog)); #else - const int base = 0; + static const U64 prime8bytes = 11400714785074694791ULL; + + return (U32)(((sequence >> 24) * prime8bytes) >> (64 - hashLog)); #endif - const u8 *anchor = ip; - const u8 *const iend = ip + isize; - const u8 *const mflimit = iend - MFLIMIT; - #define MATCHLIMIT (iend - LASTLITERALS) - - u8 *op = (u8 *) dest; - u8 *const oend = op + maxoutputsize; - int length; - const int skipstrength = SKIPSTRENGTH; - u32 forwardh; - int lastrun; - - /* Init */ - if (isize < MINLENGTH) - goto _last_literals; +} + +static FORCE_INLINE U32 LZ4_hashPosition( + const void *p, + tableType_t const tableType) +{ +#if LZ4_ARCH64 + if (tableType == byU32) + return LZ4_hash5(LZ4_read_ARCH(p), tableType); +#endif + + return LZ4_hash4(LZ4_read32(p), tableType); +} + +static void LZ4_putPositionOnHash( + const BYTE *p, + U32 h, + void *tableBase, + tableType_t const tableType, + const BYTE *srcBase) +{ + switch (tableType) { + case byPtr: + { + const BYTE **hashTable = (const BYTE **)tableBase; + + hashTable[h] = p; + return; + } + case byU32: + { + U32 *hashTable = (U32 *) tableBase; + + hashTable[h] = (U32)(p - srcBase); + return; + } + case byU16: + { + U16 *hashTable = (U16 *) tableBase; + + hashTable[h] = (U16)(p - srcBase); + return; + } + } +} + +static FORCE_INLINE void LZ4_putPosition( + const BYTE *p, + void *tableBase, + tableType_t tableType, + const BYTE *srcBase) +{ + U32 const h = LZ4_hashPosition(p, tableType); + + LZ4_putPositionOnHash(p, h, tableBase, tableType, srcBase); +} + +static const BYTE *LZ4_getPositionOnHash( + U32 h, + void *tableBase, + tableType_t tableType, + const BYTE *srcBase) +{ + if (tableType == byPtr) { + const BYTE **hashTable = (const BYTE **) tableBase; + + return hashTable[h]; + } + + if (tableType == byU32) { + const U32 * const hashTable = (U32 *) tableBase; + + return hashTable[h] + srcBase; + } + + { + /* default, to ensure a return */ + const U16 * const hashTable = (U16 *) tableBase; + + return hashTable[h] + srcBase; + } +} + +static FORCE_INLINE const BYTE *LZ4_getPosition( + const BYTE *p, + void *tableBase, + tableType_t tableType, + const BYTE *srcBase) +{ + U32 const h = LZ4_hashPosition(p, tableType); + + return LZ4_getPositionOnHash(h, tableBase, tableType, srcBase); +} + - memset((void *)hashtable, 0, LZ4_MEM_COMPRESS); +/* + * LZ4_compress_generic() : + * inlined, to ensure branches are decided at compilation time + */ +static FORCE_INLINE int LZ4_compress_generic( + LZ4_stream_t_internal * const dictPtr, + const char * const source, + char * const dest, + const int inputSize, + const int maxOutputSize, + const limitedOutput_directive outputLimited, + const tableType_t tableType, + const dict_directive dict, + const dictIssue_directive dictIssue, + const U32 acceleration) +{ + const BYTE *ip = (const BYTE *) source; + const BYTE *base; + const BYTE *lowLimit; + const BYTE * const lowRefLimit = ip - dictPtr->dictSize; + const BYTE * const dictionary = dictPtr->dictionary; + const BYTE * const dictEnd = dictionary + dictPtr->dictSize; + const size_t dictDelta = dictEnd - (const BYTE *)source; + const BYTE *anchor = (const BYTE *) source; + const BYTE * const iend = ip + inputSize; + const BYTE * const mflimit = iend - MFLIMIT; + const BYTE * const matchlimit = iend - LASTLITERALS; + + BYTE *op = (BYTE *) dest; + BYTE * const olimit = op + maxOutputSize; + + U32 forwardH; + size_t refDelta = 0; + + /* Init conditions */ + if ((U32)inputSize > (U32)LZ4_MAX_INPUT_SIZE) { + /* Unsupported inputSize, too large (or negative) */ + return 0; + } + + switch (dict) { + case noDict: + default: + base = (const BYTE *)source; + lowLimit = (const BYTE *)source; + break; + case withPrefix64k: + base = (const BYTE *)source - dictPtr->currentOffset; + lowLimit = (const BYTE *)source - dictPtr->dictSize; + break; + case usingExtDict: + base = (const BYTE *)source - dictPtr->currentOffset; + lowLimit = (const BYTE *)source; + break; + } + + if ((tableType == byU16) + && (inputSize >= LZ4_64Klimit)) { + /* Size too large (not within 64K limit) */ + return 0; + } + + if (inputSize < LZ4_minLength) { + /* Input too small, no compression (all literals) */ + goto _last_literals; + } /* First Byte */ - hashtable[LZ4_HASH_VALUE(ip)] = ip - base; + LZ4_putPosition(ip, dictPtr->hashTable, tableType, base); ip++; - forwardh = LZ4_HASH_VALUE(ip); + forwardH = LZ4_hashPosition(ip, tableType); /* Main Loop */ - for (;;) { - int findmatchattempts = (1U << skipstrength) + 3; - const u8 *forwardip = ip; - const u8 *ref; - u8 *token; + for ( ; ; ) { + const BYTE *match; + BYTE *token; /* Find a match */ - do { - u32 h = forwardh; - int step = findmatchattempts++ >> skipstrength; - ip = forwardip; - forwardip = ip + step; - - if (unlikely(forwardip > mflimit)) - goto _last_literals; - - forwardh = LZ4_HASH_VALUE(forwardip); - ref = base + hashtable[h]; - hashtable[h] = ip - base; - } while ((ref < ip - MAX_DISTANCE) || (A32(ref) != A32(ip))); + { + const BYTE *forwardIp = ip; + unsigned int step = 1; + unsigned int searchMatchNb = acceleration << LZ4_SKIPTRIGGER; + + do { + U32 const h = forwardH; + + ip = forwardIp; + forwardIp += step; + step = (searchMatchNb++ >> LZ4_SKIPTRIGGER); + + if (unlikely(forwardIp > mflimit)) + goto _last_literals; + + match = LZ4_getPositionOnHash(h, + dictPtr->hashTable, + tableType, base); + + if (dict == usingExtDict) { + if (match < (const BYTE *)source) { + refDelta = dictDelta; + lowLimit = dictionary; + } else { + refDelta = 0; + lowLimit = (const BYTE *)source; + } } + + forwardH = LZ4_hashPosition(forwardIp, + tableType); + + LZ4_putPositionOnHash(ip, h, dictPtr->hashTable, + tableType, base); + } while (((dictIssue == dictSmall) + ? (match < lowRefLimit) + : 0) + || ((tableType == byU16) + ? 0 + : (match + MAX_DISTANCE < ip)) + || (LZ4_read32(match + refDelta) + != LZ4_read32(ip))); + } /* Catch up */ - while ((ip > anchor) && (ref > (u8 *)source) && - unlikely(ip[-1] == ref[-1])) { + while (((ip > anchor) & (match + refDelta > lowLimit)) + && (unlikely(ip[-1] == match[refDelta - 1]))) { ip--; - ref--; + match--; } - /* Encode Literal length */ - length = (int)(ip - anchor); - token = op++; - /* check output limit */ - if (unlikely(op + length + (2 + 1 + LASTLITERALS) + - (length >> 8) > oend)) - return 0; + /* Encode Literals */ + { + unsigned const int litLength = (unsigned int)(ip - anchor); - if (length >= (int)RUN_MASK) { - int len; - *token = (RUN_MASK << ML_BITS); - len = length - RUN_MASK; - for (; len > 254 ; len -= 255) - *op++ = 255; - *op++ = (u8)len; - } else - *token = (length << ML_BITS); + token = op++; + + if ((outputLimited) && + /* Check output buffer overflow */ + (unlikely(op + litLength + + (2 + 1 + LASTLITERALS) + + (litLength / 255) > olimit))) + return 0; + + if (litLength >= RUN_MASK) { + int len = (int)litLength - RUN_MASK; + + *token = (RUN_MASK << ML_BITS); + + for (; len >= 255; len -= 255) + *op++ = 255; + *op++ = (BYTE)len; + } else + *token = (BYTE)(litLength << ML_BITS); + + /* Copy Literals */ + LZ4_wildCopy(op, anchor, op + litLength); + op += litLength; + } - /* Copy Literals */ - LZ4_BLINDCOPY(anchor, op, length); _next_match: /* Encode Offset */ - LZ4_WRITE_LITTLEENDIAN_16(op, (u16)(ip - ref)); + LZ4_writeLE16(op, (U16)(ip - match)); + op += 2; - /* Start Counting */ - ip += MINMATCH; - /* MinMatch verified */ - ref += MINMATCH; - anchor = ip; - while (likely(ip < MATCHLIMIT - (STEPSIZE - 1))) { - #if LZ4_ARCH64 - u64 diff = A64(ref) ^ A64(ip); - #else - u32 diff = A32(ref) ^ A32(ip); - #endif - if (!diff) { - ip += STEPSIZE; - ref += STEPSIZE; - continue; - } - ip += LZ4_NBCOMMONBYTES(diff); - goto _endcount; - } - #if LZ4_ARCH64 - if ((ip < (MATCHLIMIT - 3)) && (A32(ref) == A32(ip))) { - ip += 4; - ref += 4; - } - #endif - if ((ip < (MATCHLIMIT - 1)) && (A16(ref) == A16(ip))) { - ip += 2; - ref += 2; - } - if ((ip < MATCHLIMIT) && (*ref == *ip)) - ip++; -_endcount: /* Encode MatchLength */ - length = (int)(ip - anchor); - /* Check output limit */ - if (unlikely(op + (1 + LASTLITERALS) + (length >> 8) > oend)) - return 0; - if (length >= (int)ML_MASK) { - *token += ML_MASK; - length -= ML_MASK; - for (; length > 509 ; length -= 510) { - *op++ = 255; - *op++ = 255; - } - if (length > 254) { - length -= 255; - *op++ = 255; + { + unsigned int matchCode; + + if ((dict == usingExtDict) + && (lowLimit == dictionary)) { + const BYTE *limit; + + match += refDelta; + limit = ip + (dictEnd - match); + + if (limit > matchlimit) + limit = matchlimit; + + matchCode = LZ4_count(ip + MINMATCH, + match + MINMATCH, limit); + + ip += MINMATCH + matchCode; + + if (ip == limit) { + unsigned const int more = LZ4_count(ip, + (const BYTE *)source, + matchlimit); + + matchCode += more; + ip += more; + } + } else { + matchCode = LZ4_count(ip + MINMATCH, + match + MINMATCH, matchlimit); + ip += MINMATCH + matchCode; } - *op++ = (u8)length; - } else - *token += length; + + if (outputLimited && + /* Check output buffer overflow */ + (unlikely(op + + (1 + LASTLITERALS) + + (matchCode >> 8) > olimit))) + return 0; + + if (matchCode >= ML_MASK) { + *token += ML_MASK; + matchCode -= ML_MASK; + LZ4_write32(op, 0xFFFFFFFF); + + while (matchCode >= 4 * 255) { + op += 4; + LZ4_write32(op, 0xFFFFFFFF); + matchCode -= 4 * 255; + } + + op += matchCode / 255; + *op++ = (BYTE)(matchCode % 255); + } else + *token += (BYTE)(matchCode); + } + + anchor = ip; /* Test end of chunk */ - if (ip > mflimit) { - anchor = ip; + if (ip > mflimit) break; - } /* Fill table */ - hashtable[LZ4_HASH_VALUE(ip-2)] = ip - 2 - base; + LZ4_putPosition(ip - 2, dictPtr->hashTable, tableType, base); /* Test next position */ - ref = base + hashtable[LZ4_HASH_VALUE(ip)]; - hashtable[LZ4_HASH_VALUE(ip)] = ip - base; - if ((ref > ip - (MAX_DISTANCE + 1)) && (A32(ref) == A32(ip))) { + match = LZ4_getPosition(ip, dictPtr->hashTable, + tableType, base); + + if (dict == usingExtDict) { + if (match < (const BYTE *)source) { + refDelta = dictDelta; + lowLimit = dictionary; + } else { + refDelta = 0; + lowLimit = (const BYTE *)source; + } + } + + LZ4_putPosition(ip, dictPtr->hashTable, tableType, base); + + if (((dictIssue == dictSmall) ? (match >= lowRefLimit) : 1) + && (match + MAX_DISTANCE >= ip) + && (LZ4_read32(match + refDelta) == LZ4_read32(ip))) { token = op++; *token = 0; goto _next_match; } /* Prepare next loop */ - anchor = ip++; - forwardh = LZ4_HASH_VALUE(ip); + forwardH = LZ4_hashPosition(++ip, tableType); } _last_literals: /* Encode Last Literals */ - lastrun = (int)(iend - anchor); - if (((char *)op - dest) + lastrun + 1 - + ((lastrun + 255 - RUN_MASK) / 255) > (u32)maxoutputsize) - return 0; + { + size_t const lastRun = (size_t)(iend - anchor); - if (lastrun >= (int)RUN_MASK) { - *op++ = (RUN_MASK << ML_BITS); - lastrun -= RUN_MASK; - for (; lastrun > 254 ; lastrun -= 255) - *op++ = 255; - *op++ = (u8)lastrun; - } else - *op++ = (lastrun << ML_BITS); - memcpy(op, anchor, iend - anchor); - op += iend - anchor; + if ((outputLimited) && + /* Check output buffer overflow */ + ((op - (BYTE *)dest) + lastRun + 1 + + ((lastRun + 255 - RUN_MASK) / 255) > (U32)maxOutputSize)) + return 0; + + if (lastRun >= RUN_MASK) { + size_t accumulator = lastRun - RUN_MASK; + *op++ = RUN_MASK << ML_BITS; + for (; accumulator >= 255; accumulator -= 255) + *op++ = 255; + *op++ = (BYTE) accumulator; + } else { + *op++ = (BYTE)(lastRun << ML_BITS); + } + + memcpy(op, anchor, lastRun); + + op += lastRun; + } /* End */ - return (int)(((char *)op) - dest); + return (int) (((char *)op) - dest); } -static inline int lz4_compress64kctx(void *ctx, - const char *source, - char *dest, - int isize, - int maxoutputsize) +static int LZ4_compress_fast_extState( + void *state, + const char *source, + char *dest, + int inputSize, + int maxOutputSize, + int acceleration) { - u16 *hashtable = (u16 *)ctx; - const u8 *ip = (u8 *) source; - const u8 *anchor = ip; - const u8 *const base = ip; - const u8 *const iend = ip + isize; - const u8 *const mflimit = iend - MFLIMIT; - #define MATCHLIMIT (iend - LASTLITERALS) - - u8 *op = (u8 *) dest; - u8 *const oend = op + maxoutputsize; - int len, length; - const int skipstrength = SKIPSTRENGTH; - u32 forwardh; - int lastrun; - - /* Init */ - if (isize < MINLENGTH) - goto _last_literals; + LZ4_stream_t_internal *ctx = &((LZ4_stream_t *)state)->internal_donotuse; +#if LZ4_ARCH64 + const tableType_t tableType = byU32; +#else + const tableType_t tableType = byPtr; +#endif - memset((void *)hashtable, 0, LZ4_MEM_COMPRESS); + LZ4_resetStream((LZ4_stream_t *)state); + + if (acceleration < 1) + acceleration = LZ4_ACCELERATION_DEFAULT; + + if (maxOutputSize >= LZ4_COMPRESSBOUND(inputSize)) { + if (inputSize < LZ4_64Klimit) + return LZ4_compress_generic(ctx, source, + dest, inputSize, 0, + noLimit, byU16, noDict, + noDictIssue, acceleration); + else + return LZ4_compress_generic(ctx, source, + dest, inputSize, 0, + noLimit, tableType, noDict, + noDictIssue, acceleration); + } else { + if (inputSize < LZ4_64Klimit) + return LZ4_compress_generic(ctx, source, + dest, inputSize, + maxOutputSize, limitedOutput, byU16, noDict, + noDictIssue, acceleration); + else + return LZ4_compress_generic(ctx, source, + dest, inputSize, + maxOutputSize, limitedOutput, tableType, noDict, + noDictIssue, acceleration); + } +} + +int LZ4_compress_fast(const char *source, char *dest, int inputSize, + int maxOutputSize, int acceleration, void *wrkmem) +{ + return LZ4_compress_fast_extState(wrkmem, source, dest, inputSize, + maxOutputSize, acceleration); +} +EXPORT_SYMBOL(LZ4_compress_fast); + +int LZ4_compress_default(const char *source, char *dest, int inputSize, + int maxOutputSize, void *wrkmem) +{ + return LZ4_compress_fast(source, dest, inputSize, + maxOutputSize, LZ4_ACCELERATION_DEFAULT, wrkmem); +} +EXPORT_SYMBOL(LZ4_compress_default); + +/*-****************************** + * *_destSize() variant + ********************************/ +static int LZ4_compress_destSize_generic( + LZ4_stream_t_internal * const ctx, + const char * const src, + char * const dst, + int * const srcSizePtr, + const int targetDstSize, + const tableType_t tableType) +{ + const BYTE *ip = (const BYTE *) src; + const BYTE *base = (const BYTE *) src; + const BYTE *lowLimit = (const BYTE *) src; + const BYTE *anchor = ip; + const BYTE * const iend = ip + *srcSizePtr; + const BYTE * const mflimit = iend - MFLIMIT; + const BYTE * const matchlimit = iend - LASTLITERALS; + + BYTE *op = (BYTE *) dst; + BYTE * const oend = op + targetDstSize; + BYTE * const oMaxLit = op + targetDstSize - 2 /* offset */ + - 8 /* because 8 + MINMATCH == MFLIMIT */ - 1 /* token */; + BYTE * const oMaxMatch = op + targetDstSize + - (LASTLITERALS + 1 /* token */); + BYTE * const oMaxSeq = oMaxLit - 1 /* token */; + + U32 forwardH; + + /* Init conditions */ + /* Impossible to store anything */ + if (targetDstSize < 1) + return 0; + /* Unsupported input size, too large (or negative) */ + if ((U32)*srcSizePtr > (U32)LZ4_MAX_INPUT_SIZE) + return 0; + /* Size too large (not within 64K limit) */ + if ((tableType == byU16) && (*srcSizePtr >= LZ4_64Klimit)) + return 0; + /* Input too small, no compression (all literals) */ + if (*srcSizePtr < LZ4_minLength) + goto _last_literals; /* First Byte */ - ip++; - forwardh = LZ4_HASH64K_VALUE(ip); + *srcSizePtr = 0; + LZ4_putPosition(ip, ctx->hashTable, tableType, base); + ip++; forwardH = LZ4_hashPosition(ip, tableType); /* Main Loop */ - for (;;) { - int findmatchattempts = (1U << skipstrength) + 3; - const u8 *forwardip = ip; - const u8 *ref; - u8 *token; + for ( ; ; ) { + const BYTE *match; + BYTE *token; /* Find a match */ - do { - u32 h = forwardh; - int step = findmatchattempts++ >> skipstrength; - ip = forwardip; - forwardip = ip + step; - - if (forwardip > mflimit) - goto _last_literals; - - forwardh = LZ4_HASH64K_VALUE(forwardip); - ref = base + hashtable[h]; - hashtable[h] = (u16)(ip - base); - } while (A32(ref) != A32(ip)); + { + const BYTE *forwardIp = ip; + unsigned int step = 1; + unsigned int searchMatchNb = 1 << LZ4_SKIPTRIGGER; + + do { + U32 h = forwardH; + + ip = forwardIp; + forwardIp += step; + step = (searchMatchNb++ >> LZ4_SKIPTRIGGER); + + if (unlikely(forwardIp > mflimit)) + goto _last_literals; + + match = LZ4_getPositionOnHash(h, ctx->hashTable, + tableType, base); + forwardH = LZ4_hashPosition(forwardIp, + tableType); + LZ4_putPositionOnHash(ip, h, + ctx->hashTable, tableType, + base); + + } while (((tableType == byU16) + ? 0 + : (match + MAX_DISTANCE < ip)) + || (LZ4_read32(match) != LZ4_read32(ip))); + } /* Catch up */ - while ((ip > anchor) && (ref > (u8 *)source) - && (ip[-1] == ref[-1])) { + while ((ip > anchor) + && (match > lowLimit) + && (unlikely(ip[-1] == match[-1]))) { ip--; - ref--; + match--; } /* Encode Literal length */ - length = (int)(ip - anchor); - token = op++; - /* Check output limit */ - if (unlikely(op + length + (2 + 1 + LASTLITERALS) - + (length >> 8) > oend)) - return 0; - if (length >= (int)RUN_MASK) { - *token = (RUN_MASK << ML_BITS); - len = length - RUN_MASK; - for (; len > 254 ; len -= 255) - *op++ = 255; - *op++ = (u8)len; - } else - *token = (length << ML_BITS); + { + unsigned int litLength = (unsigned int)(ip - anchor); - /* Copy Literals */ - LZ4_BLINDCOPY(anchor, op, length); + token = op++; + if (op + ((litLength + 240) / 255) + + litLength > oMaxLit) { + /* Not enough space for a last match */ + op--; + goto _last_literals; + } + if (litLength >= RUN_MASK) { + unsigned int len = litLength - RUN_MASK; + *token = (RUN_MASK<= 255; len -= 255) + *op++ = 255; + *op++ = (BYTE)len; + } else + *token = (BYTE)(litLength << ML_BITS); + + /* Copy Literals */ + LZ4_wildCopy(op, anchor, op + litLength); + op += litLength; + } _next_match: /* Encode Offset */ - LZ4_WRITE_LITTLEENDIAN_16(op, (u16)(ip - ref)); + LZ4_writeLE16(op, (U16)(ip - match)); op += 2; - /* Start Counting */ - ip += MINMATCH; - /* MinMatch verified */ - ref += MINMATCH; - anchor = ip; + /* Encode MatchLength */ + { + size_t matchLength = LZ4_count(ip + MINMATCH, + match + MINMATCH, matchlimit); - while (ip < MATCHLIMIT - (STEPSIZE - 1)) { - #if LZ4_ARCH64 - u64 diff = A64(ref) ^ A64(ip); - #else - u32 diff = A32(ref) ^ A32(ip); - #endif - - if (!diff) { - ip += STEPSIZE; - ref += STEPSIZE; - continue; + if (op + ((matchLength + 240)/255) > oMaxMatch) { + /* Match description too long : reduce it */ + matchLength = (15 - 1) + (oMaxMatch - op) * 255; } - ip += LZ4_NBCOMMONBYTES(diff); - goto _endcount; - } - #if LZ4_ARCH64 - if ((ip < (MATCHLIMIT - 3)) && (A32(ref) == A32(ip))) { - ip += 4; - ref += 4; - } - #endif - if ((ip < (MATCHLIMIT - 1)) && (A16(ref) == A16(ip))) { - ip += 2; - ref += 2; + ip += MINMATCH + matchLength; + + if (matchLength >= ML_MASK) { + *token += ML_MASK; + matchLength -= ML_MASK; + while (matchLength >= 255) { + matchLength -= 255; + *op++ = 255; + } + *op++ = (BYTE)matchLength; + } else + *token += (BYTE)(matchLength); } - if ((ip < MATCHLIMIT) && (*ref == *ip)) - ip++; -_endcount: - /* Encode MatchLength */ - len = (int)(ip - anchor); - /* Check output limit */ - if (unlikely(op + (1 + LASTLITERALS) + (len >> 8) > oend)) - return 0; - if (len >= (int)ML_MASK) { - *token += ML_MASK; - len -= ML_MASK; - for (; len > 509 ; len -= 510) { - *op++ = 255; - *op++ = 255; - } - if (len > 254) { - len -= 255; - *op++ = 255; - } - *op++ = (u8)len; - } else - *token += len; + anchor = ip; - /* Test end of chunk */ - if (ip > mflimit) { - anchor = ip; + /* Test end of block */ + if (ip > mflimit) + break; + if (op > oMaxSeq) break; - } /* Fill table */ - hashtable[LZ4_HASH64K_VALUE(ip-2)] = (u16)(ip - 2 - base); + LZ4_putPosition(ip - 2, ctx->hashTable, tableType, base); /* Test next position */ - ref = base + hashtable[LZ4_HASH64K_VALUE(ip)]; - hashtable[LZ4_HASH64K_VALUE(ip)] = (u16)(ip - base); - if (A32(ref) == A32(ip)) { - token = op++; - *token = 0; + match = LZ4_getPosition(ip, ctx->hashTable, tableType, base); + LZ4_putPosition(ip, ctx->hashTable, tableType, base); + + if ((match + MAX_DISTANCE >= ip) + && (LZ4_read32(match) == LZ4_read32(ip))) { + token = op++; *token = 0; goto _next_match; } /* Prepare next loop */ - anchor = ip++; - forwardh = LZ4_HASH64K_VALUE(ip); + forwardH = LZ4_hashPosition(++ip, tableType); } _last_literals: /* Encode Last Literals */ - lastrun = (int)(iend - anchor); - if (op + lastrun + 1 + (lastrun - RUN_MASK + 255) / 255 > oend) - return 0; - if (lastrun >= (int)RUN_MASK) { - *op++ = (RUN_MASK << ML_BITS); - lastrun -= RUN_MASK; - for (; lastrun > 254 ; lastrun -= 255) - *op++ = 255; - *op++ = (u8)lastrun; - } else - *op++ = (lastrun << ML_BITS); - memcpy(op, anchor, iend - anchor); - op += iend - anchor; + { + size_t lastRunSize = (size_t)(iend - anchor); + + if (op + 1 /* token */ + + ((lastRunSize + 240) / 255) /* litLength */ + + lastRunSize /* literals */ > oend) { + /* adapt lastRunSize to fill 'dst' */ + lastRunSize = (oend - op) - 1; + lastRunSize -= (lastRunSize + 240) / 255; + } + ip = anchor + lastRunSize; + + if (lastRunSize >= RUN_MASK) { + size_t accumulator = lastRunSize - RUN_MASK; + + *op++ = RUN_MASK << ML_BITS; + for (; accumulator >= 255; accumulator -= 255) + *op++ = 255; + *op++ = (BYTE) accumulator; + } else { + *op++ = (BYTE)(lastRunSize<= LZ4_COMPRESSBOUND(*srcSizePtr)) { + /* compression success is guaranteed */ + return LZ4_compress_fast_extState( + state, src, dst, *srcSizePtr, + targetDstSize, 1); + } else { + if (*srcSizePtr < LZ4_64Klimit) + return LZ4_compress_destSize_generic( + &state->internal_donotuse, + src, dst, srcSizePtr, + targetDstSize, byU16); + else + return LZ4_compress_destSize_generic( + &state->internal_donotuse, + src, dst, srcSizePtr, + targetDstSize, tableType); + } +} + + +int LZ4_compress_destSize( + const char *src, + char *dst, + int *srcSizePtr, + int targetDstSize, + void *wrkmem) +{ + return LZ4_compress_destSize_extState(wrkmem, src, dst, srcSizePtr, + targetDstSize); +} +EXPORT_SYMBOL(LZ4_compress_destSize); + +/*-****************************** + * Streaming functions + ********************************/ +void LZ4_resetStream(LZ4_stream_t *LZ4_stream) +{ + memset(LZ4_stream, 0, sizeof(LZ4_stream_t)); +} + +int LZ4_loadDict(LZ4_stream_t *LZ4_dict, + const char *dictionary, int dictSize) +{ + LZ4_stream_t_internal *dict = &LZ4_dict->internal_donotuse; + const BYTE *p = (const BYTE *)dictionary; + const BYTE * const dictEnd = p + dictSize; + const BYTE *base; + + if ((dict->initCheck) + || (dict->currentOffset > 1 * GB)) { + /* Uninitialized structure, or reuse overflow */ + LZ4_resetStream(LZ4_dict); + } + + if (dictSize < (int)HASH_UNIT) { + dict->dictionary = NULL; + dict->dictSize = 0; + return 0; + } + + if ((dictEnd - p) > 64 * KB) + p = dictEnd - 64 * KB; + dict->currentOffset += 64 * KB; + base = p - dict->currentOffset; + dict->dictionary = p; + dict->dictSize = (U32)(dictEnd - p); + dict->currentOffset += dict->dictSize; + + while (p <= dictEnd - HASH_UNIT) { + LZ4_putPosition(p, dict->hashTable, byU32, base); + p += 3; + } + + return dict->dictSize; +} +EXPORT_SYMBOL(LZ4_loadDict); + +static void LZ4_renormDictT(LZ4_stream_t_internal *LZ4_dict, + const BYTE *src) +{ + if ((LZ4_dict->currentOffset > 0x80000000) || + ((uptrval)LZ4_dict->currentOffset > (uptrval)src)) { + /* address space overflow */ + /* rescale hash table */ + U32 const delta = LZ4_dict->currentOffset - 64 * KB; + const BYTE *dictEnd = LZ4_dict->dictionary + LZ4_dict->dictSize; + int i; + + for (i = 0; i < LZ4_HASH_SIZE_U32; i++) { + if (LZ4_dict->hashTable[i] < delta) + LZ4_dict->hashTable[i] = 0; + else + LZ4_dict->hashTable[i] -= delta; + } + LZ4_dict->currentOffset = 64 * KB; + if (LZ4_dict->dictSize > 64 * KB) + LZ4_dict->dictSize = 64 * KB; + LZ4_dict->dictionary = dictEnd - LZ4_dict->dictSize; + } +} + +int LZ4_saveDict(LZ4_stream_t *LZ4_dict, char *safeBuffer, int dictSize) +{ + LZ4_stream_t_internal * const dict = &LZ4_dict->internal_donotuse; + const BYTE * const previousDictEnd = dict->dictionary + dict->dictSize; + + if ((U32)dictSize > 64 * KB) { + /* useless to define a dictionary > 64 * KB */ + dictSize = 64 * KB; + } + if ((U32)dictSize > dict->dictSize) + dictSize = dict->dictSize; + + memmove(safeBuffer, previousDictEnd - dictSize, dictSize); + + dict->dictionary = (const BYTE *)safeBuffer; + dict->dictSize = (U32)dictSize; + + return dictSize; +} +EXPORT_SYMBOL(LZ4_saveDict); + +int LZ4_compress_fast_continue(LZ4_stream_t *LZ4_stream, const char *source, + char *dest, int inputSize, int maxOutputSize, int acceleration) +{ + LZ4_stream_t_internal *streamPtr = &LZ4_stream->internal_donotuse; + const BYTE * const dictEnd = streamPtr->dictionary + + streamPtr->dictSize; + + const BYTE *smallest = (const BYTE *) source; + + if (streamPtr->initCheck) { + /* Uninitialized structure detected */ + return 0; + } + + if ((streamPtr->dictSize > 0) && (smallest > dictEnd)) + smallest = dictEnd; + + LZ4_renormDictT(streamPtr, smallest); + + if (acceleration < 1) + acceleration = LZ4_ACCELERATION_DEFAULT; - if (out_len < 0) - goto exit; + /* Check overlapping input/dictionary space */ + { + const BYTE *sourceEnd = (const BYTE *) source + inputSize; + + if ((sourceEnd > streamPtr->dictionary) + && (sourceEnd < dictEnd)) { + streamPtr->dictSize = (U32)(dictEnd - sourceEnd); + if (streamPtr->dictSize > 64 * KB) + streamPtr->dictSize = 64 * KB; + if (streamPtr->dictSize < 4) + streamPtr->dictSize = 0; + streamPtr->dictionary = dictEnd - streamPtr->dictSize; + } + } - *dst_len = out_len; + /* prefix mode : source data follows dictionary */ + if (dictEnd == (const BYTE *)source) { + int result; + + if ((streamPtr->dictSize < 64 * KB) && + (streamPtr->dictSize < streamPtr->currentOffset)) { + result = LZ4_compress_generic( + streamPtr, source, dest, inputSize, + maxOutputSize, limitedOutput, byU32, + withPrefix64k, dictSmall, acceleration); + } else { + result = LZ4_compress_generic( + streamPtr, source, dest, inputSize, + maxOutputSize, limitedOutput, byU32, + withPrefix64k, noDictIssue, acceleration); + } + streamPtr->dictSize += (U32)inputSize; + streamPtr->currentOffset += (U32)inputSize; + return result; + } - return 0; -exit: - return ret; + /* external dictionary mode */ + { + int result; + + if ((streamPtr->dictSize < 64 * KB) && + (streamPtr->dictSize < streamPtr->currentOffset)) { + result = LZ4_compress_generic( + streamPtr, source, dest, inputSize, + maxOutputSize, limitedOutput, byU32, + usingExtDict, dictSmall, acceleration); + } else { + result = LZ4_compress_generic( + streamPtr, source, dest, inputSize, + maxOutputSize, limitedOutput, byU32, + usingExtDict, noDictIssue, acceleration); + } + streamPtr->dictionary = (const BYTE *)source; + streamPtr->dictSize = (U32)inputSize; + streamPtr->currentOffset += (U32)inputSize; + return result; + } +} +EXPORT_SYMBOL(LZ4_compress_fast_continue); + +/*-****************************** + * For backwards compatibility + ********************************/ +int lz4_compress(const unsigned char *src, size_t src_len, unsigned char *dst, + size_t *dst_len, void *wrkmem) { + *dst_len = LZ4_compress_default(src, dst, src_len, + *dst_len, wrkmem); + + /* + * Prior lz4_compress will return -1 in case of error + * and 0 on success + * while new LZ4_compress_fast/default + * returns 0 in case of error + * and the output length on success + */ + if (!*dst_len) + return -1; + else + return 0; } EXPORT_SYMBOL(lz4_compress); diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c index 6d940c72b5fc..b5e00a19a7b4 100644 --- a/lib/lz4/lz4_decompress.c +++ b/lib/lz4/lz4_decompress.c @@ -1,25 +1,16 @@ /* - * LZ4 Decompressor for Linux kernel - * - * Copyright (C) 2013, LG Electronics, Kyungsik Lee - * - * Based on LZ4 implementation by Yann Collet. - * * LZ4 - Fast LZ compression algorithm - * Copyright (C) 2011-2012, Yann Collet. - * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) - * + * Copyright (C) 2011 - 2016, Yann Collet. + * BSD 2 - Clause License (http://www.opensource.org/licenses/bsd - license.php) * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. - * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -31,313 +22,529 @@ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * You can contact the author at : + * - LZ4 homepage : http://www.lz4.org + * - LZ4 source repository : https://github.com/lz4/lz4 * - * You can contact the author at : - * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html - * - LZ4 source repository : http://code.google.com/p/lz4/ + * Changed for kernel usage by: + * Sven Schmidt <4sschmid@informatik.uni-hamburg.de> */ -#ifndef STATIC +/*-************************************ + * Dependencies + **************************************/ +#include +#include "lz4defs.h" +#include #include #include -#endif -#include - #include -#include "lz4defs.h" - -static const int dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0}; -#if LZ4_ARCH64 -static const int dec64table[] = {0, 0, 0, -1, 0, 1, 2, 3}; -#endif - -static int lz4_uncompress(const char *source, char *dest, int osize) +/*-***************************** + * Decompression functions + *******************************/ +/* LZ4_decompress_generic() : + * This generic decompression function cover all use cases. + * It shall be instantiated several times, using different sets of directives + * Note that it is important this generic function is really inlined, + * in order to remove useless branches during compilation optimization. + */ +static FORCE_INLINE int LZ4_decompress_generic( + const char * const source, + char * const dest, + int inputSize, + /* + * If endOnInput == endOnInputSize, + * this value is the max size of Output Buffer. + */ + int outputSize, + /* endOnOutputSize, endOnInputSize */ + int endOnInput, + /* full, partial */ + int partialDecoding, + /* only used if partialDecoding == partial */ + int targetOutputSize, + /* noDict, withPrefix64k, usingExtDict */ + int dict, + /* == dest when no prefix */ + const BYTE * const lowPrefix, + /* only if dict == usingExtDict */ + const BYTE * const dictStart, + /* note : = 0 if noDict */ + const size_t dictSize + ) { + /* Local Variables */ const BYTE *ip = (const BYTE *) source; - const BYTE *ref; + const BYTE * const iend = ip + inputSize; + BYTE *op = (BYTE *) dest; - BYTE * const oend = op + osize; + BYTE * const oend = op + outputSize; BYTE *cpy; - unsigned token; - size_t length; + BYTE *oexit = op + targetOutputSize; + const BYTE * const lowLimit = lowPrefix - dictSize; + + const BYTE * const dictEnd = (const BYTE *)dictStart + dictSize; + const unsigned int dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 }; + const int dec64table[] = { 0, 0, 0, -1, 0, 1, 2, 3 }; + const int safeDecode = (endOnInput == endOnInputSize); + const int checkOffset = ((safeDecode) && (dictSize < (int)(64 * KB))); + + /* Special cases */ + /* targetOutputSize too high => decode everything */ + if ((partialDecoding) && (oexit > oend - MFLIMIT)) + oexit = oend - MFLIMIT; + + /* Empty output buffer */ + if ((endOnInput) && (unlikely(outputSize == 0))) + return ((inputSize == 1) && (*ip == 0)) ? 0 : -1; + + if ((!endOnInput) && (unlikely(outputSize == 0))) + return (*ip == 0 ? 1 : -1); + + /* Main Loop : decode sequences */ while (1) { + size_t length; + const BYTE *match; + size_t offset; + + /* get literal length */ + unsigned int const token = *ip++; + + length = token>>ML_BITS; - /* get runlength */ - token = *ip++; - length = (token >> ML_BITS); if (length == RUN_MASK) { - size_t len; + unsigned int s; - len = *ip++; - for (; len == 255; length += 255) - len = *ip++; - if (unlikely(length > (size_t)(length + len))) + do { + s = *ip++; + length += s; + } while (likely(endOnInput + ? ip < iend - RUN_MASK + : 1) & (s == 255)); + + if ((safeDecode) + && unlikely( + (size_t)(op + length) < (size_t)(op))) { + /* overflow detection */ goto _output_error; - length += len; + } + if ((safeDecode) + && unlikely( + (size_t)(ip + length) < (size_t)(ip))) { + /* overflow detection */ + goto _output_error; + } } /* copy literals */ cpy = op + length; - if (unlikely(cpy > oend - COPYLENGTH)) { - /* - * Error: not enough place for another match - * (min 4) + 5 literals - */ - if (cpy != oend) - goto _output_error; + if (((endOnInput) && ((cpy > (partialDecoding ? oexit : oend - MFLIMIT)) + || (ip + length > iend - (2 + 1 + LASTLITERALS)))) + || ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH))) { + if (partialDecoding) { + if (cpy > oend) { + /* + * Error : + * write attempt beyond end of output buffer + */ + goto _output_error; + } + if ((endOnInput) + && (ip + length > iend)) { + /* + * Error : + * read attempt beyond + * end of input buffer + */ + goto _output_error; + } + } else { + if ((!endOnInput) + && (cpy != oend)) { + /* + * Error : + * block decoding must + * stop exactly there + */ + goto _output_error; + } + if ((endOnInput) + && ((ip + length != iend) + || (cpy > oend))) { + /* + * Error : + * input must be consumed + */ + goto _output_error; + } + } memcpy(op, ip, length); ip += length; - break; /* EOF */ + op += length; + /* Necessarily EOF, due to parsing restrictions */ + break; } - LZ4_WILDCOPY(ip, op, cpy); - ip -= (op - cpy); + + LZ4_wildCopy(op, ip, cpy); + ip += length; op = cpy; /* get offset */ - LZ4_READ_LITTLEENDIAN_16(ref, cpy, ip); + offset = LZ4_readLE16(ip); ip += 2; + match = op - offset; - /* Error: offset create reference outside destination buffer */ - if (unlikely(ref < (BYTE *const) dest)) + if ((checkOffset) && (unlikely(match < lowLimit))) { + /* Error : offset outside buffers */ goto _output_error; + } + + /* costs ~1%; silence an msan warning when offset == 0 */ + LZ4_write32(op, (U32)offset); /* get matchlength */ length = token & ML_MASK; if (length == ML_MASK) { - for (; *ip == 255; length += 255) - ip++; - if (unlikely(length > (size_t)(length + *ip))) + unsigned int s; + + do { + s = *ip++; + + if ((endOnInput) && (ip > iend - LASTLITERALS)) + goto _output_error; + + length += s; + } while (s == 255); + + if ((safeDecode) + && unlikely( + (size_t)(op + length) < (size_t)op)) { + /* overflow detection */ goto _output_error; - length += *ip++; + } } - /* copy repeated sequence */ - if (unlikely((op - ref) < STEPSIZE)) { -#if LZ4_ARCH64 - int dec64 = dec64table[op - ref]; -#else - const int dec64 = 0; -#endif - op[0] = ref[0]; - op[1] = ref[1]; - op[2] = ref[2]; - op[3] = ref[3]; - op += 4; - ref += 4; - ref -= dec32table[op-ref]; - PUT4(ref, op); - op += STEPSIZE - 4; - ref -= dec64; + length += MINMATCH; + + /* check external dictionary */ + if ((dict == usingExtDict) && (match < lowPrefix)) { + if (unlikely(op + length > oend - LASTLITERALS)) { + /* doesn't respect parsing restriction */ + goto _output_error; + } + + if (length <= (size_t)(lowPrefix - match)) { + /* + * match can be copied as a single segment + * from external dictionary + */ + memmove(op, dictEnd - (lowPrefix - match), + length); + op += length; + } else { + /* + * match encompass external + * dictionary and current block + */ + size_t const copySize = (size_t)(lowPrefix - match); + size_t const restSize = length - copySize; + + memcpy(op, dictEnd - copySize, copySize); + op += copySize; + + if (restSize > (size_t)(op - lowPrefix)) { + /* overlap copy */ + BYTE * const endOfMatch = op + restSize; + const BYTE *copyFrom = lowPrefix; + + while (op < endOfMatch) + *op++ = *copyFrom++; + } else { + memcpy(op, lowPrefix, restSize); + op += restSize; + } + } + + continue; + } + + /* copy match within block */ + cpy = op + length; + + if (unlikely(offset < 8)) { + const int dec64 = dec64table[offset]; + + op[0] = match[0]; + op[1] = match[1]; + op[2] = match[2]; + op[3] = match[3]; + match += dec32table[offset]; + memcpy(op + 4, match, 4); + match -= dec64; } else { - LZ4_COPYSTEP(ref, op); + LZ4_copy8(op, match); + match += 8; } - cpy = op + length - (STEPSIZE - 4); - if (cpy > (oend - COPYLENGTH)) { - /* Error: request to write beyond destination buffer */ - if (cpy > oend) - goto _output_error; -#if LZ4_ARCH64 - if ((ref + COPYLENGTH) > oend) -#else - if ((ref + COPYLENGTH) > oend || - (op + COPYLENGTH) > oend) -#endif + op += 8; + + if (unlikely(cpy > oend - 12)) { + BYTE * const oCopyLimit = oend - (WILDCOPYLENGTH - 1); + + if (cpy > oend - LASTLITERALS) { + /* + * Error : last LASTLITERALS bytes + * must be literals (uncompressed) + */ goto _output_error; - LZ4_SECURECOPY(ref, op, (oend - COPYLENGTH)); + } + + if (op < oCopyLimit) { + LZ4_wildCopy(op, match, oCopyLimit); + match += oCopyLimit - op; + op = oCopyLimit; + } + while (op < cpy) - *op++ = *ref++; - op = cpy; - /* - * Check EOF (should never happen, since last 5 bytes - * are supposed to be literals) - */ - if (op == oend) - goto _output_error; - continue; + *op++ = *match++; + } else { + LZ4_copy8(op, match); + + if (length > 16) + LZ4_wildCopy(op + 8, match + 8, cpy); } - LZ4_SECURECOPY(ref, op, cpy); + op = cpy; /* correction */ } + /* end of decoding */ - return (int) (((char *)ip) - source); + if (endOnInput) { + /* Nb of output bytes decoded */ + return (int) (((char *)op) - dest); + } else { + /* Nb of input bytes read */ + return (int) (((const char *)ip) - source); + } - /* write overflow error detected */ + /* Overflow error detected */ _output_error: return -1; } -static int lz4_uncompress_unknownoutputsize(const char *source, char *dest, - int isize, size_t maxoutputsize) +int LZ4_decompress_safe(const char *source, char *dest, + int compressedSize, int maxDecompressedSize) { - const BYTE *ip = (const BYTE *) source; - const BYTE *const iend = ip + isize; - const BYTE *ref; - + return LZ4_decompress_generic(source, dest, compressedSize, + maxDecompressedSize, endOnInputSize, full, 0, + noDict, (BYTE *)dest, NULL, 0); +} - BYTE *op = (BYTE *) dest; - BYTE * const oend = op + maxoutputsize; - BYTE *cpy; +int LZ4_decompress_safe_partial(const char *source, char *dest, + int compressedSize, int targetOutputSize, int maxDecompressedSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, + maxDecompressedSize, endOnInputSize, partial, + targetOutputSize, noDict, (BYTE *)dest, NULL, 0); +} - /* Main Loop */ - while (ip < iend) { +int LZ4_decompress_fast(const char *source, char *dest, int originalSize) +{ + return LZ4_decompress_generic(source, dest, 0, originalSize, + endOnOutputSize, full, 0, withPrefix64k, + (BYTE *)(dest - 64 * KB), NULL, 64 * KB); +} - unsigned token; - size_t length; +int LZ4_setStreamDecode(LZ4_streamDecode_t *LZ4_streamDecode, + const char *dictionary, int dictSize) +{ + LZ4_streamDecode_t_internal *lz4sd = (LZ4_streamDecode_t_internal *) LZ4_streamDecode; - /* get runlength */ - token = *ip++; - length = (token >> ML_BITS); - if (length == RUN_MASK) { - int s = 255; - while ((ip < iend) && (s == 255)) { - s = *ip++; - if (unlikely(length > (size_t)(length + s))) - goto _output_error; - length += s; - } - } - /* copy literals */ - cpy = op + length; - if ((cpy > oend - COPYLENGTH) || - (ip + length > iend - COPYLENGTH)) { - - if (cpy > oend) - goto _output_error;/* writes beyond buffer */ - - if (ip + length != iend) - goto _output_error;/* - * Error: LZ4 format requires - * to consume all input - * at this stage - */ - memcpy(op, ip, length); - op += length; - break;/* Necessarily EOF, due to parsing restrictions */ - } - LZ4_WILDCOPY(ip, op, cpy); - ip -= (op - cpy); - op = cpy; + lz4sd->prefixSize = (size_t) dictSize; + lz4sd->prefixEnd = (const BYTE *) dictionary + dictSize; + lz4sd->externalDict = NULL; + lz4sd->extDictSize = 0; + return 1; +} - /* get offset */ - LZ4_READ_LITTLEENDIAN_16(ref, cpy, ip); - ip += 2; - if (ref < (BYTE * const) dest) - goto _output_error; - /* - * Error : offset creates reference - * outside of destination buffer - */ +/* + * *_continue() : + * These decoding functions allow decompression of multiple blocks + * in "streaming" mode. + * Previously decoded blocks must still be available at the memory + * position where they were decoded. + * If it's not possible, save the relevant part of + * decoded data into a safe buffer, + * and indicate where it stands using LZ4_setStreamDecode() + */ +int LZ4_decompress_safe_continue(LZ4_streamDecode_t *LZ4_streamDecode, + const char *source, char *dest, int compressedSize, int maxOutputSize) +{ + LZ4_streamDecode_t_internal *lz4sd = &LZ4_streamDecode->internal_donotuse; + int result; + + if (lz4sd->prefixEnd == (BYTE *)dest) { + result = LZ4_decompress_generic(source, dest, + compressedSize, + maxOutputSize, + endOnInputSize, full, 0, + usingExtDict, lz4sd->prefixEnd - lz4sd->prefixSize, + lz4sd->externalDict, + lz4sd->extDictSize); + + if (result <= 0) + return result; + + lz4sd->prefixSize += result; + lz4sd->prefixEnd += result; + } else { + lz4sd->extDictSize = lz4sd->prefixSize; + lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize; + result = LZ4_decompress_generic(source, dest, + compressedSize, maxOutputSize, + endOnInputSize, full, 0, + usingExtDict, (BYTE *)dest, + lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) + return result; + lz4sd->prefixSize = result; + lz4sd->prefixEnd = (BYTE *)dest + result; + } - /* get matchlength */ - length = (token & ML_MASK); - if (length == ML_MASK) { - while (ip < iend) { - int s = *ip++; - if (unlikely(length > (size_t)(length + s))) - goto _output_error; - length += s; - if (s == 255) - continue; - break; - } - } + return result; +} - /* copy repeated sequence */ - if (unlikely((op - ref) < STEPSIZE)) { -#if LZ4_ARCH64 - int dec64 = dec64table[op - ref]; -#else - const int dec64 = 0; -#endif - op[0] = ref[0]; - op[1] = ref[1]; - op[2] = ref[2]; - op[3] = ref[3]; - op += 4; - ref += 4; - ref -= dec32table[op - ref]; - PUT4(ref, op); - op += STEPSIZE - 4; - ref -= dec64; - } else { - LZ4_COPYSTEP(ref, op); - } - cpy = op + length - (STEPSIZE-4); - if (cpy > oend - COPYLENGTH) { - if (cpy > oend) - goto _output_error; /* write outside of buf */ -#if LZ4_ARCH64 - if ((ref + COPYLENGTH) > oend) -#else - if ((ref + COPYLENGTH) > oend || - (op + COPYLENGTH) > oend) -#endif - goto _output_error; - LZ4_SECURECOPY(ref, op, (oend - COPYLENGTH)); - while (op < cpy) - *op++ = *ref++; - op = cpy; - /* - * Check EOF (should never happen, since last 5 bytes - * are supposed to be literals) - */ - if (op == oend) - goto _output_error; - continue; - } - LZ4_SECURECOPY(ref, op, cpy); - op = cpy; /* correction */ +int LZ4_decompress_fast_continue(LZ4_streamDecode_t *LZ4_streamDecode, + const char *source, char *dest, int originalSize) +{ + LZ4_streamDecode_t_internal *lz4sd = &LZ4_streamDecode->internal_donotuse; + int result; + + if (lz4sd->prefixEnd == (BYTE *)dest) { + result = LZ4_decompress_generic(source, dest, 0, originalSize, + endOnOutputSize, full, 0, + usingExtDict, + lz4sd->prefixEnd - lz4sd->prefixSize, + lz4sd->externalDict, lz4sd->extDictSize); + + if (result <= 0) + return result; + + lz4sd->prefixSize += originalSize; + lz4sd->prefixEnd += originalSize; + } else { + lz4sd->extDictSize = lz4sd->prefixSize; + lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize; + result = LZ4_decompress_generic(source, dest, 0, originalSize, + endOnOutputSize, full, 0, + usingExtDict, (BYTE *)dest, + lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) + return result; + lz4sd->prefixSize = originalSize; + lz4sd->prefixEnd = (BYTE *)dest + originalSize; } - /* end of decoding */ - return (int) (((char *) op) - dest); - /* write overflow error detected */ -_output_error: - return -1; + return result; } -int lz4_decompress(const unsigned char *src, size_t *src_len, - unsigned char *dest, size_t actual_dest_len) +/* + * Advanced decoding functions : + * *_usingDict() : + * These decoding functions work the same as "_continue" ones, + * the dictionary must be explicitly provided within parameters + */ +static FORCE_INLINE int LZ4_decompress_usingDict_generic(const char *source, + char *dest, int compressedSize, int maxOutputSize, int safe, + const char *dictStart, int dictSize) { - int ret = -1; - int input_len = 0; - - input_len = lz4_uncompress(src, dest, actual_dest_len); - if (input_len < 0) - goto exit_0; - *src_len = input_len; + if (dictSize == 0) + return LZ4_decompress_generic(source, dest, + compressedSize, maxOutputSize, safe, full, 0, + noDict, (BYTE *)dest, NULL, 0); + if (dictStart + dictSize == dest) { + if (dictSize >= (int)(64 * KB - 1)) + return LZ4_decompress_generic(source, dest, + compressedSize, maxOutputSize, safe, full, 0, + withPrefix64k, (BYTE *)dest - 64 * KB, NULL, 0); + return LZ4_decompress_generic(source, dest, compressedSize, + maxOutputSize, safe, full, 0, noDict, + (BYTE *)dest - dictSize, NULL, 0); + } + return LZ4_decompress_generic(source, dest, compressedSize, + maxOutputSize, safe, full, 0, usingExtDict, + (BYTE *)dest, (const BYTE *)dictStart, dictSize); +} - return 0; -exit_0: - return ret; +int LZ4_decompress_safe_usingDict(const char *source, char *dest, + int compressedSize, int maxOutputSize, + const char *dictStart, int dictSize) +{ + return LZ4_decompress_usingDict_generic(source, dest, + compressedSize, maxOutputSize, 1, dictStart, dictSize); } -#ifndef STATIC -EXPORT_SYMBOL(lz4_decompress); -#endif -int lz4_decompress_unknownoutputsize(const unsigned char *src, size_t src_len, - unsigned char *dest, size_t *dest_len) +int LZ4_decompress_fast_usingDict(const char *source, char *dest, + int originalSize, const char *dictStart, int dictSize) { - int ret = -1; - int out_len = 0; - - out_len = lz4_uncompress_unknownoutputsize(src, dest, src_len, - *dest_len); - if (out_len < 0) - goto exit_0; - *dest_len = out_len; - - return 0; -exit_0: - return ret; + return LZ4_decompress_usingDict_generic(source, dest, 0, + originalSize, 0, dictStart, dictSize); } + +/*-****************************** + * For backwards compatibility + ********************************/ +int lz4_decompress_unknownoutputsize(const unsigned char *src, + size_t src_len, unsigned char *dest, size_t *dest_len) { + *dest_len = LZ4_decompress_safe(src, dest, + src_len, *dest_len); + + /* + * Prior lz4_decompress_unknownoutputsize will return + * 0 for success and a negative result for error + * new LZ4_decompress_safe returns + * - the length of data read on success + * - and also a negative result on error + * meaning when result > 0, we just return 0 here + */ + if (src_len > 0) + return 0; + else + return -1; +} + +int lz4_decompress(const unsigned char *src, size_t *src_len, + unsigned char *dest, size_t actual_dest_len) { + *src_len = LZ4_decompress_fast(src, dest, actual_dest_len); + + /* + * Prior lz4_decompress will return + * 0 for success and a negative result for error + * new LZ4_decompress_fast returns + * - the length of data read on success + * - and also a negative result on error + * meaning when result > 0, we just return 0 here + */ + if (*src_len > 0) + return 0; + else + return -1; +} + #ifndef STATIC +EXPORT_SYMBOL(LZ4_decompress_safe); +EXPORT_SYMBOL(LZ4_decompress_safe_partial); +EXPORT_SYMBOL(LZ4_decompress_fast); +EXPORT_SYMBOL(LZ4_setStreamDecode); +EXPORT_SYMBOL(LZ4_decompress_safe_continue); +EXPORT_SYMBOL(LZ4_decompress_fast_continue); +EXPORT_SYMBOL(LZ4_decompress_safe_usingDict); +EXPORT_SYMBOL(LZ4_decompress_fast_usingDict); EXPORT_SYMBOL(lz4_decompress_unknownoutputsize); +EXPORT_SYMBOL(lz4_decompress); MODULE_LICENSE("Dual BSD/GPL"); -MODULE_DESCRIPTION("LZ4 Decompressor"); +MODULE_DESCRIPTION("LZ4 decompressor"); #endif diff --git a/lib/lz4/lz4defs.h b/lib/lz4/lz4defs.h index c79d7ea8a38e..00a0b58a0871 100644 --- a/lib/lz4/lz4defs.h +++ b/lib/lz4/lz4defs.h @@ -1,157 +1,227 @@ +#ifndef __LZ4DEFS_H__ +#define __LZ4DEFS_H__ + /* - * lz4defs.h -- architecture specific defines - * - * Copyright (C) 2013, LG Electronics, Kyungsik Lee + * lz4defs.h -- common and architecture specific defines for the kernel usage + + * LZ4 - Fast LZ compression algorithm + * Copyright (C) 2011-2016, Yann Collet. + * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * You can contact the author at : + * - LZ4 homepage : http://www.lz4.org + * - LZ4 source repository : https://github.com/lz4/lz4 * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. + * Changed for kernel usage by: + * Sven Schmidt <4sschmid@informatik.uni-hamburg.de> */ -/* - * Detects 64 bits mode - */ +#include +#include /* memset, memcpy */ + +#define FORCE_INLINE __always_inline + +/*-************************************ + * Basic Types + **************************************/ +#include + +typedef uint8_t BYTE; +typedef uint16_t U16; +typedef uint32_t U32; +typedef int32_t S32; +typedef uint64_t U64; +typedef uintptr_t uptrval; + +/*-************************************ + * Architecture specifics + **************************************/ #if defined(CONFIG_64BIT) #define LZ4_ARCH64 1 #else #define LZ4_ARCH64 0 #endif -/* - * Architecture-specific macros - */ -#define BYTE u8 -typedef struct _U16_S { u16 v; } U16_S; -typedef struct _U32_S { u32 v; } U32_S; -typedef struct _U64_S { u64 v; } U64_S; -#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) - -#define A16(x) (((U16_S *)(x))->v) -#define A32(x) (((U32_S *)(x))->v) -#define A64(x) (((U64_S *)(x))->v) - -#define PUT4(s, d) (A32(d) = A32(s)) -#define PUT8(s, d) (A64(d) = A64(s)) - -#define LZ4_READ_LITTLEENDIAN_16(d, s, p) \ - (d = s - A16(p)) - -#define LZ4_WRITE_LITTLEENDIAN_16(p, v) \ - do { \ - A16(p) = v; \ - p += 2; \ - } while (0) -#else /* CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS */ - -#define A64(x) get_unaligned((u64 *)&(((U16_S *)(x))->v)) -#define A32(x) get_unaligned((u32 *)&(((U16_S *)(x))->v)) -#define A16(x) get_unaligned((u16 *)&(((U16_S *)(x))->v)) - -#define PUT4(s, d) \ - put_unaligned(get_unaligned((const u32 *) s), (u32 *) d) -#define PUT8(s, d) \ - put_unaligned(get_unaligned((const u64 *) s), (u64 *) d) - -#define LZ4_READ_LITTLEENDIAN_16(d, s, p) \ - (d = s - get_unaligned_le16(p)) - -#define LZ4_WRITE_LITTLEENDIAN_16(p, v) \ - do { \ - put_unaligned_le16(v, (u16 *)(p)); \ - p += 2; \ - } while (0) +#if defined(__LITTLE_ENDIAN) +#define LZ4_LITTLE_ENDIAN 1 +#else +#define LZ4_LITTLE_ENDIAN 0 #endif -#define COPYLENGTH 8 -#define ML_BITS 4 -#define ML_MASK ((1U << ML_BITS) - 1) +/*-************************************ + * Constants + **************************************/ +#define MINMATCH 4 + +#define WILDCOPYLENGTH 8 +#define LASTLITERALS 5 +#define MFLIMIT (WILDCOPYLENGTH + MINMATCH) + +/* Increase this value ==> compression run slower on incompressible data */ +#define LZ4_SKIPTRIGGER 6 + +#define HASH_UNIT sizeof(size_t) + +#define KB (1 << 10) +#define MB (1 << 20) +#define GB (1U << 30) + +#define MAXD_LOG 16 +#define MAX_DISTANCE ((1 << MAXD_LOG) - 1) +#define STEPSIZE sizeof(size_t) + +#define ML_BITS 4 +#define ML_MASK ((1U << ML_BITS) - 1) #define RUN_BITS (8 - ML_BITS) #define RUN_MASK ((1U << RUN_BITS) - 1) -#define MEMORY_USAGE 14 -#define MINMATCH 4 -#define SKIPSTRENGTH 6 -#define LASTLITERALS 5 -#define MFLIMIT (COPYLENGTH + MINMATCH) -#define MINLENGTH (MFLIMIT + 1) -#define MAXD_LOG 16 -#define MAXD (1 << MAXD_LOG) -#define MAXD_MASK (u32)(MAXD - 1) -#define MAX_DISTANCE (MAXD - 1) -#define HASH_LOG (MAXD_LOG - 1) -#define HASHTABLESIZE (1 << HASH_LOG) -#define MAX_NB_ATTEMPTS 256 -#define OPTIMAL_ML (int)((ML_MASK-1)+MINMATCH) -#define LZ4_64KLIMIT ((1<<16) + (MFLIMIT - 1)) -#define HASHLOG64K ((MEMORY_USAGE - 2) + 1) -#define HASH64KTABLESIZE (1U << HASHLOG64K) -#define LZ4_HASH_VALUE(p) (((A32(p)) * 2654435761U) >> \ - ((MINMATCH * 8) - (MEMORY_USAGE-2))) -#define LZ4_HASH64K_VALUE(p) (((A32(p)) * 2654435761U) >> \ - ((MINMATCH * 8) - HASHLOG64K)) -#define HASH_VALUE(p) (((A32(p)) * 2654435761U) >> \ - ((MINMATCH * 8) - HASH_LOG)) - -#if LZ4_ARCH64/* 64-bit */ -#define STEPSIZE 8 - -#define LZ4_COPYSTEP(s, d) \ - do { \ - PUT8(s, d); \ - d += 8; \ - s += 8; \ - } while (0) - -#define LZ4_COPYPACKET(s, d) LZ4_COPYSTEP(s, d) - -#define LZ4_SECURECOPY(s, d, e) \ - do { \ - if (d < e) { \ - LZ4_WILDCOPY(s, d, e); \ - } \ - } while (0) -#define HTYPE u32 - -#ifdef __BIG_ENDIAN -#define LZ4_NBCOMMONBYTES(val) (__builtin_clzll(val) >> 3) + +/*-************************************ + * Reading and writing into memory + **************************************/ +static FORCE_INLINE U16 LZ4_read16(const void *ptr) +{ + return get_unaligned((const U16 *)ptr); +} + +static FORCE_INLINE U32 LZ4_read32(const void *ptr) +{ + return get_unaligned((const U32 *)ptr); +} + +static FORCE_INLINE size_t LZ4_read_ARCH(const void *ptr) +{ + return get_unaligned((const size_t *)ptr); +} + +static FORCE_INLINE void LZ4_write16(void *memPtr, U16 value) +{ + put_unaligned(value, (U16 *)memPtr); +} + +static FORCE_INLINE void LZ4_write32(void *memPtr, U32 value) +{ + put_unaligned(value, (U32 *)memPtr); +} + +static FORCE_INLINE U16 LZ4_readLE16(const void *memPtr) +{ + return get_unaligned_le16(memPtr); +} + +static FORCE_INLINE void LZ4_writeLE16(void *memPtr, U16 value) +{ + return put_unaligned_le16(value, memPtr); +} + +static FORCE_INLINE void LZ4_copy8(void *dst, const void *src) +{ +#if LZ4_ARCH64 + U64 a = get_unaligned((const U64 *)src); + + put_unaligned(a, (U64 *)dst); +#else + U32 a = get_unaligned((const U32 *)src); + U32 b = get_unaligned((const U32 *)src + 1); + + put_unaligned(a, (U32 *)dst); + put_unaligned(b, (U32 *)dst + 1); +#endif +} + +/* + * customized variant of memcpy, + * which can overwrite up to 7 bytes beyond dstEnd + */ +static FORCE_INLINE void LZ4_wildCopy(void *dstPtr, + const void *srcPtr, void *dstEnd) +{ + BYTE *d = (BYTE *)dstPtr; + const BYTE *s = (const BYTE *)srcPtr; + BYTE *const e = (BYTE *)dstEnd; + + do { + LZ4_copy8(d, s); + d += 8; + s += 8; + } while (d < e); +} + +static FORCE_INLINE unsigned int LZ4_NbCommonBytes(register size_t val) +{ +#if LZ4_LITTLE_ENDIAN + return __ffs(val) >> 3; #else -#define LZ4_NBCOMMONBYTES(val) (__builtin_ctzll(val) >> 3) + return (BITS_PER_LONG - 1 - __fls(val)) >> 3; +#endif +} + +static FORCE_INLINE unsigned int LZ4_count( + const BYTE *pIn, + const BYTE *pMatch, + const BYTE *pInLimit) +{ + const BYTE *const pStart = pIn; + + while (likely(pIn < pInLimit - (STEPSIZE - 1))) { + size_t const diff = LZ4_read_ARCH(pMatch) ^ LZ4_read_ARCH(pIn); + + if (!diff) { + pIn += STEPSIZE; + pMatch += STEPSIZE; + continue; + } + + pIn += LZ4_NbCommonBytes(diff); + + return (unsigned int)(pIn - pStart); + } + +#if LZ4_ARCH64 + if ((pIn < (pInLimit - 3)) + && (LZ4_read32(pMatch) == LZ4_read32(pIn))) { + pIn += 4; + pMatch += 4; + } #endif -#else /* 32-bit */ -#define STEPSIZE 4 + if ((pIn < (pInLimit - 1)) + && (LZ4_read16(pMatch) == LZ4_read16(pIn))) { + pIn += 2; + pMatch += 2; + } -#define LZ4_COPYSTEP(s, d) \ - do { \ - PUT4(s, d); \ - d += 4; \ - s += 4; \ - } while (0) + if ((pIn < pInLimit) && (*pMatch == *pIn)) + pIn++; -#define LZ4_COPYPACKET(s, d) \ - do { \ - LZ4_COPYSTEP(s, d); \ - LZ4_COPYSTEP(s, d); \ - } while (0) + return (unsigned int)(pIn - pStart); +} -#define LZ4_SECURECOPY LZ4_WILDCOPY -#define HTYPE const u8* +typedef enum { noLimit = 0, limitedOutput = 1 } limitedOutput_directive; +typedef enum { byPtr, byU32, byU16 } tableType_t; -#ifdef __BIG_ENDIAN -#define LZ4_NBCOMMONBYTES(val) (__builtin_clz(val) >> 3) -#else -#define LZ4_NBCOMMONBYTES(val) (__builtin_ctz(val) >> 3) -#endif +typedef enum { noDict = 0, withPrefix64k, usingExtDict } dict_directive; +typedef enum { noDictIssue = 0, dictSmall } dictIssue_directive; -#endif +typedef enum { endOnOutputSize = 0, endOnInputSize = 1 } endCondition_directive; +typedef enum { full = 0, partial = 1 } earlyEnd_directive; -#define LZ4_WILDCOPY(s, d, e) \ - do { \ - LZ4_COPYPACKET(s, d); \ - } while (d < e) - -#define LZ4_BLINDCOPY(s, d, l) \ - do { \ - u8 *e = (d) + l; \ - LZ4_WILDCOPY(s, d, e); \ - d = e; \ - } while (0) +#endif diff --git a/lib/lz4/lz4hc_compress.c b/lib/lz4/lz4hc_compress.c index f344f76b6559..e1fbf1561e55 100644 --- a/lib/lz4/lz4hc_compress.c +++ b/lib/lz4/lz4hc_compress.c @@ -1,19 +1,17 @@ /* * LZ4 HC - High Compression Mode of LZ4 - * Copyright (C) 2011-2012, Yann Collet. - * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + * Copyright (C) 2011-2015, Yann Collet. * + * BSD 2 - Clause License (http://www.opensource.org/licenses/bsd - license.php) * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. - * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -25,323 +23,361 @@ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * * You can contact the author at : - * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html - * - LZ4 source repository : http://code.google.com/p/lz4/ + * - LZ4 homepage : http://www.lz4.org + * - LZ4 source repository : https://github.com/lz4/lz4 * - * Changed for kernel use by: - * Chanho Min + * Changed for kernel usage by: + * Sven Schmidt <4sschmid@informatik.uni-hamburg.de> */ -#include -#include +/*-************************************ + * Dependencies + **************************************/ #include -#include #include "lz4defs.h" +#include +#include +#include /* memset */ + +/* ************************************* + * Local Constants and types + ***************************************/ -struct lz4hc_data { - const u8 *base; - HTYPE hashtable[HASHTABLESIZE]; - u16 chaintable[MAXD]; - const u8 *nexttoupdate; -} __attribute__((__packed__)); +#define OPTIMAL_ML (int)((ML_MASK - 1) + MINMATCH) -static inline int lz4hc_init(struct lz4hc_data *hc4, const u8 *base) +#define HASH_FUNCTION(i) (((i) * 2654435761U) \ + >> ((MINMATCH*8) - LZ4HC_HASH_LOG)) +#define DELTANEXTU16(p) chainTable[(U16)(p)] /* faster */ + +static U32 LZ4HC_hashPtr(const void *ptr) +{ + return HASH_FUNCTION(LZ4_read32(ptr)); +} + +/************************************** + * HC Compression + **************************************/ +static void LZ4HC_init(LZ4HC_CCtx_internal *hc4, const BYTE *start) { - memset((void *)hc4->hashtable, 0, sizeof(hc4->hashtable)); - memset(hc4->chaintable, 0xFF, sizeof(hc4->chaintable)); - -#if LZ4_ARCH64 - hc4->nexttoupdate = base + 1; -#else - hc4->nexttoupdate = base; -#endif - hc4->base = base; - return 1; + memset((void *)hc4->hashTable, 0, sizeof(hc4->hashTable)); + memset(hc4->chainTable, 0xFF, sizeof(hc4->chainTable)); + hc4->nextToUpdate = 64 * KB; + hc4->base = start - 64 * KB; + hc4->end = start; + hc4->dictBase = start - 64 * KB; + hc4->dictLimit = 64 * KB; + hc4->lowLimit = 64 * KB; } /* Update chains up to ip (excluded) */ -static inline void lz4hc_insert(struct lz4hc_data *hc4, const u8 *ip) +static FORCE_INLINE void LZ4HC_Insert(LZ4HC_CCtx_internal *hc4, + const BYTE *ip) { - u16 *chaintable = hc4->chaintable; - HTYPE *hashtable = hc4->hashtable; -#if LZ4_ARCH64 + U16 * const chainTable = hc4->chainTable; + U32 * const hashTable = hc4->hashTable; const BYTE * const base = hc4->base; -#else - const int base = 0; -#endif + U32 const target = (U32)(ip - base); + U32 idx = hc4->nextToUpdate; + + while (idx < target) { + U32 const h = LZ4HC_hashPtr(base + idx); + size_t delta = idx - hashTable[h]; - while (hc4->nexttoupdate < ip) { - const u8 *p = hc4->nexttoupdate; - size_t delta = p - (hashtable[HASH_VALUE(p)] + base); if (delta > MAX_DISTANCE) delta = MAX_DISTANCE; - chaintable[(size_t)(p) & MAXD_MASK] = (u16)delta; - hashtable[HASH_VALUE(p)] = (p) - base; - hc4->nexttoupdate++; - } -} -static inline size_t lz4hc_commonlength(const u8 *p1, const u8 *p2, - const u8 *const matchlimit) -{ - const u8 *p1t = p1; - - while (p1t < matchlimit - (STEPSIZE - 1)) { -#if LZ4_ARCH64 - u64 diff = A64(p2) ^ A64(p1t); -#else - u32 diff = A32(p2) ^ A32(p1t); -#endif - if (!diff) { - p1t += STEPSIZE; - p2 += STEPSIZE; - continue; - } - p1t += LZ4_NBCOMMONBYTES(diff); - return p1t - p1; - } -#if LZ4_ARCH64 - if ((p1t < (matchlimit-3)) && (A32(p2) == A32(p1t))) { - p1t += 4; - p2 += 4; - } -#endif + DELTANEXTU16(idx) = (U16)delta; - if ((p1t < (matchlimit - 1)) && (A16(p2) == A16(p1t))) { - p1t += 2; - p2 += 2; + hashTable[h] = idx; + idx++; } - if ((p1t < matchlimit) && (*p2 == *p1t)) - p1t++; - return p1t - p1; + + hc4->nextToUpdate = target; } -static inline int lz4hc_insertandfindbestmatch(struct lz4hc_data *hc4, - const u8 *ip, const u8 *const matchlimit, const u8 **matchpos) +static FORCE_INLINE int LZ4HC_InsertAndFindBestMatch( + LZ4HC_CCtx_internal *hc4, /* Index table will be updated */ + const BYTE *ip, + const BYTE * const iLimit, + const BYTE **matchpos, + const int maxNbAttempts) { - u16 *const chaintable = hc4->chaintable; - HTYPE *const hashtable = hc4->hashtable; - const u8 *ref; -#if LZ4_ARCH64 + U16 * const chainTable = hc4->chainTable; + U32 * const HashTable = hc4->hashTable; const BYTE * const base = hc4->base; -#else - const int base = 0; -#endif - int nbattempts = MAX_NB_ATTEMPTS; - size_t repl = 0, ml = 0; - u16 delta; + const BYTE * const dictBase = hc4->dictBase; + const U32 dictLimit = hc4->dictLimit; + const U32 lowLimit = (hc4->lowLimit + 64 * KB > (U32)(ip - base)) + ? hc4->lowLimit + : (U32)(ip - base) - (64 * KB - 1); + U32 matchIndex; + int nbAttempts = maxNbAttempts; + size_t ml = 0; /* HC4 match finder */ - lz4hc_insert(hc4, ip); - ref = hashtable[HASH_VALUE(ip)] + base; - - /* potential repetition */ - if (ref >= ip-4) { - /* confirmed */ - if (A32(ref) == A32(ip)) { - delta = (u16)(ip-ref); - repl = ml = lz4hc_commonlength(ip + MINMATCH, - ref + MINMATCH, matchlimit) + MINMATCH; - *matchpos = ref; - } - ref -= (size_t)chaintable[(size_t)(ref) & MAXD_MASK]; - } + LZ4HC_Insert(hc4, ip); + matchIndex = HashTable[LZ4HC_hashPtr(ip)]; + + while ((matchIndex >= lowLimit) + && (nbAttempts)) { + nbAttempts--; + if (matchIndex >= dictLimit) { + const BYTE * const match = base + matchIndex; + + if (*(match + ml) == *(ip + ml) + && (LZ4_read32(match) == LZ4_read32(ip))) { + size_t const mlt = LZ4_count(ip + MINMATCH, + match + MINMATCH, iLimit) + MINMATCH; - while ((ref >= ip - MAX_DISTANCE) && nbattempts) { - nbattempts--; - if (*(ref + ml) == *(ip + ml)) { - if (A32(ref) == A32(ip)) { - size_t mlt = - lz4hc_commonlength(ip + MINMATCH, - ref + MINMATCH, matchlimit) + MINMATCH; if (mlt > ml) { ml = mlt; - *matchpos = ref; + *matchpos = match; + } + } + } else { + const BYTE * const match = dictBase + matchIndex; + + if (LZ4_read32(match) == LZ4_read32(ip)) { + size_t mlt; + const BYTE *vLimit = ip + + (dictLimit - matchIndex); + + if (vLimit > iLimit) + vLimit = iLimit; + mlt = LZ4_count(ip + MINMATCH, + match + MINMATCH, vLimit) + MINMATCH; + if ((ip + mlt == vLimit) + && (vLimit < iLimit)) + mlt += LZ4_count(ip + mlt, + base + dictLimit, + iLimit); + if (mlt > ml) { + /* virtual matchpos */ + ml = mlt; + *matchpos = base + matchIndex; } } } - ref -= (size_t)chaintable[(size_t)(ref) & MAXD_MASK]; - } - - /* Complete table */ - if (repl) { - const BYTE *ptr = ip; - const BYTE *end; - end = ip + repl - (MINMATCH-1); - /* Pre-Load */ - while (ptr < end - delta) { - chaintable[(size_t)(ptr) & MAXD_MASK] = delta; - ptr++; - } - do { - chaintable[(size_t)(ptr) & MAXD_MASK] = delta; - /* Head of chain */ - hashtable[HASH_VALUE(ptr)] = (ptr) - base; - ptr++; - } while (ptr < end); - hc4->nexttoupdate = end; + matchIndex -= DELTANEXTU16(matchIndex); } return (int)ml; } -static inline int lz4hc_insertandgetwidermatch(struct lz4hc_data *hc4, - const u8 *ip, const u8 *startlimit, const u8 *matchlimit, int longest, - const u8 **matchpos, const u8 **startpos) +static FORCE_INLINE int LZ4HC_InsertAndGetWiderMatch( + LZ4HC_CCtx_internal *hc4, + const BYTE * const ip, + const BYTE * const iLowLimit, + const BYTE * const iHighLimit, + int longest, + const BYTE **matchpos, + const BYTE **startpos, + const int maxNbAttempts) { - u16 *const chaintable = hc4->chaintable; - HTYPE *const hashtable = hc4->hashtable; -#if LZ4_ARCH64 + U16 * const chainTable = hc4->chainTable; + U32 * const HashTable = hc4->hashTable; const BYTE * const base = hc4->base; -#else - const int base = 0; -#endif - const u8 *ref; - int nbattempts = MAX_NB_ATTEMPTS; - int delta = (int)(ip - startlimit); + const U32 dictLimit = hc4->dictLimit; + const BYTE * const lowPrefixPtr = base + dictLimit; + const U32 lowLimit = (hc4->lowLimit + 64 * KB > (U32)(ip - base)) + ? hc4->lowLimit + : (U32)(ip - base) - (64 * KB - 1); + const BYTE * const dictBase = hc4->dictBase; + U32 matchIndex; + int nbAttempts = maxNbAttempts; + int delta = (int)(ip - iLowLimit); /* First Match */ - lz4hc_insert(hc4, ip); - ref = hashtable[HASH_VALUE(ip)] + base; - - while ((ref >= ip - MAX_DISTANCE) && (ref >= hc4->base) - && (nbattempts)) { - nbattempts--; - if (*(startlimit + longest) == *(ref - delta + longest)) { - if (A32(ref) == A32(ip)) { - const u8 *reft = ref + MINMATCH; - const u8 *ipt = ip + MINMATCH; - const u8 *startt = ip; - - while (ipt < matchlimit-(STEPSIZE - 1)) { - #if LZ4_ARCH64 - u64 diff = A64(reft) ^ A64(ipt); - #else - u32 diff = A32(reft) ^ A32(ipt); - #endif - - if (!diff) { - ipt += STEPSIZE; - reft += STEPSIZE; - continue; + LZ4HC_Insert(hc4, ip); + matchIndex = HashTable[LZ4HC_hashPtr(ip)]; + + while ((matchIndex >= lowLimit) + && (nbAttempts)) { + nbAttempts--; + if (matchIndex >= dictLimit) { + const BYTE *matchPtr = base + matchIndex; + + if (*(iLowLimit + longest) + == *(matchPtr - delta + longest)) { + if (LZ4_read32(matchPtr) == LZ4_read32(ip)) { + int mlt = MINMATCH + LZ4_count( + ip + MINMATCH, + matchPtr + MINMATCH, + iHighLimit); + int back = 0; + + while ((ip + back > iLowLimit) + && (matchPtr + back > lowPrefixPtr) + && (ip[back - 1] == matchPtr[back - 1])) + back--; + + mlt -= back; + + if (mlt > longest) { + longest = (int)mlt; + *matchpos = matchPtr + back; + *startpos = ip + back; } - ipt += LZ4_NBCOMMONBYTES(diff); - goto _endcount; - } - #if LZ4_ARCH64 - if ((ipt < (matchlimit - 3)) - && (A32(reft) == A32(ipt))) { - ipt += 4; - reft += 4; } - ipt += 2; - #endif - if ((ipt < (matchlimit - 1)) - && (A16(reft) == A16(ipt))) { - reft += 2; - } - if ((ipt < matchlimit) && (*reft == *ipt)) - ipt++; -_endcount: - reft = ref; - - while ((startt > startlimit) - && (reft > hc4->base) - && (startt[-1] == reft[-1])) { - startt--; - reft--; - } - - if ((ipt - startt) > longest) { - longest = (int)(ipt - startt); - *matchpos = reft; - *startpos = startt; + } + } else { + const BYTE * const matchPtr = dictBase + matchIndex; + + if (LZ4_read32(matchPtr) == LZ4_read32(ip)) { + size_t mlt; + int back = 0; + const BYTE *vLimit = ip + (dictLimit - matchIndex); + + if (vLimit > iHighLimit) + vLimit = iHighLimit; + + mlt = LZ4_count(ip + MINMATCH, + matchPtr + MINMATCH, vLimit) + MINMATCH; + + if ((ip + mlt == vLimit) && (vLimit < iHighLimit)) + mlt += LZ4_count(ip + mlt, base + dictLimit, + iHighLimit); + while ((ip + back > iLowLimit) + && (matchIndex + back > lowLimit) + && (ip[back - 1] == matchPtr[back - 1])) + back--; + + mlt -= back; + + if ((int)mlt > longest) { + longest = (int)mlt; + *matchpos = base + matchIndex + back; + *startpos = ip + back; } } } - ref -= (size_t)chaintable[(size_t)(ref) & MAXD_MASK]; + + matchIndex -= DELTANEXTU16(matchIndex); } + return longest; } -static inline int lz4_encodesequence(const u8 **ip, u8 **op, const u8 **anchor, - int ml, const u8 *ref) +static FORCE_INLINE int LZ4HC_encodeSequence( + const BYTE **ip, + BYTE **op, + const BYTE **anchor, + int matchLength, + const BYTE * const match, + limitedOutput_directive limitedOutputBuffer, + BYTE *oend) { - int length, len; - u8 *token; + int length; + BYTE *token; /* Encode Literal length */ length = (int)(*ip - *anchor); token = (*op)++; + + if ((limitedOutputBuffer) + && ((*op + (length>>8) + + length + (2 + 1 + LASTLITERALS)) > oend)) { + /* Check output limit */ + return 1; + } if (length >= (int)RUN_MASK) { - *token = (RUN_MASK << ML_BITS); + int len; + + *token = (RUN_MASK< 254 ; len -= 255) *(*op)++ = 255; - *(*op)++ = (u8)len; + *(*op)++ = (BYTE)len; } else - *token = (length << ML_BITS); + *token = (BYTE)(length<= (int)ML_MASK) { + length = (int)(matchLength - MINMATCH); + + if ((limitedOutputBuffer) + && (*op + (length>>8) + + (1 + LASTLITERALS) > oend)) { + /* Check output limit */ + return 1; + } + + if (length >= (int)ML_MASK) { *token += ML_MASK; - len -= ML_MASK; - for (; len > 509 ; len -= 510) { + length -= ML_MASK; + + for (; length > 509 ; length -= 510) { *(*op)++ = 255; *(*op)++ = 255; } - if (len > 254) { - len -= 255; + + if (length > 254) { + length -= 255; *(*op)++ = 255; } - *(*op)++ = (u8)len; + + *(*op)++ = (BYTE)length; } else - *token += len; + *token += (BYTE)(length); /* Prepare next loop */ - *ip += ml; + *ip += matchLength; *anchor = *ip; return 0; } -static int lz4_compresshcctx(struct lz4hc_data *ctx, - const char *source, - char *dest, - int isize) +static int LZ4HC_compress_generic( + LZ4HC_CCtx_internal *const ctx, + const char * const source, + char * const dest, + int const inputSize, + int const maxOutputSize, + int compressionLevel, + limitedOutput_directive limit + ) { - const u8 *ip = (const u8 *)source; - const u8 *anchor = ip; - const u8 *const iend = ip + isize; - const u8 *const mflimit = iend - MFLIMIT; - const u8 *const matchlimit = (iend - LASTLITERALS); + const BYTE *ip = (const BYTE *) source; + const BYTE *anchor = ip; + const BYTE * const iend = ip + inputSize; + const BYTE * const mflimit = iend - MFLIMIT; + const BYTE * const matchlimit = (iend - LASTLITERALS); - u8 *op = (u8 *)dest; + BYTE *op = (BYTE *) dest; + BYTE * const oend = op + maxOutputSize; + unsigned int maxNbAttempts; int ml, ml2, ml3, ml0; - const u8 *ref = NULL; - const u8 *start2 = NULL; - const u8 *ref2 = NULL; - const u8 *start3 = NULL; - const u8 *ref3 = NULL; - const u8 *start0; - const u8 *ref0; - int lastrun; + const BYTE *ref = NULL; + const BYTE *start2 = NULL; + const BYTE *ref2 = NULL; + const BYTE *start3 = NULL; + const BYTE *ref3 = NULL; + const BYTE *start0; + const BYTE *ref0; + + /* init */ + if (compressionLevel > LZ4HC_MAX_CLEVEL) + compressionLevel = LZ4HC_MAX_CLEVEL; + if (compressionLevel < 1) + compressionLevel = LZ4HC_DEFAULT_CLEVEL; + maxNbAttempts = 1 << (compressionLevel - 1); + ctx->end += inputSize; ip++; /* Main Loop */ while (ip < mflimit) { - ml = lz4hc_insertandfindbestmatch(ctx, ip, matchlimit, (&ref)); + ml = LZ4HC_InsertAndFindBestMatch(ctx, ip, + matchlimit, (&ref), maxNbAttempts); if (!ml) { ip++; continue; @@ -351,51 +387,59 @@ static int lz4_compresshcctx(struct lz4hc_data *ctx, start0 = ip; ref0 = ref; ml0 = ml; -_search2: - if (ip+ml < mflimit) - ml2 = lz4hc_insertandgetwidermatch(ctx, ip + ml - 2, - ip + 1, matchlimit, ml, &ref2, &start2); + +_Search2: + if (ip + ml < mflimit) + ml2 = LZ4HC_InsertAndGetWiderMatch(ctx, + ip + ml - 2, ip + 0, + matchlimit, ml, &ref2, + &start2, maxNbAttempts); else ml2 = ml; - /* No better match */ + if (ml2 == ml) { - lz4_encodesequence(&ip, &op, &anchor, ml, ref); + /* No better match */ + if (LZ4HC_encodeSequence(&ip, &op, + &anchor, ml, ref, limit, oend)) + return 0; continue; } if (start0 < ip) { - /* empirical */ if (start2 < ip + ml0) { + /* empirical */ ip = start0; ref = ref0; ml = ml0; } } - /* - * Here, start0==ip - * First Match too small : removed - */ + + /* Here, start0 == ip */ if ((start2 - ip) < 3) { + /* First Match too small : removed */ ml = ml2; ip = start2; ref = ref2; - goto _search2; + goto _Search2; } -_search3: +_Search3: /* - * Currently we have : - * ml2 > ml1, and - * ip1+3 <= ip2 (usually < ip1+ml1) - */ + * Currently we have : + * ml2 > ml1, and + * ip1 + 3 <= ip2 (usually < ip1 + ml1) + */ if ((start2 - ip) < OPTIMAL_ML) { int correction; int new_ml = ml; + if (new_ml > OPTIMAL_ML) new_ml = OPTIMAL_ML; if (ip + new_ml > start2 + ml2 - MINMATCH) new_ml = (int)(start2 - ip) + ml2 - MINMATCH; + correction = new_ml - (int)(start2 - ip); + if (correction > 0) { start2 += correction; ref2 += correction; @@ -403,39 +447,44 @@ _search3: } } /* - * Now, we have start2 = ip+new_ml, - * with new_ml=min(ml, OPTIMAL_ML=18) + * Now, we have start2 = ip + new_ml, + * with new_ml = min(ml, OPTIMAL_ML = 18) */ + if (start2 + ml2 < mflimit) - ml3 = lz4hc_insertandgetwidermatch(ctx, - start2 + ml2 - 3, start2, matchlimit, - ml2, &ref3, &start3); + ml3 = LZ4HC_InsertAndGetWiderMatch(ctx, + start2 + ml2 - 3, start2, + matchlimit, ml2, &ref3, &start3, + maxNbAttempts); else ml3 = ml2; - /* No better match : 2 sequences to encode */ if (ml3 == ml2) { + /* No better match : 2 sequences to encode */ /* ip & ref are known; Now for ml */ - if (start2 < ip+ml) + if (start2 < ip + ml) ml = (int)(start2 - ip); - /* Now, encode 2 sequences */ - lz4_encodesequence(&ip, &op, &anchor, ml, ref); + if (LZ4HC_encodeSequence(&ip, &op, &anchor, + ml, ref, limit, oend)) + return 0; ip = start2; - lz4_encodesequence(&ip, &op, &anchor, ml2, ref2); + if (LZ4HC_encodeSequence(&ip, &op, &anchor, + ml2, ref2, limit, oend)) + return 0; continue; } - /* Not enough space for match 2 : remove it */ if (start3 < ip + ml + 3) { - /* - * can write Seq1 immediately ==> Seq2 is removed, - * so Seq3 becomes Seq1 - */ + /* Not enough space for match 2 : remove it */ if (start3 >= (ip + ml)) { + /* can write Seq1 immediately + * ==> Seq2 is removed, + * so Seq3 becomes Seq1 + */ if (start2 < ip + ml) { - int correction = - (int)(ip + ml - start2); + int correction = (int)(ip + ml - start2); + start2 += correction; ref2 += correction; ml2 -= correction; @@ -446,35 +495,38 @@ _search3: } } - lz4_encodesequence(&ip, &op, &anchor, ml, ref); - ip = start3; + if (LZ4HC_encodeSequence(&ip, &op, &anchor, + ml, ref, limit, oend)) + return 0; + ip = start3; ref = ref3; - ml = ml3; + ml = ml3; start0 = start2; ref0 = ref2; ml0 = ml2; - goto _search2; + goto _Search2; } start2 = start3; ref2 = ref3; ml2 = ml3; - goto _search3; + goto _Search3; } /* - * OK, now we have 3 ascending matches; let's write at least - * the first one ip & ref are known; Now for ml - */ + * OK, now we have 3 ascending matches; + * let's write at least the first one + * ip & ref are known; Now for ml + */ if (start2 < ip + ml) { if ((start2 - ip) < (int)ML_MASK) { int correction; + if (ml > OPTIMAL_ML) ml = OPTIMAL_ML; if (ip + ml > start2 + ml2 - MINMATCH) - ml = (int)(start2 - ip) + ml2 - - MINMATCH; + ml = (int)(start2 - ip) + ml2 - MINMATCH; correction = ml - (int)(start2 - ip); if (correction > 0) { start2 += correction; @@ -484,7 +536,9 @@ _search3: } else ml = (int)(start2 - ip); } - lz4_encodesequence(&ip, &op, &anchor, ml, ref); + if (LZ4HC_encodeSequence(&ip, &op, &anchor, ml, + ref, limit, oend)) + return 0; ip = start2; ref = ref2; @@ -494,46 +548,245 @@ _search3: ref2 = ref3; ml2 = ml3; - goto _search3; + goto _Search3; } /* Encode Last Literals */ - lastrun = (int)(iend - anchor); - if (lastrun >= (int)RUN_MASK) { - *op++ = (RUN_MASK << ML_BITS); - lastrun -= RUN_MASK; - for (; lastrun > 254 ; lastrun -= 255) - *op++ = 255; - *op++ = (u8) lastrun; - } else - *op++ = (lastrun << ML_BITS); - memcpy(op, anchor, iend - anchor); - op += iend - anchor; + { + int lastRun = (int)(iend - anchor); + + if ((limit) + && (((char *)op - dest) + lastRun + 1 + + ((lastRun + 255 - RUN_MASK)/255) + > (U32)maxOutputSize)) { + /* Check output limit */ + return 0; + } + if (lastRun >= (int)RUN_MASK) { + *op++ = (RUN_MASK< 254 ; lastRun -= 255) + *op++ = 255; + *op++ = (BYTE) lastRun; + } else + *op++ = (BYTE)(lastRun<internal_donotuse; - struct lz4hc_data *hc4 = (struct lz4hc_data *)wrkmem; - lz4hc_init(hc4, (const u8 *)src); - out_len = lz4_compresshcctx((struct lz4hc_data *)hc4, (const u8 *)src, - (char *)dst, (int)src_len); + if (((size_t)(state)&(sizeof(void *) - 1)) != 0) { + /* Error : state is not aligned + * for pointers (32 or 64 bits) + */ + return 0; + } - if (out_len < 0) - goto exit; + LZ4HC_init(ctx, (const BYTE *)src); - *dst_len = out_len; - return 0; + if (maxDstSize < LZ4_compressBound(srcSize)) + return LZ4HC_compress_generic(ctx, src, dst, + srcSize, maxDstSize, compressionLevel, limitedOutput); + else + return LZ4HC_compress_generic(ctx, src, dst, + srcSize, maxDstSize, compressionLevel, noLimit); +} + +int LZ4_compress_HC(const char *src, char *dst, int srcSize, + int maxDstSize, int compressionLevel, void *wrkmem) +{ + return LZ4_compress_HC_extStateHC(wrkmem, src, dst, + srcSize, maxDstSize, compressionLevel); +} +EXPORT_SYMBOL(LZ4_compress_HC); + +/************************************** + * Streaming Functions + **************************************/ +void LZ4_resetStreamHC(LZ4_streamHC_t *LZ4_streamHCPtr, int compressionLevel) +{ + LZ4_streamHCPtr->internal_donotuse.base = NULL; + LZ4_streamHCPtr->internal_donotuse.compressionLevel = (unsigned int)compressionLevel; +} + +int LZ4_loadDictHC(LZ4_streamHC_t *LZ4_streamHCPtr, + const char *dictionary, + int dictSize) +{ + LZ4HC_CCtx_internal *ctxPtr = &LZ4_streamHCPtr->internal_donotuse; + + if (dictSize > 64 * KB) { + dictionary += dictSize - 64 * KB; + dictSize = 64 * KB; + } + LZ4HC_init(ctxPtr, (const BYTE *)dictionary); + if (dictSize >= 4) + LZ4HC_Insert(ctxPtr, (const BYTE *)dictionary + (dictSize - 3)); + ctxPtr->end = (const BYTE *)dictionary + dictSize; + return dictSize; +} +EXPORT_SYMBOL(LZ4_loadDictHC); + +/* compression */ + +static void LZ4HC_setExternalDict( + LZ4HC_CCtx_internal *ctxPtr, + const BYTE *newBlock) +{ + if (ctxPtr->end >= ctxPtr->base + 4) { + /* Referencing remaining dictionary content */ + LZ4HC_Insert(ctxPtr, ctxPtr->end - 3); + } + + /* + * Only one memory segment for extDict, + * so any previous extDict is lost at this stage + */ + ctxPtr->lowLimit = ctxPtr->dictLimit; + ctxPtr->dictLimit = (U32)(ctxPtr->end - ctxPtr->base); + ctxPtr->dictBase = ctxPtr->base; + ctxPtr->base = newBlock - ctxPtr->dictLimit; + ctxPtr->end = newBlock; + /* match referencing will resume from there */ + ctxPtr->nextToUpdate = ctxPtr->dictLimit; +} +EXPORT_SYMBOL(LZ4HC_setExternalDict); + +static int LZ4_compressHC_continue_generic( + LZ4_streamHC_t *LZ4_streamHCPtr, + const char *source, + char *dest, + int inputSize, + int maxOutputSize, + limitedOutput_directive limit) +{ + LZ4HC_CCtx_internal *ctxPtr = &LZ4_streamHCPtr->internal_donotuse; + + /* auto - init if forgotten */ + if (ctxPtr->base == NULL) + LZ4HC_init(ctxPtr, (const BYTE *) source); + + /* Check overflow */ + if ((size_t)(ctxPtr->end - ctxPtr->base) > 2 * GB) { + size_t dictSize = (size_t)(ctxPtr->end - ctxPtr->base) + - ctxPtr->dictLimit; + if (dictSize > 64 * KB) + dictSize = 64 * KB; + LZ4_loadDictHC(LZ4_streamHCPtr, + (const char *)(ctxPtr->end) - dictSize, (int)dictSize); + } + + /* Check if blocks follow each other */ + if ((const BYTE *)source != ctxPtr->end) + LZ4HC_setExternalDict(ctxPtr, (const BYTE *)source); + + /* Check overlapping input/dictionary space */ + { + const BYTE *sourceEnd = (const BYTE *) source + inputSize; + const BYTE * const dictBegin = ctxPtr->dictBase + ctxPtr->lowLimit; + const BYTE * const dictEnd = ctxPtr->dictBase + ctxPtr->dictLimit; + + if ((sourceEnd > dictBegin) + && ((const BYTE *)source < dictEnd)) { + if (sourceEnd > dictEnd) + sourceEnd = dictEnd; + ctxPtr->lowLimit = (U32)(sourceEnd - ctxPtr->dictBase); + + if (ctxPtr->dictLimit - ctxPtr->lowLimit < 4) + ctxPtr->lowLimit = ctxPtr->dictLimit; + } + } + + return LZ4HC_compress_generic(ctxPtr, source, dest, + inputSize, maxOutputSize, ctxPtr->compressionLevel, limit); +} + +int LZ4_compress_HC_continue( + LZ4_streamHC_t *LZ4_streamHCPtr, + const char *source, + char *dest, + int inputSize, + int maxOutputSize) +{ + if (maxOutputSize < LZ4_compressBound(inputSize)) + return LZ4_compressHC_continue_generic(LZ4_streamHCPtr, + source, dest, inputSize, maxOutputSize, limitedOutput); + else + return LZ4_compressHC_continue_generic(LZ4_streamHCPtr, + source, dest, inputSize, maxOutputSize, noLimit); +} +EXPORT_SYMBOL(LZ4_compress_HC_continue); + +/* dictionary saving */ + +int LZ4_saveDictHC( + LZ4_streamHC_t *LZ4_streamHCPtr, + char *safeBuffer, + int dictSize) +{ + LZ4HC_CCtx_internal *const streamPtr = &LZ4_streamHCPtr->internal_donotuse; + int const prefixSize = (int)(streamPtr->end + - (streamPtr->base + streamPtr->dictLimit)); + + if (dictSize > 64 * KB) + dictSize = 64 * KB; + if (dictSize < 4) + dictSize = 0; + if (dictSize > prefixSize) + dictSize = prefixSize; + + memmove(safeBuffer, streamPtr->end - dictSize, dictSize); -exit: - return ret; + { + U32 const endIndex = (U32)(streamPtr->end - streamPtr->base); + + streamPtr->end = (const BYTE *)safeBuffer + dictSize; + streamPtr->base = streamPtr->end - endIndex; + streamPtr->dictLimit = endIndex - dictSize; + streamPtr->lowLimit = endIndex - dictSize; + + if (streamPtr->nextToUpdate < streamPtr->dictLimit) + streamPtr->nextToUpdate = streamPtr->dictLimit; + } + return dictSize; +} +EXPORT_SYMBOL(LZ4_saveDictHC); + +/*-****************************** + * For backwards compatibility + ********************************/ +int lz4hc_compress(const unsigned char *src, size_t src_len, + unsigned char *dst, size_t *dst_len, void *wrkmem) +{ + *dst_len = LZ4_compress_HC(src, dst, src_len, + *dst_len, LZ4HC_DEFAULT_CLEVEL, wrkmem); + + /* + * Prior lz4hc_compress will return -1 in case of error + * and 0 on success + * while new LZ4_compress_HC + * returns 0 in case of error + * and the output length on success + */ + if (!*dst_len) + return -1; + else + return 0; } EXPORT_SYMBOL(lz4hc_compress); MODULE_LICENSE("Dual BSD/GPL"); -MODULE_DESCRIPTION("LZ4HC compressor"); +MODULE_DESCRIPTION("LZ4 HC compressor"); -- cgit v1.2.3 From 69c78423b8f439b077929410bdf8f88e7031b891 Mon Sep 17 00:00:00 2001 From: Sven Schmidt <4sschmid@informatik.uni-hamburg.de> Date: Fri, 24 Feb 2017 15:01:25 -0800 Subject: lib/lz4: remove back-compat wrappers Remove the functions introduced as wrappers for providing backwards compatibility to the prior LZ4 version. They're not needed anymore since there's no callers left. Link: http://lkml.kernel.org/r/1486321748-19085-6-git-send-email-4sschmid@informatik.uni-hamburg.de Signed-off-by: Sven Schmidt <4sschmid@informatik.uni-hamburg.de> Cc: Bongkyu Kim Cc: Rui Salvaterra Cc: Sergey Senozhatsky Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: David S. Miller Cc: Anton Vorontsov Cc: Colin Cross Cc: Kees Cook Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/lz4.h | 69 ------------------------------------------------ lib/lz4/lz4_compress.c | 22 --------------- lib/lz4/lz4_decompress.c | 42 ----------------------------- lib/lz4/lz4hc_compress.c | 23 ---------------- 4 files changed, 156 deletions(-) (limited to 'include') diff --git a/include/linux/lz4.h b/include/linux/lz4.h index 1b0f8ca0f9c8..394e3d9213b8 100644 --- a/include/linux/lz4.h +++ b/include/linux/lz4.h @@ -172,18 +172,6 @@ static inline int LZ4_compressBound(size_t isize) return LZ4_COMPRESSBOUND(isize); } -/** - * lz4_compressbound() - For backwards compatibility; see LZ4_compressBound - * @isize: Size of the input data - * - * Return: Max. size LZ4 may output in a "worst case" szenario - * (data not compressible) - */ -static inline int lz4_compressbound(size_t isize) -{ - return LZ4_COMPRESSBOUND(isize); -} - /** * LZ4_compress_default() - Compress data from source to dest * @source: source address of the original data @@ -257,20 +245,6 @@ int LZ4_compress_fast(const char *source, char *dest, int inputSize, int LZ4_compress_destSize(const char *source, char *dest, int *sourceSizePtr, int targetDestSize, void *wrkmem); -/* - * lz4_compress() - For backward compatibility, see LZ4_compress_default - * @src: source address of the original data - * @src_len: size of the original data - * @dst: output buffer address of the compressed data. This requires 'dst' - * of size LZ4_COMPRESSBOUND - * @dst_len: is the output size, which is returned after compress done - * @workmem: address of the working memory. - * - * Return: Success if return 0, Error if return < 0 - */ -int lz4_compress(const unsigned char *src, size_t src_len, unsigned char *dst, - size_t *dst_len, void *wrkmem); - /*-************************************************************************ * Decompression Functions **************************************************************************/ @@ -346,34 +320,6 @@ int LZ4_decompress_safe(const char *source, char *dest, int compressedSize, int LZ4_decompress_safe_partial(const char *source, char *dest, int compressedSize, int targetOutputSize, int maxDecompressedSize); -/* - * lz4_decompress_unknownoutputsize() - For backwards compatibility, - * see LZ4_decompress_safe - * @src: source address of the compressed data - * @src_len: is the input size, therefore the compressed size - * @dest: output buffer address of the decompressed data - * which must be already allocated - * @dest_len: is the max size of the destination buffer, which is - * returned with actual size of decompressed data after decompress done - * - * Return: Success if return 0, Error if return (< 0) - */ -int lz4_decompress_unknownoutputsize(const unsigned char *src, size_t src_len, - unsigned char *dest, size_t *dest_len); - -/** - * lz4_decompress() - For backwards cocmpatibility, see LZ4_decompress_fast - * @src: source address of the compressed data - * @src_len: is the input size, which is returned after decompress done - * @dest: output buffer address of the decompressed data, - * which must be already allocated - * @actual_dest_len: is the size of uncompressed data, supposing it's known - * - * Return: Success if return 0, Error if return (< 0) - */ -int lz4_decompress(const unsigned char *src, size_t *src_len, - unsigned char *dest, size_t actual_dest_len); - /*-************************************************************************ * LZ4 HC Compression **************************************************************************/ @@ -400,21 +346,6 @@ int lz4_decompress(const unsigned char *src, size_t *src_len, int LZ4_compress_HC(const char *src, char *dst, int srcSize, int dstCapacity, int compressionLevel, void *wrkmem); -/** - * lz4hc_compress() - For backwards compatibility, see LZ4_compress_HC - * @src: source address of the original data - * @src_len: size of the original data - * @dst: output buffer address of the compressed data. This requires 'dst' - * of size LZ4_COMPRESSBOUND. - * @dst_len: is the output size, which is returned after compress done - * @wrkmem: address of the working memory. - * This requires 'workmem' of size LZ4HC_MEM_COMPRESS. - * - * Return : Success if return 0, Error if return (< 0) - */ -int lz4hc_compress(const unsigned char *src, size_t src_len, unsigned char *dst, - size_t *dst_len, void *wrkmem); - /** * LZ4_resetStreamHC() - Init an allocated 'LZ4_streamHC_t' structure * @streamHCPtr: pointer to the 'LZ4_streamHC_t' structure diff --git a/lib/lz4/lz4_compress.c b/lib/lz4/lz4_compress.c index 53f313f05185..cc7b6d4cc7c7 100644 --- a/lib/lz4/lz4_compress.c +++ b/lib/lz4/lz4_compress.c @@ -936,27 +936,5 @@ int LZ4_compress_fast_continue(LZ4_stream_t *LZ4_stream, const char *source, } EXPORT_SYMBOL(LZ4_compress_fast_continue); -/*-****************************** - * For backwards compatibility - ********************************/ -int lz4_compress(const unsigned char *src, size_t src_len, unsigned char *dst, - size_t *dst_len, void *wrkmem) { - *dst_len = LZ4_compress_default(src, dst, src_len, - *dst_len, wrkmem); - - /* - * Prior lz4_compress will return -1 in case of error - * and 0 on success - * while new LZ4_compress_fast/default - * returns 0 in case of error - * and the output length on success - */ - if (!*dst_len) - return -1; - else - return 0; -} -EXPORT_SYMBOL(lz4_compress); - MODULE_LICENSE("Dual BSD/GPL"); MODULE_DESCRIPTION("LZ4 compressor"); diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c index b5e00a19a7b4..bd3574312b82 100644 --- a/lib/lz4/lz4_decompress.c +++ b/lib/lz4/lz4_decompress.c @@ -493,46 +493,6 @@ int LZ4_decompress_fast_usingDict(const char *source, char *dest, originalSize, 0, dictStart, dictSize); } -/*-****************************** - * For backwards compatibility - ********************************/ -int lz4_decompress_unknownoutputsize(const unsigned char *src, - size_t src_len, unsigned char *dest, size_t *dest_len) { - *dest_len = LZ4_decompress_safe(src, dest, - src_len, *dest_len); - - /* - * Prior lz4_decompress_unknownoutputsize will return - * 0 for success and a negative result for error - * new LZ4_decompress_safe returns - * - the length of data read on success - * - and also a negative result on error - * meaning when result > 0, we just return 0 here - */ - if (src_len > 0) - return 0; - else - return -1; -} - -int lz4_decompress(const unsigned char *src, size_t *src_len, - unsigned char *dest, size_t actual_dest_len) { - *src_len = LZ4_decompress_fast(src, dest, actual_dest_len); - - /* - * Prior lz4_decompress will return - * 0 for success and a negative result for error - * new LZ4_decompress_fast returns - * - the length of data read on success - * - and also a negative result on error - * meaning when result > 0, we just return 0 here - */ - if (*src_len > 0) - return 0; - else - return -1; -} - #ifndef STATIC EXPORT_SYMBOL(LZ4_decompress_safe); EXPORT_SYMBOL(LZ4_decompress_safe_partial); @@ -542,8 +502,6 @@ EXPORT_SYMBOL(LZ4_decompress_safe_continue); EXPORT_SYMBOL(LZ4_decompress_fast_continue); EXPORT_SYMBOL(LZ4_decompress_safe_usingDict); EXPORT_SYMBOL(LZ4_decompress_fast_usingDict); -EXPORT_SYMBOL(lz4_decompress_unknownoutputsize); -EXPORT_SYMBOL(lz4_decompress); MODULE_LICENSE("Dual BSD/GPL"); MODULE_DESCRIPTION("LZ4 decompressor"); diff --git a/lib/lz4/lz4hc_compress.c b/lib/lz4/lz4hc_compress.c index e1fbf1561e55..176f03b83e56 100644 --- a/lib/lz4/lz4hc_compress.c +++ b/lib/lz4/lz4hc_compress.c @@ -765,28 +765,5 @@ int LZ4_saveDictHC( } EXPORT_SYMBOL(LZ4_saveDictHC); -/*-****************************** - * For backwards compatibility - ********************************/ -int lz4hc_compress(const unsigned char *src, size_t src_len, - unsigned char *dst, size_t *dst_len, void *wrkmem) -{ - *dst_len = LZ4_compress_HC(src, dst, src_len, - *dst_len, LZ4HC_DEFAULT_CLEVEL, wrkmem); - - /* - * Prior lz4hc_compress will return -1 in case of error - * and 0 on success - * while new LZ4_compress_HC - * returns 0 in case of error - * and the output length on success - */ - if (!*dst_len) - return -1; - else - return 0; -} -EXPORT_SYMBOL(lz4hc_compress); - MODULE_LICENSE("Dual BSD/GPL"); MODULE_DESCRIPTION("LZ4 HC compressor"); -- cgit v1.2.3