From 1bde33e051233f0ed93a8bc67137016ab38c3d2d Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 2 Jun 2017 14:46:13 -0700 Subject: include/linux/gfp.h: fix ___GFP_NOLOCKDEP value Igor Stoppa has noticed that __GFP_NOLOCKDEP can use a lower bit. At the time commit 7e7844226f10 ("lockdep: allow to disable reclaim lockup detection") was written we still had __GFP_OTHER_NODE but I have removed it in commit 41b6167e8f74 ("mm: get rid of __GFP_OTHER_NODE") and forgot to lower the bit value. The current value is outside of __GFP_BITS_SHIFT so it cannot be used actually. Fixes: 7e7844226f10 ("lockdep: allow to disable reclaim lockup detection") Signed-off-by: Michal Hocko Reported-by: Igor Stoppa Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 2b1a44f5bdb6..a89d37e8b387 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -41,7 +41,7 @@ struct vm_area_struct; #define ___GFP_WRITE 0x800000u #define ___GFP_KSWAPD_RECLAIM 0x1000000u #ifdef CONFIG_LOCKDEP -#define ___GFP_NOLOCKDEP 0x4000000u +#define ___GFP_NOLOCKDEP 0x2000000u #else #define ___GFP_NOLOCKDEP 0 #endif -- cgit v1.2.3 From 60b0a8c3d2480f3b57282b47b7cae7ee71c48635 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Fri, 2 Jun 2017 14:46:16 -0700 Subject: frv: declare jiffies to be located in the .data section Commit 7c30f352c852 ("jiffies.h: declare jiffies and jiffies_64 with ____cacheline_aligned_in_smp") removed a section specification from the jiffies declaration that caused conflicts on some platforms. Unfortunately this change broke the build for frv: kernel/built-in.o: In function `__do_softirq': (.text+0x6460): relocation truncated to fit: R_FRV_GPREL12 against symbol `jiffies' defined in *ABS* section in .tmp_vmlinux1 kernel/built-in.o: In function `__do_softirq': (.text+0x6574): relocation truncated to fit: R_FRV_GPREL12 against symbol `jiffies' defined in *ABS* section in .tmp_vmlinux1 kernel/built-in.o: In function `pwq_activate_delayed_work': workqueue.c:(.text+0x15b9c): relocation truncated to fit: R_FRV_GPREL12 against symbol `jiffies' defined in *ABS* section in .tmp_vmlinux1 ... Add __jiffy_arch_data to the declaration of jiffies and use it on frv to include the section specification. For all other platforms __jiffy_arch_data (currently) has no effect. Fixes: 7c30f352c852 ("jiffies.h: declare jiffies and jiffies_64 with ____cacheline_aligned_in_smp") Link: http://lkml.kernel.org/r/20170516221333.177280-1-mka@chromium.org Signed-off-by: Matthias Kaehlcke Reported-by: Guenter Roeck Tested-by: Guenter Roeck Reviewed-by: David Howells Cc: Sudip Mukherjee Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/frv/include/asm/timex.h | 6 ++++++ include/linux/jiffies.h | 6 +++++- 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/arch/frv/include/asm/timex.h b/arch/frv/include/asm/timex.h index a89bddefdacf..139093fab326 100644 --- a/arch/frv/include/asm/timex.h +++ b/arch/frv/include/asm/timex.h @@ -16,5 +16,11 @@ static inline cycles_t get_cycles(void) #define vxtime_lock() do {} while (0) #define vxtime_unlock() do {} while (0) +/* This attribute is used in include/linux/jiffies.h alongside with + * __cacheline_aligned_in_smp. It is assumed that __cacheline_aligned_in_smp + * for frv does not contain another section specification. + */ +#define __jiffy_arch_data __attribute__((__section__(".data"))) + #endif diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 36872fbb815d..734377ad42e9 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -64,13 +64,17 @@ extern int register_refined_jiffies(long clock_tick_rate); /* TICK_USEC is the time between ticks in usec assuming fake USER_HZ */ #define TICK_USEC ((1000000UL + USER_HZ/2) / USER_HZ) +#ifndef __jiffy_arch_data +#define __jiffy_arch_data +#endif + /* * The 64-bit value is not atomic - you MUST NOT read it * without sampling the sequence number in jiffies_lock. * get_jiffies_64() will do this for you as appropriate. */ extern u64 __cacheline_aligned_in_smp jiffies_64; -extern unsigned long volatile __cacheline_aligned_in_smp jiffies; +extern unsigned long volatile __cacheline_aligned_in_smp __jiffy_arch_data jiffies; #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void); -- cgit v1.2.3 From 9a291a7c9428155e8e623e4a3989f8be47134df5 Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 2 Jun 2017 14:46:46 -0700 Subject: mm/hugetlb: report -EHWPOISON not -EFAULT when FOLL_HWPOISON is specified KVM uses get_user_pages() to resolve its stage2 faults. KVM sets the FOLL_HWPOISON flag causing faultin_page() to return -EHWPOISON when it finds a VM_FAULT_HWPOISON. KVM handles these hwpoison pages as a special case. (check_user_page_hwpoison()) When huge pages are involved, this doesn't work so well. get_user_pages() calls follow_hugetlb_page(), which stops early if it receives VM_FAULT_HWPOISON from hugetlb_fault(), eventually returning -EFAULT to the caller. The step to map this to -EHWPOISON based on the FOLL_ flags is missing. The hwpoison special case is skipped, and -EFAULT is returned to user-space, causing Qemu or kvmtool to exit. Instead, move this VM_FAULT_ to errno mapping code into a header file and use it from faultin_page() and follow_hugetlb_page(). With this, KVM works as expected. This isn't a problem for arm64 today as we haven't enabled MEMORY_FAILURE, but I can't see any reason this doesn't happen on x86 too, so I think this should be a fix. This doesn't apply earlier than stable's v4.11.1 due to all sorts of cleanup. [james.morse@arm.com: add vm_fault_to_errno() call to faultin_page()] suggested. Link: http://lkml.kernel.org/r/20170525171035.16359-1-james.morse@arm.com [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/20170524160900.28786-1-james.morse@arm.com Signed-off-by: James Morse Acked-by: Punit Agrawal Acked-by: Naoya Horiguchi Cc: "Kirill A . Shutemov" Cc: [4.11.1+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 11 +++++++++++ mm/gup.c | 20 ++++++++------------ mm/hugetlb.c | 5 +++++ 3 files changed, 24 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7cb17c6b97de..b892e95d4929 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2327,6 +2327,17 @@ static inline struct page *follow_page(struct vm_area_struct *vma, #define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */ #define FOLL_COW 0x4000 /* internal GUP flag */ +static inline int vm_fault_to_errno(int vm_fault, int foll_flags) +{ + if (vm_fault & VM_FAULT_OOM) + return -ENOMEM; + if (vm_fault & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) + return (foll_flags & FOLL_HWPOISON) ? -EHWPOISON : -EFAULT; + if (vm_fault & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) + return -EFAULT; + return 0; +} + typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, void *data); extern int apply_to_page_range(struct mm_struct *mm, unsigned long address, diff --git a/mm/gup.c b/mm/gup.c index d9e6fddcc51f..b3c7214d710d 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -407,12 +407,10 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, ret = handle_mm_fault(vma, address, fault_flags); if (ret & VM_FAULT_ERROR) { - if (ret & VM_FAULT_OOM) - return -ENOMEM; - if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) - return *flags & FOLL_HWPOISON ? -EHWPOISON : -EFAULT; - if (ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) - return -EFAULT; + int err = vm_fault_to_errno(ret, *flags); + + if (err) + return err; BUG(); } @@ -723,12 +721,10 @@ retry: ret = handle_mm_fault(vma, address, fault_flags); major |= ret & VM_FAULT_MAJOR; if (ret & VM_FAULT_ERROR) { - if (ret & VM_FAULT_OOM) - return -ENOMEM; - if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) - return -EHWPOISON; - if (ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) - return -EFAULT; + int err = vm_fault_to_errno(ret, 0); + + if (err) + return err; BUG(); } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e5828875f7bb..3eedb187e549 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4170,6 +4170,11 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, } ret = hugetlb_fault(mm, vma, vaddr, fault_flags); if (ret & VM_FAULT_ERROR) { + int err = vm_fault_to_errno(ret, flags); + + if (err) + return err; + remainder = 0; break; } -- cgit v1.2.3 From 864b9a393dcb5aed09b8fd31b9bbda0fdda99374 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 2 Jun 2017 14:46:49 -0700 Subject: mm: consider memblock reservations for deferred memory initialization sizing We have seen an early OOM killer invocation on ppc64 systems with crashkernel=4096M: kthreadd invoked oom-killer: gfp_mask=0x16040c0(GFP_KERNEL|__GFP_COMP|__GFP_NOTRACK), nodemask=7, order=0, oom_score_adj=0 kthreadd cpuset=/ mems_allowed=7 CPU: 0 PID: 2 Comm: kthreadd Not tainted 4.4.68-1.gd7fe927-default #1 Call Trace: dump_stack+0xb0/0xf0 (unreliable) dump_header+0xb0/0x258 out_of_memory+0x5f0/0x640 __alloc_pages_nodemask+0xa8c/0xc80 kmem_getpages+0x84/0x1a0 fallback_alloc+0x2a4/0x320 kmem_cache_alloc_node+0xc0/0x2e0 copy_process.isra.25+0x260/0x1b30 _do_fork+0x94/0x470 kernel_thread+0x48/0x60 kthreadd+0x264/0x330 ret_from_kernel_thread+0x5c/0xa4 Mem-Info: active_anon:0 inactive_anon:0 isolated_anon:0 active_file:0 inactive_file:0 isolated_file:0 unevictable:0 dirty:0 writeback:0 unstable:0 slab_reclaimable:5 slab_unreclaimable:73 mapped:0 shmem:0 pagetables:0 bounce:0 free:0 free_pcp:0 free_cma:0 Node 7 DMA free:0kB min:0kB low:0kB high:0kB active_anon:0kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:52428800kB managed:110016kB mlocked:0kB dirty:0kB writeback:0kB mapped:0kB shmem:0kB slab_reclaimable:320kB slab_unreclaimable:4672kB kernel_stack:1152kB pagetables:0kB unstable:0kB bounce:0kB free_pcp:0kB local_pcp:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? yes lowmem_reserve[]: 0 0 0 0 Node 7 DMA: 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB 0*8192kB 0*16384kB = 0kB 0 total pagecache pages 0 pages in swap cache Swap cache stats: add 0, delete 0, find 0/0 Free swap = 0kB Total swap = 0kB 819200 pages RAM 0 pages HighMem/MovableOnly 817481 pages reserved 0 pages cma reserved 0 pages hwpoisoned the reason is that the managed memory is too low (only 110MB) while the rest of the the 50GB is still waiting for the deferred intialization to be done. update_defer_init estimates the initial memoty to initialize to 2GB at least but it doesn't consider any memory allocated in that range. In this particular case we've had Reserving 4096MB of memory at 128MB for crashkernel (System RAM: 51200MB) so the low 2GB is mostly depleted. Fix this by considering memblock allocations in the initial static initialization estimation. Move the max_initialise to reset_deferred_meminit and implement a simple memblock_reserved_memory helper which iterates all reserved blocks and sums the size of all that start below the given address. The cumulative size is than added on top of the initial estimation. This is still not ideal because reset_deferred_meminit doesn't consider holes and so reservation might be above the initial estimation whihch we ignore but let's make the logic simpler until we really need to handle more complicated cases. Fixes: 3a80a7fa7989 ("mm: meminit: initialise a subset of struct pages if CONFIG_DEFERRED_STRUCT_PAGE_INIT is set") Link: http://lkml.kernel.org/r/20170531104010.GI27783@dhcp22.suse.cz Signed-off-by: Michal Hocko Acked-by: Mel Gorman Tested-by: Srikar Dronamraju Cc: [4.2+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 8 ++++++++ include/linux/mmzone.h | 1 + mm/memblock.c | 23 +++++++++++++++++++++++ mm/page_alloc.c | 33 ++++++++++++++++++++++----------- 4 files changed, 54 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 4ce24a376262..8098695e5d8d 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -425,12 +425,20 @@ static inline void early_memtest(phys_addr_t start, phys_addr_t end) } #endif +extern unsigned long memblock_reserved_memory_within(phys_addr_t start_addr, + phys_addr_t end_addr); #else static inline phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align) { return 0; } +static inline unsigned long memblock_reserved_memory_within(phys_addr_t start_addr, + phys_addr_t end_addr) +{ + return 0; +} + #endif /* CONFIG_HAVE_MEMBLOCK */ #endif /* __KERNEL__ */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index ebaccd4e7d8c..ef6a13b7bd3e 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -678,6 +678,7 @@ typedef struct pglist_data { * is the first PFN that needs to be initialised. */ unsigned long first_deferred_pfn; + unsigned long static_init_size; #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE diff --git a/mm/memblock.c b/mm/memblock.c index b049c9b2dba8..7b8a5db76a2f 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1739,6 +1739,29 @@ static void __init_memblock memblock_dump(struct memblock_type *type) } } +extern unsigned long __init_memblock +memblock_reserved_memory_within(phys_addr_t start_addr, phys_addr_t end_addr) +{ + struct memblock_region *rgn; + unsigned long size = 0; + int idx; + + for_each_memblock_type((&memblock.reserved), rgn) { + phys_addr_t start, end; + + if (rgn->base + rgn->size < start_addr) + continue; + if (rgn->base > end_addr) + continue; + + start = rgn->base; + end = start + rgn->size; + size += end - start; + } + + return size; +} + void __init_memblock __memblock_dump_all(void) { pr_info("MEMBLOCK configuration:\n"); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b7a6f583a373..2302f250d6b1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -292,6 +292,26 @@ int page_group_by_mobility_disabled __read_mostly; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT static inline void reset_deferred_meminit(pg_data_t *pgdat) { + unsigned long max_initialise; + unsigned long reserved_lowmem; + + /* + * Initialise at least 2G of a node but also take into account that + * two large system hashes that can take up 1GB for 0.25TB/node. + */ + max_initialise = max(2UL << (30 - PAGE_SHIFT), + (pgdat->node_spanned_pages >> 8)); + + /* + * Compensate the all the memblock reservations (e.g. crash kernel) + * from the initial estimation to make sure we will initialize enough + * memory to boot. + */ + reserved_lowmem = memblock_reserved_memory_within(pgdat->node_start_pfn, + pgdat->node_start_pfn + max_initialise); + max_initialise += reserved_lowmem; + + pgdat->static_init_size = min(max_initialise, pgdat->node_spanned_pages); pgdat->first_deferred_pfn = ULONG_MAX; } @@ -314,20 +334,11 @@ static inline bool update_defer_init(pg_data_t *pgdat, unsigned long pfn, unsigned long zone_end, unsigned long *nr_initialised) { - unsigned long max_initialise; - /* Always populate low zones for address-contrained allocations */ if (zone_end < pgdat_end_pfn(pgdat)) return true; - /* - * Initialise at least 2G of a node but also take into account that - * two large system hashes that can take up 1GB for 0.25TB/node. - */ - max_initialise = max(2UL << (30 - PAGE_SHIFT), - (pgdat->node_spanned_pages >> 8)); - (*nr_initialised)++; - if ((*nr_initialised > max_initialise) && + if ((*nr_initialised > pgdat->static_init_size) && (pfn & (PAGES_PER_SECTION - 1)) == 0) { pgdat->first_deferred_pfn = pfn; return false; @@ -6138,7 +6149,6 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, /* pg_data_t should be reset to zero when it's allocated */ WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx); - reset_deferred_meminit(pgdat); pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; pgdat->per_cpu_nodestats = NULL; @@ -6160,6 +6170,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, (unsigned long)pgdat->node_mem_map); #endif + reset_deferred_meminit(pgdat); free_area_init_core(pgdat); } -- cgit v1.2.3