From 5ea3b1b2f8ad9162684431ce6188102ca4c64b7a Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Wed, 4 Jun 2014 16:06:54 -0700 Subject: cma: add placement specifier for "cma=" kernel parameter Currently, "cma=" kernel parameter is used to specify the size of CMA, but we can't specify where it is located. We want to locate CMA below 4GB for devices only supporting 32-bit addressing on 64-bit systems without iommu. This enables to specify the placement of CMA by extending "cma=" kernel parameter. Examples: 1. locate 64MB CMA below 4GB by "cma=64M@0-4G" 2. locate 64MB CMA exact at 512MB by "cma=64M@512M" Note that the DMA contiguous memory allocator on x86 assumes that page_address() works for the pages to allocate. So this change requires to limit end address of contiguous memory area upto max_pfn_mapped to prevent from locating it on highmem area by the argument of dma_contiguous_reserve(). Signed-off-by: Akinobu Mita Cc: Marek Szyprowski Cc: Konrad Rzeszutek Wilk Cc: David Woodhouse Cc: Don Dutile Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Andi Kleen Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kernel-parameters.txt | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index af55e13ace8f..adea3a22fa00 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -630,8 +630,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted. Also note the kernel might malfunction if you disable some critical bits. - cma=nn[MG] [ARM,KNL] - Sets the size of kernel global memory area for contiguous + cma=nn[MG]@[start[MG][-end[MG]]] + [ARM,X86,KNL] + Sets the size of kernel global memory area for + contiguous memory allocations and optionally the + placement constraint by the physical address range of memory allocations. For more information, see include/linux/dma-contiguous.h -- cgit v1.2.3 From 3dae7fec5e884a4e72e5416db0894de66f586201 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 4 Jun 2014 16:07:01 -0700 Subject: mm: memcontrol: remove hierarchy restrictions for swappiness and oom_control Per-memcg swappiness and oom killing can currently not be tweaked on a memcg that is part of a hierarchy, but not the root of that hierarchy. Users have complained that they can't configure this when they turned on hierarchy mode. In fact, with hierarchy mode becoming the default, this restriction disables the tunables entirely. But there is no good reason for this restriction. The settings for swappiness and OOM killing are taken from whatever memcg whose limit triggered reclaim and OOM invocation, regardless of its position in the hierarchy tree. Allow setting swappiness on any group. The knob on the root memcg already reads the global VM swappiness, make it writable as well. Allow disabling the OOM killer on any non-root memcg. Signed-off-by: Johannes Weiner Cc: Michal Hocko Cc: Tejun Heo Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/memory.txt | 11 ++++------- mm/memcontrol.c | 29 +++++++---------------------- 2 files changed, 11 insertions(+), 29 deletions(-) (limited to 'Documentation') diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index 2622115276aa..1829c65f8371 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt @@ -535,17 +535,15 @@ Note: 5.3 swappiness -Similar to /proc/sys/vm/swappiness, but affecting a hierarchy of groups only. +Similar to /proc/sys/vm/swappiness, but only affecting reclaim that is +triggered by this cgroup's hard limit. The tunable in the root cgroup +corresponds to the global swappiness setting. + Please note that unlike the global swappiness, memcg knob set to 0 really prevents from any swapping even if there is a swap storage available. This might lead to memcg OOM killer if there are no file pages to reclaim. -Following cgroups' swappiness can't be changed. -- root cgroup (uses /proc/sys/vm/swappiness). -- a cgroup which uses hierarchy and it has other cgroup(s) below it. -- a cgroup which uses hierarchy and not the root of hierarchy. - 5.4 failcnt A memory cgroup provides memory.failcnt and memory.memsw.failcnt files. @@ -754,7 +752,6 @@ You can disable the OOM-killer by writing "1" to memory.oom_control file, as: #echo 1 > memory.oom_control -This operation is only allowed to the top cgroup of a sub-hierarchy. If OOM-killer is disabled, tasks under cgroup will hang/sleep in memory cgroup's OOM-waitqueue when they request accountable memory. diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7bab1de50f48..20f47d9cd8b2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5444,22 +5444,14 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css)); - if (val > 100 || !parent) + if (val > 100) return -EINVAL; - mutex_lock(&memcg_create_mutex); - - /* If under hierarchy, only empty-root can set this value */ - if ((parent->use_hierarchy) || memcg_has_children(memcg)) { - mutex_unlock(&memcg_create_mutex); - return -EINVAL; - } - - memcg->swappiness = val; - - mutex_unlock(&memcg_create_mutex); + if (css_parent(css)) + memcg->swappiness = val; + else + vm_swappiness = val; return 0; } @@ -5791,22 +5783,15 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css)); /* cannot set to root cgroup and only 0 and 1 are allowed */ - if (!parent || !((val == 0) || (val == 1))) + if (!css_parent(css) || !((val == 0) || (val == 1))) return -EINVAL; - mutex_lock(&memcg_create_mutex); - /* oom-kill-disable is a flag for subhierarchy. */ - if ((parent->use_hierarchy) || memcg_has_children(memcg)) { - mutex_unlock(&memcg_create_mutex); - return -EINVAL; - } memcg->oom_kill_disable = val; if (!val) memcg_oom_recover(memcg); - mutex_unlock(&memcg_create_mutex); + return 0; } -- cgit v1.2.3 From 56a3c655a3d31cb1afef25b530b5ef6a1e7ddefd Mon Sep 17 00:00:00 2001 From: Li Zhong Date: Wed, 4 Jun 2014 16:07:03 -0700 Subject: memory-hotplug: update documentation to hide information about SECTIONS and remove end_phys_index Seems we all agree that information about SECTION, e.g. section size, sections per memory block should be kept as kernel internals, and not exposed to userspace. This patch updates Documentation/memory-hotplug.txt to refer to memory blocks instead of memory sections where appropriate and added a paragraph to explain that memory blocks are made of memory sections. The documentation update is mostly provided by Nathan. Also, as end_phys_index in code is actually not the end section id, but the end memory block id, which should always be the same as phys_index. So it is removed here. Signed-off-by: Li Zhong Reviewed-by: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/memory-hotplug.txt | 125 +++++++++++++++++++-------------------- drivers/base/memory.c | 12 ---- 2 files changed, 61 insertions(+), 76 deletions(-) (limited to 'Documentation') diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt index 58340d50f8a6..f304edb8fbe7 100644 --- a/Documentation/memory-hotplug.txt +++ b/Documentation/memory-hotplug.txt @@ -88,16 +88,21 @@ phase by hand. 1.3. Unit of Memory online/offline operation ------------ -Memory hotplug uses SPARSEMEM memory model. SPARSEMEM divides the whole memory -into chunks of the same size. The chunk is called a "section". The size of -a section is architecture dependent. For example, power uses 16MiB, ia64 uses -1GiB. The unit of online/offline operation is "one section". (see Section 3.) +Memory hotplug uses SPARSEMEM memory model which allows memory to be divided +into chunks of the same size. These chunks are called "sections". The size of +a memory section is architecture dependent. For example, power uses 16MiB, ia64 +uses 1GiB. -To determine the size of sections, please read this file: +Memory sections are combined into chunks referred to as "memory blocks". The +size of a memory block is architecture dependent and represents the logical +unit upon which memory online/offline operations are to be performed. The +default size of a memory block is the same as memory section size unless an +architecture specifies otherwise. (see Section 3.) + +To determine the size (in bytes) of a memory block please read this file: /sys/devices/system/memory/block_size_bytes -This file shows the size of sections in byte. ----------------------- 2. Kernel Configuration @@ -123,42 +128,35 @@ config options. (CONFIG_ACPI_CONTAINER). This option can be kernel module too. + -------------------------------- -4 sysfs files for memory hotplug +3 sysfs files for memory hotplug -------------------------------- -All sections have their device information in sysfs. Each section is part of -a memory block under /sys/devices/system/memory as +All memory blocks have their device information in sysfs. Each memory block +is described under /sys/devices/system/memory as /sys/devices/system/memory/memoryXXX -(XXX is the section id.) +(XXX is the memory block id.) -Now, XXX is defined as (start_address_of_section / section_size) of the first -section contained in the memory block. The files 'phys_index' and -'end_phys_index' under each directory report the beginning and end section id's -for the memory block covered by the sysfs directory. It is expected that all +For the memory block covered by the sysfs directory. It is expected that all memory sections in this range are present and no memory holes exist in the range. Currently there is no way to determine if there is a memory hole, but the existence of one should not affect the hotplug capabilities of the memory block. -For example, assume 1GiB section size. A device for a memory starting at +For example, assume 1GiB memory block size. A device for a memory starting at 0x100000000 is /sys/device/system/memory/memory4 (0x100000000 / 1Gib = 4) This device covers address range [0x100000000 ... 0x140000000) -Under each section, you can see 4 or 5 files, the end_phys_index file being -a recent addition and not present on older kernels. +Under each memory block, you can see 4 files: -/sys/devices/system/memory/memoryXXX/start_phys_index -/sys/devices/system/memory/memoryXXX/end_phys_index +/sys/devices/system/memory/memoryXXX/phys_index /sys/devices/system/memory/memoryXXX/phys_device /sys/devices/system/memory/memoryXXX/state /sys/devices/system/memory/memoryXXX/removable -'phys_index' : read-only and contains section id of the first section - in the memory block, same as XXX. -'end_phys_index' : read-only and contains section id of the last section - in the memory block. +'phys_index' : read-only and contains memory block id, same as XXX. 'state' : read-write at read: contains online/offline state of memory. at write: user can specify "online_kernel", @@ -185,6 +183,7 @@ For example: A backlink will also be created: /sys/devices/system/memory/memory9/node0 -> ../../node/node0 + -------------------------------- 4. Physical memory hot-add phase -------------------------------- @@ -227,11 +226,10 @@ You can tell the physical address of new memory to the kernel by % echo start_address_of_new_memory > /sys/devices/system/memory/probe -Then, [start_address_of_new_memory, start_address_of_new_memory + section_size) -memory range is hot-added. In this case, hotplug script is not called (in -current implementation). You'll have to online memory by yourself. -Please see "How to online memory" in this text. - +Then, [start_address_of_new_memory, start_address_of_new_memory + +memory_block_size] memory range is hot-added. In this case, hotplug script is +not called (in current implementation). You'll have to online memory by +yourself. Please see "How to online memory" in this text. ------------------------------ @@ -240,36 +238,36 @@ Please see "How to online memory" in this text. 5.1. State of memory ------------ -To see (online/offline) state of memory section, read 'state' file. +To see (online/offline) state of a memory block, read 'state' file. % cat /sys/device/system/memory/memoryXXX/state -If the memory section is online, you'll read "online". -If the memory section is offline, you'll read "offline". +If the memory block is online, you'll read "online". +If the memory block is offline, you'll read "offline". 5.2. How to online memory ------------ Even if the memory is hot-added, it is not at ready-to-use state. -For using newly added memory, you have to "online" the memory section. +For using newly added memory, you have to "online" the memory block. -For onlining, you have to write "online" to the section's state file as: +For onlining, you have to write "online" to the memory block's state file as: % echo online > /sys/devices/system/memory/memoryXXX/state -This onlining will not change the ZONE type of the target memory section, -If the memory section is in ZONE_NORMAL, you can change it to ZONE_MOVABLE: +This onlining will not change the ZONE type of the target memory block, +If the memory block is in ZONE_NORMAL, you can change it to ZONE_MOVABLE: % echo online_movable > /sys/devices/system/memory/memoryXXX/state -(NOTE: current limit: this memory section must be adjacent to ZONE_MOVABLE) +(NOTE: current limit: this memory block must be adjacent to ZONE_MOVABLE) -And if the memory section is in ZONE_MOVABLE, you can change it to ZONE_NORMAL: +And if the memory block is in ZONE_MOVABLE, you can change it to ZONE_NORMAL: % echo online_kernel > /sys/devices/system/memory/memoryXXX/state -(NOTE: current limit: this memory section must be adjacent to ZONE_NORMAL) +(NOTE: current limit: this memory block must be adjacent to ZONE_NORMAL) -After this, section memoryXXX's state will be 'online' and the amount of +After this, memory block XXX's state will be 'online' and the amount of available memory will be increased. Currently, newly added memory is added as ZONE_NORMAL (for powerpc, ZONE_DMA). @@ -284,22 +282,22 @@ This may be changed in future. 6.1 Memory offline and ZONE_MOVABLE ------------ Memory offlining is more complicated than memory online. Because memory offline -has to make the whole memory section be unused, memory offline can fail if -the section includes memory which cannot be freed. +has to make the whole memory block be unused, memory offline can fail if +the memory block includes memory which cannot be freed. In general, memory offline can use 2 techniques. -(1) reclaim and free all memory in the section. -(2) migrate all pages in the section. +(1) reclaim and free all memory in the memory block. +(2) migrate all pages in the memory block. In the current implementation, Linux's memory offline uses method (2), freeing -all pages in the section by page migration. But not all pages are +all pages in the memory block by page migration. But not all pages are migratable. Under current Linux, migratable pages are anonymous pages and -page caches. For offlining a section by migration, the kernel has to guarantee -that the section contains only migratable pages. +page caches. For offlining a memory block by migration, the kernel has to +guarantee that the memory block contains only migratable pages. -Now, a boot option for making a section which consists of migratable pages is -supported. By specifying "kernelcore=" or "movablecore=" boot option, you can +Now, a boot option for making a memory block which consists of migratable pages +is supported. By specifying "kernelcore=" or "movablecore=" boot option, you can create ZONE_MOVABLE...a zone which is just used for movable pages. (See also Documentation/kernel-parameters.txt) @@ -315,28 +313,27 @@ creates ZONE_MOVABLE as following. Size of memory for movable pages (for offline) is ZZZZ. -Note) Unfortunately, there is no information to show which section belongs +Note: Unfortunately, there is no information to show which memory block belongs to ZONE_MOVABLE. This is TBD. 6.2. How to offline memory ------------ -You can offline a section by using the same sysfs interface that was used in -memory onlining. +You can offline a memory block by using the same sysfs interface that was used +in memory onlining. % echo offline > /sys/devices/system/memory/memoryXXX/state -If offline succeeds, the state of the memory section is changed to be "offline". +If offline succeeds, the state of the memory block is changed to be "offline". If it fails, some error core (like -EBUSY) will be returned by the kernel. -Even if a section does not belong to ZONE_MOVABLE, you can try to offline it. -If it doesn't contain 'unmovable' memory, you'll get success. +Even if a memory block does not belong to ZONE_MOVABLE, you can try to offline +it. If it doesn't contain 'unmovable' memory, you'll get success. -A section under ZONE_MOVABLE is considered to be able to be offlined easily. -But under some busy state, it may return -EBUSY. Even if a memory section -cannot be offlined due to -EBUSY, you can retry offlining it and may be able to -offline it (or not). -(For example, a page is referred to by some kernel internal call and released - soon.) +A memory block under ZONE_MOVABLE is considered to be able to be offlined +easily. But under some busy state, it may return -EBUSY. Even if a memory +block cannot be offlined due to -EBUSY, you can retry offlining it and may be +able to offline it (or not). (For example, a page is referred to by some kernel +internal call and released soon.) Consideration: Memory hotplug's design direction is to make the possibility of memory offlining @@ -373,11 +370,11 @@ MEMORY_GOING_OFFLINE Generated to begin the process of offlining memory. Allocations are no longer possible from the memory but some of the memory to be offlined is still in use. The callback can be used to free memory known to a - subsystem from the indicated memory section. + subsystem from the indicated memory block. MEMORY_CANCEL_OFFLINE Generated if MEMORY_GOING_OFFLINE fails. Memory is available again from - the section that we attempted to offline. + the memory block that we attempted to offline. MEMORY_OFFLINE Generated after offlining memory is complete. @@ -413,8 +410,8 @@ node if necessary. -------------- - allowing memory hot-add to ZONE_MOVABLE. maybe we need some switch like sysctl or new control file. - - showing memory section and physical device relationship. - - showing memory section is under ZONE_MOVABLE or not + - showing memory block and physical device relationship. + - showing memory block is under ZONE_MOVABLE or not - test and make it better memory offlining. - support HugeTLB page migration and offlining. - memmap removing at memory offline. diff --git a/drivers/base/memory.c b/drivers/base/memory.c index bece691cb5d9..89f752dd8465 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -118,16 +118,6 @@ static ssize_t show_mem_start_phys_index(struct device *dev, return sprintf(buf, "%08lx\n", phys_index); } -static ssize_t show_mem_end_phys_index(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct memory_block *mem = to_memory_block(dev); - unsigned long phys_index; - - phys_index = mem->end_section_nr / sections_per_block; - return sprintf(buf, "%08lx\n", phys_index); -} - /* * Show whether the section of memory is likely to be hot-removable */ @@ -384,7 +374,6 @@ static ssize_t show_phys_device(struct device *dev, } static DEVICE_ATTR(phys_index, 0444, show_mem_start_phys_index, NULL); -static DEVICE_ATTR(end_phys_index, 0444, show_mem_end_phys_index, NULL); static DEVICE_ATTR(state, 0644, show_mem_state, store_mem_state); static DEVICE_ATTR(phys_device, 0444, show_phys_device, NULL); static DEVICE_ATTR(removable, 0444, show_mem_removable, NULL); @@ -529,7 +518,6 @@ struct memory_block *find_memory_block(struct mem_section *section) static struct attribute *memory_memblk_attrs[] = { &dev_attr_phys_index.attr, - &dev_attr_end_phys_index.attr, &dev_attr_state.attr, &dev_attr_phys_device.attr, &dev_attr_removable.attr, -- cgit v1.2.3 From 4f9b16a64753d0bb607454347036dc997fd03b82 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:07:14 -0700 Subject: mm: disable zone_reclaim_mode by default When it was introduced, zone_reclaim_mode made sense as NUMA distances punished and workloads were generally partitioned to fit into a NUMA node. NUMA machines are now common but few of the workloads are NUMA-aware and it's routine to see major performance degradation due to zone_reclaim_mode being enabled but relatively few can identify the problem. Those that require zone_reclaim_mode are likely to be able to detect when it needs to be enabled and tune appropriately so lets have a sensible default for the bulk of users. This patch (of 2): zone_reclaim_mode causes processes to prefer reclaiming memory from local node instead of spilling over to other nodes. This made sense initially when NUMA machines were almost exclusively HPC and the workload was partitioned into nodes. The NUMA penalties were sufficiently high to justify reclaiming the memory. On current machines and workloads it is often the case that zone_reclaim_mode destroys performance but not all users know how to detect this. Favour the common case and disable it by default. Users that are sophisticated enough to know they need zone_reclaim_mode will detect it. Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Reviewed-by: Zhang Yanfei Acked-by: Michal Hocko Reviewed-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 17 +++++++++-------- arch/ia64/include/asm/topology.h | 3 ++- arch/powerpc/include/asm/topology.h | 8 ++------ include/linux/topology.h | 3 ++- mm/page_alloc.c | 2 -- 5 files changed, 15 insertions(+), 18 deletions(-) (limited to 'Documentation') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index dd9d0e33b443..5b6da0fb5fbf 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -772,16 +772,17 @@ This is value ORed together of 2 = Zone reclaim writes dirty pages out 4 = Zone reclaim swaps pages -zone_reclaim_mode is set during bootup to 1 if it is determined that pages -from remote zones will cause a measurable performance reduction. The -page allocator will then reclaim easily reusable pages (those page -cache pages that are currently not used) before allocating off node pages. - -It may be beneficial to switch off zone reclaim if the system is -used for a file server and all of memory should be used for caching files -from disk. In that case the caching effect is more important than +zone_reclaim_mode is disabled by default. For file servers or workloads +that benefit from having their data cached, zone_reclaim_mode should be +left disabled as the caching effect is likely to be more important than data locality. +zone_reclaim may be enabled if it's known that the workload is partitioned +such that each partition fits within a NUMA node and that accessing remote +memory would cause a measurable performance reduction. The page allocator +will then reclaim easily reusable pages (those page cache pages that are +currently not used) before allocating off node pages. + Allowing zone reclaim to write out pages stops processes that are writing large amounts of data from dirtying pages on other nodes. Zone reclaim will write out dirty pages if a zone fills up and so effectively diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h index 3202aa74e0d6..6437ca21f61b 100644 --- a/arch/ia64/include/asm/topology.h +++ b/arch/ia64/include/asm/topology.h @@ -21,7 +21,8 @@ #define PENALTY_FOR_NODE_WITH_CPUS 255 /* - * Distance above which we begin to use zone reclaim + * Nodes within this distance are eligible for reclaim by zone_reclaim() when + * zone_reclaim_mode is enabled. */ #define RECLAIM_DISTANCE 15 diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index c9202151079f..6c8a8c5a37a1 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -9,12 +9,8 @@ struct device_node; #ifdef CONFIG_NUMA /* - * Before going off node we want the VM to try and reclaim from the local - * node. It does this if the remote distance is larger than RECLAIM_DISTANCE. - * With the default REMOTE_DISTANCE of 20 and the default RECLAIM_DISTANCE of - * 20, we never reclaim and go off node straight away. - * - * To fix this we choose a smaller value of RECLAIM_DISTANCE. + * If zone_reclaim_mode is enabled, a RECLAIM_DISTANCE of 10 will mean that + * all zones on all nodes will be eligible for zone_reclaim(). */ #define RECLAIM_DISTANCE 10 diff --git a/include/linux/topology.h b/include/linux/topology.h index 973671ff9e7d..dda6ee521e74 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -58,7 +58,8 @@ int arch_update_cpu_topology(void); /* * If the distance between nodes in a system is larger than RECLAIM_DISTANCE * (in whatever arch specific measurement units returned by node_distance()) - * then switch on zone reclaim on boot. + * and zone_reclaim_mode is enabled then the VM will only call zone_reclaim() + * on nodes within this distance. */ #define RECLAIM_DISTANCE 30 #endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7cfdcd808f52..dfe954fbb48a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1860,8 +1860,6 @@ static void __paginginit init_zone_allows_reclaim(int nid) for_each_node_state(i, N_MEMORY) if (node_distance(nid, i) <= RECLAIM_DISTANCE) node_set(i, NODE_DATA(nid)->reclaim_nodes); - else - zone_reclaim_mode = 1; } #else /* CONFIG_NUMA */ -- cgit v1.2.3 From 2ee06468702e0742114823a537510cd6f038cacc Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 4 Jun 2014 16:07:28 -0700 Subject: Documentation/memcg: warn about incomplete kmemcg state Kmemcg is currently under development and lacks some important features. In particular, it does not have support of kmem reclaim on memory pressure inside cgroup, which practically makes it unusable in real life. Let's warn about it in both Kconfig and Documentation to prevent complaints arising. Signed-off-by: Vladimir Davydov Acked-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/memory.txt | 5 +++++ init/Kconfig | 6 ++++++ 2 files changed, 11 insertions(+) (limited to 'Documentation') diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index 1829c65f8371..4937e6fff9b4 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt @@ -270,6 +270,11 @@ When oom event notifier is registered, event will be delivered. 2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM) +WARNING: Current implementation lacks reclaim support. That means allocation + attempts will fail when close to the limit even if there are plenty of + kmem available for reclaim. That makes this option unusable in real + life so DO NOT SELECT IT unless for development purposes. + With the Kernel memory extension, the Memory Controller is able to limit the amount of kernel memory used by the system. Kernel memory is fundamentally different than user memory, since it can't be swapped out, which makes it diff --git a/init/Kconfig b/init/Kconfig index 9d3585bb2a7a..4a1822a1a680 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -996,6 +996,12 @@ config MEMCG_KMEM the kmem extension can use it to guarantee that no group of processes will ever exhaust kernel resources alone. + WARNING: Current implementation lacks reclaim support. That means + allocation attempts will fail when close to the limit even if there + are plenty of kmem available for reclaim. That makes this option + unusable in real life so DO NOT SELECT IT unless for development + purposes. + config CGROUP_HUGETLB bool "HugeTLB Resource Controller for Control Groups" depends on RESOURCE_COUNTERS && HUGETLB_PAGE -- cgit v1.2.3 From 3ba08129e38437561df44c36b7ea9081185d5333 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 4 Jun 2014 16:11:02 -0700 Subject: mm/memory-failure.c: support use of a dedicated thread to handle SIGBUS(BUS_MCEERR_AO) Currently memory error handler handles action optional errors in the deferred manner by default. And if a recovery aware application wants to handle it immediately, it can do it by setting PF_MCE_EARLY flag. However, such signal can be sent only to the main thread, so it's problematic if the application wants to have a dedicated thread to handler such signals. So this patch adds dedicated thread support to memory error handler. We have PF_MCE_EARLY flags for each thread separately, so with this patch AO signal is sent to the thread with PF_MCE_EARLY flag set, not the main thread. If you want to implement a dedicated thread, you call prctl() to set PF_MCE_EARLY on the thread. Memory error handler collects processes to be killed, so this patch lets it check PF_MCE_EARLY flag on each thread in the collecting routines. No behavioral change for all non-early kill cases. Tony said: : The old behavior was crazy - someone with a multithreaded process might : well expect that if they call prctl(PF_MCE_EARLY) in just one thread, then : that thread would see the SIGBUS with si_code = BUS_MCEERR_A0 - even if : that thread wasn't the main thread for the process. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Naoya Horiguchi Reviewed-by: Tony Luck Cc: Kamil Iskra Cc: Andi Kleen Cc: Borislav Petkov Cc: Chen Gong Cc: [3.2+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/hwpoison.txt | 5 ++++ mm/memory-failure.c | 56 +++++++++++++++++++++++++++++++++---------- 2 files changed, 48 insertions(+), 13 deletions(-) (limited to 'Documentation') diff --git a/Documentation/vm/hwpoison.txt b/Documentation/vm/hwpoison.txt index 550068466605..6ae89a9edf2a 100644 --- a/Documentation/vm/hwpoison.txt +++ b/Documentation/vm/hwpoison.txt @@ -84,6 +84,11 @@ PR_MCE_KILL PR_MCE_KILL_EARLY: Early kill PR_MCE_KILL_LATE: Late kill PR_MCE_KILL_DEFAULT: Use system global default + Note that if you want to have a dedicated thread which handles + the SIGBUS(BUS_MCEERR_AO) on behalf of the process, you should + call prctl(PR_MCE_KILL_EARLY) on the designated thread. Otherwise, + the SIGBUS is sent to the main thread. + PR_MCE_KILL_GET return current mode diff --git a/mm/memory-failure.c b/mm/memory-failure.c index ed339c505d55..cd8989c1027e 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -380,15 +380,44 @@ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno, } } -static int task_early_kill(struct task_struct *tsk, int force_early) +/* + * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO) + * on behalf of the thread group. Return task_struct of the (first found) + * dedicated thread if found, and return NULL otherwise. + * + * We already hold read_lock(&tasklist_lock) in the caller, so we don't + * have to call rcu_read_lock/unlock() in this function. + */ +static struct task_struct *find_early_kill_thread(struct task_struct *tsk) +{ + struct task_struct *t; + + for_each_thread(tsk, t) + if ((t->flags & PF_MCE_PROCESS) && (t->flags & PF_MCE_EARLY)) + return t; + return NULL; +} + +/* + * Determine whether a given process is "early kill" process which expects + * to be signaled when some page under the process is hwpoisoned. + * Return task_struct of the dedicated thread (main thread unless explicitly + * specified) if the process is "early kill," and otherwise returns NULL. + */ +static struct task_struct *task_early_kill(struct task_struct *tsk, + int force_early) { + struct task_struct *t; if (!tsk->mm) - return 0; + return NULL; if (force_early) - return 1; - if (tsk->flags & PF_MCE_PROCESS) - return !!(tsk->flags & PF_MCE_EARLY); - return sysctl_memory_failure_early_kill; + return tsk; + t = find_early_kill_thread(tsk); + if (t) + return t; + if (sysctl_memory_failure_early_kill) + return tsk; + return NULL; } /* @@ -410,16 +439,17 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, read_lock(&tasklist_lock); for_each_process (tsk) { struct anon_vma_chain *vmac; + struct task_struct *t = task_early_kill(tsk, force_early); - if (!task_early_kill(tsk, force_early)) + if (!t) continue; anon_vma_interval_tree_foreach(vmac, &av->rb_root, pgoff, pgoff) { vma = vmac->vma; if (!page_mapped_in_vma(page, vma)) continue; - if (vma->vm_mm == tsk->mm) - add_to_kill(tsk, page, vma, to_kill, tkc); + if (vma->vm_mm == t->mm) + add_to_kill(t, page, vma, to_kill, tkc); } } read_unlock(&tasklist_lock); @@ -440,10 +470,10 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, read_lock(&tasklist_lock); for_each_process(tsk) { pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + struct task_struct *t = task_early_kill(tsk, force_early); - if (!task_early_kill(tsk, force_early)) + if (!t) continue; - vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { /* @@ -453,8 +483,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, * Assume applications who requested early kill want * to be informed of all such data corruptions. */ - if (vma->vm_mm == tsk->mm) - add_to_kill(tsk, page, vma, to_kill, tkc); + if (vma->vm_mm == t->mm) + add_to_kill(t, page, vma, to_kill, tkc); } } read_unlock(&tasklist_lock); -- cgit v1.2.3 From 4a0da71b96b9d4080c0820e9e7d02470ebe62dc6 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Wed, 4 Jun 2014 16:11:03 -0700 Subject: Documentation/sysctl/vm.txt: clarify vfs_cache_pressure description Existing description is worded in a way which almost encourages setting of vfs_cache_pressure above 100, possibly way above it. Users are left in a dark what this numeric value is - an int? a percentage? what the scale is? As a result, we are getting reports about noticeable performance degradation from users who have set vfs_cache_pressure to ridiculously high values - because they thought there is no downside to it. Via code inspection it's obvious that this value is treated as a percentage. This patch changes text to reflect this fact, and adds a cautionary paragraph advising against setting vfs_cache_pressure sky high. Signed-off-by: Denys Vlasenko Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 5b6da0fb5fbf..bd4b34c03738 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -746,8 +746,8 @@ Changing this takes effect whenever an application requests memory. vfs_cache_pressure ------------------ -Controls the tendency of the kernel to reclaim the memory which is used for -caching of directory and inode objects. +This percentage value controls the tendency of the kernel to reclaim +the memory which is used for caching of directory and inode objects. At the default value of vfs_cache_pressure=100 the kernel will attempt to reclaim dentries and inodes at a "fair" rate with respect to pagecache and @@ -757,6 +757,11 @@ never reclaim dentries and inodes due to memory pressure and this can easily lead to out-of-memory conditions. Increasing vfs_cache_pressure beyond 100 causes the kernel to prefer to reclaim dentries and inodes. +Increasing vfs_cache_pressure significantly beyond 100 may have negative +performance impact. Reclaim code needs to take various locks to find freeable +directory and inode objects. With vfs_cache_pressure=1000, it will look for +ten times more freeable objects than there are. + ============================================================== zone_reclaim_mode: -- cgit v1.2.3 From 6e099f557d9c6797c3ee3ee7b5c8cebe543ec1cc Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Wed, 4 Jun 2014 16:11:44 -0700 Subject: Documentation: expand/clarify debug documentation The pr_debug() and related debug print macros all differ from the normal pr_XXX() macros, in that the normal ones print unconditionally, while the debug macros are compiled out unless DEBUG is defined or CONFIG_DYNAMIC_DEBUG is set. This isn't obvious, and the only way to find this out is either to review the actual printk.h code or to read CodingStyle, and the message there doesn't highlight the fact. Change Documentation/CodingStyle to clearly indicate that pr_debug() and related debug printing macros behave differently than all other pr_XXX() macros, and attempt to clarify when and where the different debug printing methods might be used. Add short comment to printk.h above the pr_XXX() macros indicating that while these macros print unconditionally, pr_debug() does not. Signed-off-by: Dan Streetman Cc: Joe Perches Cc: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/CodingStyle | 22 +++++++++++++++------- include/linux/printk.h | 6 ++++++ 2 files changed, 21 insertions(+), 7 deletions(-) (limited to 'Documentation') diff --git a/Documentation/CodingStyle b/Documentation/CodingStyle index 7fe0546c504a..6b6bef31e956 100644 --- a/Documentation/CodingStyle +++ b/Documentation/CodingStyle @@ -660,15 +660,23 @@ There are a number of driver model diagnostic macros in which you should use to make sure messages are matched to the right device and driver, and are tagged with the right level: dev_err(), dev_warn(), dev_info(), and so forth. For messages that aren't associated with a -particular device, defines pr_debug() and pr_info(). +particular device, defines pr_notice(), pr_info(), +pr_warn(), pr_err(), etc. Coming up with good debugging messages can be quite a challenge; and once -you have them, they can be a huge help for remote troubleshooting. Such -messages should be compiled out when the DEBUG symbol is not defined (that -is, by default they are not included). When you use dev_dbg() or pr_debug(), -that's automatic. Many subsystems have Kconfig options to turn on -DDEBUG. -A related convention uses VERBOSE_DEBUG to add dev_vdbg() messages to the -ones already enabled by DEBUG. +you have them, they can be a huge help for remote troubleshooting. However +debug message printing is handled differently than printing other non-debug +messages. While the other pr_XXX() functions print unconditionally, +pr_debug() does not; it is compiled out by default, unless either DEBUG is +defined or CONFIG_DYNAMIC_DEBUG is set. That is true for dev_dbg() also, +and a related convention uses VERBOSE_DEBUG to add dev_vdbg() messages to +the ones already enabled by DEBUG. + +Many subsystems have Kconfig debug options to turn on -DDEBUG in the +corresponding Makefile; in other cases specific files #define DEBUG. And +when a debug message should be unconditionally printed, such as if it is +already inside a debug-related #ifdef secton, printk(KERN_DEBUG ...) can be +used. Chapter 14: Allocating memory diff --git a/include/linux/printk.h b/include/linux/printk.h index f086d6c99dbc..37f3a6589c1c 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -210,6 +210,12 @@ extern asmlinkage void dump_stack(void) __cold; #define pr_fmt(fmt) fmt #endif +/* + * These can be used to print at the various log levels. + * All of these will print unconditionally, although note that pr_debug() + * and other debug macros are compiled out unless either DEBUG is defined + * or CONFIG_DYNAMIC_DEBUG is set. + */ #define pr_emerg(fmt, ...) \ printk(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__) #define pr_alert(fmt, ...) \ -- cgit v1.2.3 From 7b0b73d76651e5f88c88b76efa18d719f832bf6f Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Wed, 4 Jun 2014 16:12:17 -0700 Subject: init/main.c: add initcall_blacklist kernel parameter When a module is built into the kernel the module_init() function becomes an initcall. Sometimes debugging through dynamic debug can help, however, debugging built in kernel modules is typically done by changing the .config, recompiling, and booting the new kernel in an effort to determine exactly which module caused a problem. This patchset can be useful stand-alone or combined with initcall_debug. There are cases where some initcalls can hang the machine before the console can be flushed, which can make initcall_debug output inaccurate. Having the ability to skip initcalls can help further debugging of these scenarios. Usage: initcall_blacklist= ex) added "initcall_blacklist=sgi_uv_sysfs_init" as a kernel parameter and the log contains: blacklisting initcall sgi_uv_sysfs_init ... ... initcall sgi_uv_sysfs_init blacklisted ex) added "initcall_blacklist=foo_bar,sgi_uv_sysfs_init" as a kernel parameter and the log contains: blacklisting initcall foo_bar blacklisting initcall sgi_uv_sysfs_init ... ... initcall sgi_uv_sysfs_init blacklisted [akpm@linux-foundation.org: tweak printk text] Signed-off-by: Prarit Bhargava Cc: Richard Weinberger Cc: Andi Kleen Cc: Josh Boyer Cc: Rob Landley Cc: Steven Rostedt Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Frederic Weisbecker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kernel-parameters.txt | 4 +++ init/main.c | 68 +++++++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+) (limited to 'Documentation') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index adea3a22fa00..9973a7e2e0ac 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1312,6 +1312,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted. for working out where the kernel is dying during startup. + initcall_blacklist= [KNL] Do not execute a comma-separated list of + initcall functions. Useful for debugging built-in + modules and initcalls. + initrd= [BOOT] Specify the location of the initial ramdisk inport.irq= [HW] Inport (ATI XL and Microsoft) busmouse driver diff --git a/init/main.c b/init/main.c index 9d3a7b84de5c..8ac3833f2bdf 100644 --- a/init/main.c +++ b/init/main.c @@ -77,6 +77,7 @@ #include #include #include +#include #include #include @@ -665,6 +666,70 @@ static void __init do_ctors(void) bool initcall_debug; core_param(initcall_debug, initcall_debug, bool, 0644); +#ifdef CONFIG_KALLSYMS +struct blacklist_entry { + struct list_head next; + char *buf; +}; + +static __initdata_or_module LIST_HEAD(blacklisted_initcalls); + +static int __init initcall_blacklist(char *str) +{ + char *str_entry; + struct blacklist_entry *entry; + + /* str argument is a comma-separated list of functions */ + do { + str_entry = strsep(&str, ","); + if (str_entry) { + pr_debug("blacklisting initcall %s\n", str_entry); + entry = alloc_bootmem(sizeof(*entry)); + entry->buf = alloc_bootmem(strlen(str_entry) + 1); + strcpy(entry->buf, str_entry); + list_add(&entry->next, &blacklisted_initcalls); + } + } while (str_entry); + + return 0; +} + +static bool __init_or_module initcall_blacklisted(initcall_t fn) +{ + struct list_head *tmp; + struct blacklist_entry *entry; + char *fn_name; + + fn_name = kasprintf(GFP_KERNEL, "%pf", fn); + if (!fn_name) + return false; + + list_for_each(tmp, &blacklisted_initcalls) { + entry = list_entry(tmp, struct blacklist_entry, next); + if (!strcmp(fn_name, entry->buf)) { + pr_debug("initcall %s blacklisted\n", fn_name); + kfree(fn_name); + return true; + } + } + + kfree(fn_name); + return false; +} +#else +static int __init initcall_blacklist(char *str) +{ + pr_warn("initcall_blacklist requires CONFIG_KALLSYMS\n"); + return 0; +} + +static bool __init_or_module initcall_blacklisted(initcall_t fn) +{ + return false; +} +#endif +__setup("initcall_blacklist=", initcall_blacklist); + static int __init_or_module do_one_initcall_debug(initcall_t fn) { ktime_t calltime, delta, rettime; @@ -689,6 +754,9 @@ int __init_or_module do_one_initcall(initcall_t fn) int ret; char msgbuf[64]; + if (initcall_blacklisted(fn)) + return -EPERM; + if (initcall_debug) ret = do_one_initcall_debug(fn); else -- cgit v1.2.3