From 27ee57c93ff00b8a2d6c6dd6b0b3dddda7b43b77 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 17 Mar 2016 14:17:35 -0700 Subject: mm: memcontrol: report slab usage in cgroup2 memory.stat Show how much memory is used for storing reclaimable and unreclaimable in-kernel data structures allocated from slab caches. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroup-v2.txt | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'Documentation') diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt index ff49cf901148..e4e0c1d78cee 100644 --- a/Documentation/cgroup-v2.txt +++ b/Documentation/cgroup-v2.txt @@ -843,6 +843,11 @@ PAGE_SIZE multiple when read back. Amount of memory used to cache filesystem data, including tmpfs and shared memory. + slab + + Amount of memory used for storing in-kernel data + structures. + sock Amount of memory used in network transmission buffers @@ -871,6 +876,16 @@ PAGE_SIZE multiple when read back. on the internal memory management lists used by the page reclaim algorithm + slab_reclaimable + + Part of "slab" that might be reclaimed, such as + dentries and inodes. + + slab_unreclaimable + + Part of "slab" that cannot be reclaimed on memory + pressure. + pgfault Total number of page faults incurred -- cgit v1.2.3 From 12580e4b54ba8a1b22ec977c200be0174ca42348 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 17 Mar 2016 14:17:38 -0700 Subject: mm: memcontrol: report kernel stack usage in cgroup2 memory.stat Show how much memory is allocated to kernel stacks. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroup-v2.txt | 4 ++++ include/linux/memcontrol.h | 3 ++- kernel/fork.c | 10 +++++++++- mm/memcontrol.c | 2 ++ 4 files changed, 17 insertions(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt index e4e0c1d78cee..e2f4e7948a66 100644 --- a/Documentation/cgroup-v2.txt +++ b/Documentation/cgroup-v2.txt @@ -843,6 +843,10 @@ PAGE_SIZE multiple when read back. Amount of memory used to cache filesystem data, including tmpfs and shared memory. + kernel_stack + + Amount of memory allocated to kernel stacks. + slab Amount of memory used for storing in-kernel data diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e7af4834ffea..d6300313b298 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -52,9 +52,10 @@ enum mem_cgroup_stat_index { MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */ MEM_CGROUP_STAT_NSTATS, /* default hierarchy stats */ - MEMCG_SOCK = MEM_CGROUP_STAT_NSTATS, + MEMCG_KERNEL_STACK = MEM_CGROUP_STAT_NSTATS, MEMCG_SLAB_RECLAIMABLE, MEMCG_SLAB_UNRECLAIMABLE, + MEMCG_SOCK, MEMCG_NR_STAT, }; diff --git a/kernel/fork.c b/kernel/fork.c index 2e391c754ae7..accb7221d547 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -164,12 +164,20 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); + if (page) + memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK, + 1 << THREAD_SIZE_ORDER); + return page ? 
page_address(page) : NULL; } static inline void free_thread_info(struct thread_info *ti) { - free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); + struct page *page = virt_to_page(ti); + + memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK, + -(1 << THREAD_SIZE_ORDER)); + __free_kmem_pages(page, THREAD_SIZE_ORDER); } # else static struct kmem_cache *thread_info_cache; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3ad64bf464fd..4b7dda7c2e74 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5106,6 +5106,8 @@ static int memory_stat_show(struct seq_file *m, void *v) (u64)stat[MEM_CGROUP_STAT_RSS] * PAGE_SIZE); seq_printf(m, "file %llu\n", (u64)stat[MEM_CGROUP_STAT_CACHE] * PAGE_SIZE); + seq_printf(m, "kernel_stack %llu\n", + (u64)stat[MEMCG_KERNEL_STACK] * PAGE_SIZE); seq_printf(m, "slab %llu\n", (u64)(stat[MEMCG_SLAB_RECLAIMABLE] + stat[MEMCG_SLAB_UNRECLAIMABLE]) * PAGE_SIZE); -- cgit v1.2.3 From f9719a03de51e13526d614e79d002f838770b2d6 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 17 Mar 2016 14:18:45 -0700 Subject: thp, vmstats: count deferred split events Count how many times we put a THP on the split queue. Currently, it happens on partial unmap of a THP. A rapidly growing value can indicate that an application behaves unfriendly with respect to THP: it often faults in a huge page and then unmaps part of it. This leads to unnecessary memory fragmentation and the application may require tuning. The event can also help with debugging kernel [mis-]behaviour. Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/transhuge.txt | 5 +++++ include/linux/vm_event_item.h | 1 + mm/huge_memory.c | 1 + mm/vmstat.c | 1 + 4 files changed, 8 insertions(+) (limited to 'Documentation') diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt index 21cf34f3ddb2..0dc8632aa01e 100644 --- a/Documentation/vm/transhuge.txt +++ b/Documentation/vm/transhuge.txt @@ -229,6 +229,11 @@ thp_split_page is incremented every time a huge page is split into base thp_split_page_failed is incremented if the kernel fails to split a huge page. This can happen if the page was pinned by somebody. +thp_deferred_split_page is incremented when a huge page is put onto the split + queue. This happens when a huge page is partially unmapped and + splitting it would free up some memory. Pages on the split queue are + going to be split under memory pressure. + thp_split_pmd is incremented every time a PMD is split into a table of PTEs. This can happen, for instance, when an application calls mprotect() or munmap() on part of a huge page. 
It doesn't split the huge page, only the page table entry. diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 58ecc056ee45..ec084321fe09 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -72,6 +72,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, THP_COLLAPSE_ALLOC_FAILED, THP_SPLIT_PAGE, THP_SPLIT_PAGE_FAILED, + THP_DEFERRED_SPLIT_PAGE, THP_SPLIT_PMD, THP_ZERO_PAGE_ALLOC, THP_ZERO_PAGE_ALLOC_FAILED, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1ea21e203a70..1dddfb21fc22 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3455,6 +3455,7 @@ void deferred_split_huge_page(struct page *page) spin_lock_irqsave(&pgdata->split_queue_lock, flags); if (list_empty(page_deferred_list(page))) { + count_vm_event(THP_DEFERRED_SPLIT_PAGE); list_add_tail(page_deferred_list(page), &pgdata->split_queue); pgdata->split_queue_len++; } diff --git a/mm/vmstat.c b/mm/vmstat.c index f80066248c94..5e4300482897 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -848,6 +848,7 @@ const char * const vmstat_text[] = { "thp_collapse_alloc_failed", "thp_split_page", "thp_split_page_failed", + "thp_deferred_split_page", "thp_split_pmd", "thp_zero_page_alloc", "thp_zero_page_alloc_failed", -- cgit v1.2.3 From 795ae7a0de6b834a0cc202aa55c190ef81496665 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 17 Mar 2016 14:19:14 -0700 Subject: mm: scale kswapd watermarks in proportion to memory In machines with 140G of memory and enterprise flash storage, we have seen read and write bursts routinely exceed the kswapd watermarks and cause thundering herds in direct reclaim. Unfortunately, the only way to tune kswapd aggressiveness is through adjusting min_free_kbytes - the system's emergency reserves - which is entirely unrelated to the system's latency requirements. In order to get kswapd to maintain a 250M buffer of free memory, the emergency reserves need to be set to 1G. That is a lot of memory wasted for no good reason. On the other hand, it's reasonable to assume that allocation bursts and overall allocation concurrency scale with memory capacity, so it makes sense to make kswapd aggressiveness a function of that as well. Change the kswapd watermark scale factor from the currently fixed 25% of the tunable emergency reserve to a tunable 0.1% of memory. Beyond 1G of memory, this will produce bigger watermark steps than the current formula with default settings. Ensure that the new formula never chooses steps smaller than that, i.e. 25% of the emergency reserve. On a 140G machine, this raises the default watermark steps - the distance between min and low, and low and high - from 16M to 143M. Signed-off-by: Johannes Weiner Acked-by: Mel Gorman Acked-by: Rik van Riel Acked-by: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 18 ++++++++++++++++++ include/linux/mm.h | 1 + include/linux/mmzone.h | 2 ++ kernel/sysctl.c | 10 ++++++++++ mm/page_alloc.c | 29 +++++++++++++++++++++++++++-- 5 files changed, 58 insertions(+), 2 deletions(-) (limited to 'Documentation') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 89a887c76629..cb0368459da3 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -803,6 +803,24 @@ performance impact. Reclaim code needs to take various locks to find freeable directory and inode objects. With vfs_cache_pressure=1000, it will look for ten times more freeable objects than there are. 
+============================================================= + +watermark_scale_factor: + +This factor controls the aggressiveness of kswapd. It defines the +amount of memory left in a node/system before kswapd is woken up and +how much memory needs to be free before kswapd goes back to sleep. + +The unit is in fractions of 10,000. The default value of 10 means the +distances between watermarks are 0.1% of the available memory in the +node/system. The maximum value is 1000, or 10% of memory. + +A high rate of threads entering direct reclaim (allocstall) or kswapd +going to sleep prematurely (kswapd_low_wmark_hit_quickly) can indicate +that the number of free pages kswapd maintains for latency reasons is +too small for the allocation bursts occurring in the system. This knob +can then be used to tune kswapd aggressiveness accordingly. + ============================================================== zone_reclaim_mode: diff --git a/include/linux/mm.h b/include/linux/mm.h index 75d1907b9009..f7fd64227d3a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1889,6 +1889,7 @@ extern void zone_pcp_reset(struct zone *zone); /* page_alloc.c */ extern int min_free_kbytes; +extern int watermark_scale_factor; /* nommu.c */ extern atomic_long_t mmap_pages_allocated; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index bdd9a270a813..c60df9257cc7 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -841,6 +841,8 @@ static inline int is_highmem(struct zone *zone) struct ctl_table; int min_free_kbytes_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); +int watermark_scale_factor_sysctl_handler(struct ctl_table *, int, + void __user *, size_t *, loff_t *); extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1]; int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f5102fabef7f..725587f10667 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -126,6 +126,7 @@ static int __maybe_unused two = 2; static int __maybe_unused four = 4; static unsigned long one_ul = 1; static int one_hundred = 100; +static int one_thousand = 1000; #ifdef CONFIG_PRINTK static int ten_thousand = 10000; #endif @@ -1403,6 +1404,15 @@ static struct ctl_table vm_table[] = { .proc_handler = min_free_kbytes_sysctl_handler, .extra1 = &zero, }, + { + .procname = "watermark_scale_factor", + .data = &watermark_scale_factor, + .maxlen = sizeof(watermark_scale_factor), + .mode = 0644, + .proc_handler = watermark_scale_factor_sysctl_handler, + .extra1 = &one, + .extra2 = &one_thousand, + }, { .procname = "percpu_pagelist_fraction", .data = &percpu_pagelist_fraction, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 941b802e11ec..d156310aedeb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -249,6 +249,7 @@ compound_page_dtor * const compound_page_dtors[] = { int min_free_kbytes = 1024; int user_min_free_kbytes = -1; +int watermark_scale_factor = 10; static unsigned long __meminitdata nr_kernel_pages; static unsigned long __meminitdata nr_all_pages; @@ -6347,8 +6348,17 @@ static void __setup_per_zone_wmarks(void) zone->watermark[WMARK_MIN] = tmp; } - zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); - zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); + /* + * Set the kswapd watermarks distance according to the + * scale factor in proportion to available memory, but + * ensure a minimum size on small systems. 
+ */ + tmp = max_t(u64, tmp >> 2, + mult_frac(zone->managed_pages, + watermark_scale_factor, 10000)); + + zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp; + zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2; __mod_zone_page_state(zone, NR_ALLOC_BATCH, high_wmark_pages(zone) - low_wmark_pages(zone) - @@ -6489,6 +6499,21 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, return 0; } +int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + int rc; + + rc = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (rc) + return rc; + + if (write) + setup_per_zone_wmarks(); + + return 0; +} + #ifdef CONFIG_NUMA int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) -- cgit v1.2.3 From 444eb2a449ef36fe115431ed7b71467c4563c7f1 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 17 Mar 2016 14:19:23 -0700 Subject: mm: thp: set THP defrag by default to madvise and add a stall-free defrag option THP defrag is enabled by default to direct reclaim/compact but not wake kswapd in the event of a THP allocation failure. The problem is that THP allocation requests potentially enter reclaim/compaction. This potentially incurs a severe stall that is not guaranteed to be offset by reduced TLB misses. While there has been considerable effort to reduce the impact of reclaim/compaction, it is still a high cost and workloads that should fit in memory fail to do so. Specifically, a simple anon/file streaming workload will enter direct reclaim on NUMA at least even though the working set size is 80% of RAM. It's been years and it's time to throw in the towel. First, this patch defines THP defrag as follows: madvise: A failed allocation will direct reclaim/compact if the application requests it never: Neither reclaim/compact nor wake kswapd defer: A failed allocation will wake kswapd/kcompactd always: A failed allocation will direct reclaim/compact (historical behaviour) khugepaged defrag will enter direct reclaim but not wake kswapd. Next it sets the default defrag option to be "madvise" to only enter direct reclaim/compaction for applications that specifically requested it. Lastly, it removes a check from the page allocator slowpath that is related to __GFP_THISNODE to allow "defer" to work. The callers that really care are slub/slab and they are updated accordingly. The slab one may be surprising because it also corrects a comment, as kswapd was never woken up by that path. This means that a THP fault will no longer stall for most applications by default, and ideally most users will still get THP if one is immediately available. There are still options for users who prefer a stall at startup of a new application, by either restoring historical behaviour with "always" or picking a half-way point with "defer", where kswapd does some of the work in the background and wakes kcompactd if necessary. THP defrag for khugepaged remains enabled and will enter direct reclaim but not wake kswapd or kcompactd. After this patch, a THP allocation failure will quickly fall back and rely on khugepaged to recover the situation at some time in the future. In some cases, this will reduce THP usage, but the benefit of THP is hard to measure and not a universal win, whereas a stall to reclaim/compaction is definitely measurable and can be painful. 
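As an illustration of what this means for applications (a minimal userspace sketch, not part of the patch; the mapping size is arbitrary and error handling is trimmed): under the new "madvise" default, only regions marked with madvise(MADV_HUGEPAGE) keep the old stall-on-allocation behaviour.

#include <stddef.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 16UL << 20;	/* arbitrary; a multiple of the 2M THP size */
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED)
		return 1;
	/*
	 * An MADV_HUGEPAGE region still enters direct reclaim/compaction
	 * on THP allocation failure; unmarked regions now fall back to
	 * base pages immediately and leave the work to khugepaged.
	 */
	madvise(buf, len, MADV_HUGEPAGE);
	return 0;
}
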
The first test for this is using "usemem" to read a large file and write a large anonymous mapping (to avoid the zero page) multiple times. The total size of the mappings is 80% of RAM and the benchmark simply measures how long it takes to complete. It uses multiple threads to see if that is a factor. On UMA, the performance is almost identical so is not reported but on NUMA, we see this usemem 4.4.0 4.4.0 kcompactd-v1r1 nodefrag-v1r3 Amean System-1 102.86 ( 0.00%) 46.81 ( 54.50%) Amean System-4 37.85 ( 0.00%) 34.02 ( 10.12%) Amean System-7 48.12 ( 0.00%) 46.89 ( 2.56%) Amean System-12 51.98 ( 0.00%) 56.96 ( -9.57%) Amean System-21 80.16 ( 0.00%) 79.05 ( 1.39%) Amean System-30 110.71 ( 0.00%) 107.17 ( 3.20%) Amean System-48 127.98 ( 0.00%) 124.83 ( 2.46%) Amean Elapsd-1 185.84 ( 0.00%) 105.51 ( 43.23%) Amean Elapsd-4 26.19 ( 0.00%) 25.58 ( 2.33%) Amean Elapsd-7 21.65 ( 0.00%) 21.62 ( 0.16%) Amean Elapsd-12 18.58 ( 0.00%) 17.94 ( 3.43%) Amean Elapsd-21 17.53 ( 0.00%) 16.60 ( 5.33%) Amean Elapsd-30 17.45 ( 0.00%) 17.13 ( 1.84%) Amean Elapsd-48 15.40 ( 0.00%) 15.27 ( 0.82%) For a single thread, the benchmark completes 43.23% faster with this patch applied, with smaller benefits as the thread count increases. Similarly, notice the large reduction in most cases in system CPU usage. The overall CPU time is 4.4.0 4.4.0 kcompactd-v1r1 nodefrag-v1r3 User 10357.65 10438.33 System 3988.88 3543.94 Elapsed 2203.01 1634.41 Which is substantial. Now, the reclaim figures 4.4.0 4.4.0 kcompactd-v1r1 nodefrag-v1r3 Minor Faults 128458477 278352931 Major Faults 2174976 225 Swap Ins 16904701 0 Swap Outs 17359627 0 Allocation stalls 43611 0 DMA allocs 0 0 DMA32 allocs 19832646 19448017 Normal allocs 614488453 580941839 Movable allocs 0 0 Direct pages scanned 24163800 0 Kswapd pages scanned 0 0 Kswapd pages reclaimed 0 0 Direct pages reclaimed 20691346 0 Compaction stalls 42263 0 Compaction success 938 0 Compaction failures 41325 0 This patch eliminates almost all swapping and direct reclaim activity. There is still overhead but it's from NUMA balancing which does not identify that it's pointless trying to do anything with this workload. 
I also tried the thpscale benchmark which forces a corner case where compaction can be used heavily and measures the fault latency depending on whether base or huge pages were used thpscale Fault Latencies 4.4.0 4.4.0 kcompactd-v1r1 nodefrag-v1r3 Amean fault-base-1 5288.84 ( 0.00%) 2817.12 ( 46.73%) Amean fault-base-3 6365.53 ( 0.00%) 3499.11 ( 45.03%) Amean fault-base-5 6526.19 ( 0.00%) 4363.06 ( 33.15%) Amean fault-base-7 7142.25 ( 0.00%) 4858.08 ( 31.98%) Amean fault-base-12 13827.64 ( 0.00%) 10292.11 ( 25.57%) Amean fault-base-18 18235.07 ( 0.00%) 13788.84 ( 24.38%) Amean fault-base-24 21597.80 ( 0.00%) 24388.03 (-12.92%) Amean fault-base-30 26754.15 ( 0.00%) 19700.55 ( 26.36%) Amean fault-base-32 26784.94 ( 0.00%) 19513.57 ( 27.15%) Amean fault-huge-1 4223.96 ( 0.00%) 2178.57 ( 48.42%) Amean fault-huge-3 2194.77 ( 0.00%) 2149.74 ( 2.05%) Amean fault-huge-5 2569.60 ( 0.00%) 2346.95 ( 8.66%) Amean fault-huge-7 3612.69 ( 0.00%) 2997.70 ( 17.02%) Amean fault-huge-12 3301.75 ( 0.00%) 6727.02 (-103.74%) Amean fault-huge-18 6696.47 ( 0.00%) 6685.72 ( 0.16%) Amean fault-huge-24 8000.72 ( 0.00%) 9311.43 (-16.38%) Amean fault-huge-30 13305.55 ( 0.00%) 9750.45 ( 26.72%) Amean fault-huge-32 9981.71 ( 0.00%) 10316.06 ( -3.35%) The average time to fault pages is substantially reduced in the majority of cases, but with the obvious caveat that fewer THPs are actually used in this adverse workload 4.4.0 4.4.0 kcompactd-v1r1 nodefrag-v1r3 Percentage huge-1 0.71 ( 0.00%) 14.04 (1865.22%) Percentage huge-3 10.77 ( 0.00%) 33.05 (206.85%) Percentage huge-5 60.39 ( 0.00%) 38.51 (-36.23%) Percentage huge-7 45.97 ( 0.00%) 34.57 (-24.79%) Percentage huge-12 68.12 ( 0.00%) 40.07 (-41.17%) Percentage huge-18 64.93 ( 0.00%) 47.82 (-26.35%) Percentage huge-24 62.69 ( 0.00%) 44.23 (-29.44%) Percentage huge-30 43.49 ( 0.00%) 55.38 ( 27.34%) Percentage huge-32 50.72 ( 0.00%) 51.90 ( 2.35%) 4.4.0 4.4.0 kcompactd-v1r1 nodefrag-v1r3 Minor Faults 37429143 47564000 Major Faults 1916 1558 Swap Ins 1466 1079 Swap Outs 2936863 149626 Allocation stalls 62510 3 DMA allocs 0 0 DMA32 allocs 6566458 6401314 Normal allocs 216361697 216538171 Movable allocs 0 0 Direct pages scanned 25977580 17998 Kswapd pages scanned 0 3638931 Kswapd pages reclaimed 0 207236 Direct pages reclaimed 8833714 88 Compaction stalls 103349 5 Compaction success 270 4 Compaction failures 103079 1 Note again that while this does swap as it's an aggressive workload, the direct reclaim activity and allocation stalls are substantially reduced. There is some kswapd activity but ftrace showed that the kswapd activity was due to normal wakeups from 4K pages being allocated. Compaction-related stalls and activity are almost eliminated. I also tried the stutter benchmark. For this, I do not have figures for NUMA but it's something that does impact UMA so I'll report what is available stutter 4.4.0 4.4.0 kcompactd-v1r1 nodefrag-v1r3 Min mmap 7.3571 ( 0.00%) 7.3438 ( 0.18%) 1st-qrtle mmap 7.5278 ( 0.00%) 17.9200 (-138.05%) 2nd-qrtle mmap 7.6818 ( 0.00%) 21.6055 (-181.25%) 3rd-qrtle mmap 11.0889 ( 0.00%) 21.8881 (-97.39%) Max-90% mmap 27.8978 ( 0.00%) 22.1632 ( 20.56%) Max-93% mmap 28.3202 ( 0.00%) 22.3044 ( 21.24%) Max-95% mmap 28.5600 ( 0.00%) 22.4580 ( 21.37%) Max-99% mmap 29.6032 ( 0.00%) 25.5216 ( 13.79%) Max mmap 4109.7289 ( 0.00%) 4813.9832 (-17.14%) Mean mmap 12.4474 ( 0.00%) 19.3027 (-55.07%) This benchmark is trying to fault an anonymous mapping while there is a heavy IO load -- a scenario that desktop users used to complain about frequently. 
This shows a mix because the ideal case of mapping with THP is not hit as often. However, note that 99% of the mappings complete 13.79% faster. The CPU usage here is particularly interesting 4.4.0 4.4.0 kcompactd-v1r1 nodefrag-v1r3 User 67.50 0.99 System 1327.88 91.30 Elapsed 2079.00 2128.98 And once again we look at the reclaim figures 4.4.0 4.4.0 kcompactd-v1r1 nodefrag-v1r3 Minor Faults 335241922 1314582827 Major Faults 715 819 Swap Ins 0 0 Swap Outs 0 0 Allocation stalls 532723 0 DMA allocs 0 0 DMA32 allocs 1822364341 1177950222 Normal allocs 1815640808 1517844854 Movable allocs 0 0 Direct pages scanned 21892772 0 Kswapd pages scanned 20015890 41879484 Kswapd pages reclaimed 19961986 41822072 Direct pages reclaimed 21892741 0 Compaction stalls 1065755 0 Compaction success 514 0 Compaction failures 1065241 0 Allocation stalls and all direct reclaim activity are eliminated, as are compaction-related stalls. THP gives impressive gains in some cases but only if they are quickly available. We're not going to reach the point where they are completely free so let's take the costs out of the fast paths finally and defer the cost to kswapd, kcompactd and khugepaged where it belongs. Signed-off-by: Mel Gorman Acked-by: Rik van Riel Acked-by: Johannes Weiner Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/transhuge.txt | 17 +++++++ include/linux/gfp.h | 2 +- include/linux/huge_mm.h | 9 +--- mm/huge_memory.c | 101 +++++++++++++++++++++++++++-------------- mm/page_alloc.c | 8 ---- mm/slab.c | 8 ++-- mm/slub.c | 2 +- 7 files changed, 91 insertions(+), 56 deletions(-) (limited to 'Documentation') diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt index 0dc8632aa01e..d9cb65cf5cfd 100644 --- a/Documentation/vm/transhuge.txt +++ b/Documentation/vm/transhuge.txt @@ -113,9 +113,26 @@ guaranteed, but it may be more likely in case the allocation is for a MADV_HUGEPAGE region. echo always >/sys/kernel/mm/transparent_hugepage/defrag +echo defer >/sys/kernel/mm/transparent_hugepage/defrag echo madvise >/sys/kernel/mm/transparent_hugepage/defrag echo never >/sys/kernel/mm/transparent_hugepage/defrag +"always" means that an application requesting THP will stall on allocation +failure and directly reclaim pages and compact memory in an effort to +allocate a THP immediately. This may be desirable for virtual machines +that benefit heavily from THP use and are willing to delay the VM start +to utilise them. + +"defer" means that an application will wake kswapd in the background +to reclaim pages and wake kcompactd to compact memory so that THP is +available in the near future. It's the responsibility of khugepaged +to then install the THP pages later. + +"madvise" will enter direct reclaim like "always" but only for regions +that have used madvise(MADV_HUGEPAGE). This is the default behaviour. + +"never" should be self-explanatory. + By default kernel tries to use huge zero page on read page fault. 
It's possible to disable huge zero page by writing 0 or enable it back by writing 1: diff --git a/include/linux/gfp.h b/include/linux/gfp.h index c083d0820a87..11d56c6e7ef2 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -257,7 +257,7 @@ struct vm_area_struct; #define GFP_HIGHUSER_MOVABLE (GFP_HIGHUSER | __GFP_MOVABLE) #define GFP_TRANSHUGE ((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \ __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN) & \ - ~__GFP_KSWAPD_RECLAIM) + ~__GFP_RECLAIM) /* Convert GFP flags to their corresponding migrate type */ #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 459fd25b378e..a4cecb4801ec 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -41,7 +41,8 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *, unsigned long addr, pmd_t *, enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_FLAG, TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, - TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, + TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, + TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG, TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG, @@ -71,12 +72,6 @@ extern bool is_vma_temporary_stack(struct vm_area_struct *vma); ((__vma)->vm_flags & VM_HUGEPAGE))) && \ !((__vma)->vm_flags & VM_NOHUGEPAGE) && \ !is_vma_temporary_stack(__vma)) -#define transparent_hugepage_defrag(__vma) \ - ((transparent_hugepage_flags & \ - (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)) || \ - (transparent_hugepage_flags & \ - (1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG) && \ - (__vma)->vm_flags & VM_HUGEPAGE)) #define transparent_hugepage_use_zero_page() \ (transparent_hugepage_flags & \ (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG)) diff --git a/mm/huge_memory.c b/mm/huge_memory.c --- a/mm/huge_memory.c +++ b/mm/huge_memory.c -static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) +static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma) +{ + gfp_t reclaim_flags = 0; + + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags) && + (vma->vm_flags & VM_HUGEPAGE)) + reclaim_flags = __GFP_DIRECT_RECLAIM; + else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) + reclaim_flags = __GFP_KSWAPD_RECLAIM; + else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) + reclaim_flags = __GFP_DIRECT_RECLAIM; + + return GFP_TRANSHUGE | reclaim_flags; +} + +/* Defrag for khugepaged will enter direct reclaim/compaction if necessary */ +static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void) { - return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_RECLAIM)) | extra_gfp; + return GFP_TRANSHUGE | (khugepaged_defrag() ? __GFP_DIRECT_RECLAIM : 0); } /* Caller must hold page table lock. 
*/ @@ -919,7 +950,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, } return ret; } - gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); + gfp = alloc_hugepage_direct_gfpmask(vma); page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); if (unlikely(!page)) { count_vm_event(THP_FAULT_FALLBACK); @@ -1279,7 +1310,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, alloc: if (transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) { - huge_gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); + huge_gfp = alloc_hugepage_direct_gfpmask(vma); new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER); } else new_page = NULL; @@ -2249,11 +2280,12 @@ static int khugepaged_find_target_node(void) return 0; } -static inline struct page *alloc_hugepage(int defrag) +static inline struct page *alloc_khugepaged_hugepage(void) { struct page *page; - page = alloc_pages(alloc_hugepage_gfpmask(defrag, 0), HPAGE_PMD_ORDER); + page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(), + HPAGE_PMD_ORDER); if (page) prep_transhuge_page(page); return page; @@ -2264,7 +2296,7 @@ static struct page *khugepaged_alloc_hugepage(bool *wait) struct page *hpage; do { - hpage = alloc_hugepage(khugepaged_defrag()); + hpage = alloc_khugepaged_hugepage(); if (!hpage) { count_vm_event(THP_COLLAPSE_ALLOC_FAILED); if (!*wait) @@ -2335,8 +2367,7 @@ static void collapse_huge_page(struct mm_struct *mm, VM_BUG_ON(address & ~HPAGE_PMD_MASK); /* Only allocate from the target node */ - gfp = alloc_hugepage_gfpmask(khugepaged_defrag(), __GFP_OTHER_NODE) | - __GFP_THISNODE; + gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_OTHER_NODE | __GFP_THISNODE; /* release the mmap_sem read lock. */ new_page = khugepaged_alloc_page(hpage, gfp, mm, address, node); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d156310aedeb..096a00d98a45 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3119,14 +3119,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) gfp_mask &= ~__GFP_ATOMIC; - /* - * If this allocation cannot block and it is for a specific node, then - * fail early. There's no need to wakeup kswapd or retry for a - * speculative node-specific allocation. - */ - if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !can_direct_reclaim) - goto nopage; - retry: if (gfp_mask & __GFP_KSWAPD_RECLAIM) wake_all_kswapds(order, ac); diff --git a/mm/slab.c b/mm/slab.c index 56dd0df2a8ce..e1f6c27c3db5 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -670,7 +670,7 @@ static inline void *____cache_alloc_node(struct kmem_cache *cachep, static inline gfp_t gfp_exact_node(gfp_t flags) { - return flags; + return flags & ~__GFP_NOFAIL; } #else /* CONFIG_NUMA */ @@ -841,12 +841,12 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) } /* - * Construct gfp mask to allocate from a specific node but do not direct reclaim - * or warn about failures. kswapd may still wake to reclaim in the background. + * Construct gfp mask to allocate from a specific node but do not reclaim or + * warn about failures. 
*/ static inline gfp_t gfp_exact_node(gfp_t flags) { - return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~__GFP_DIRECT_RECLAIM; + return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~(__GFP_RECLAIM|__GFP_NOFAIL); } #endif diff --git a/mm/slub.c b/mm/slub.c index 2f2f04d39104..64ed5f3a3046 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1426,7 +1426,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) */ alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min)) - alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_DIRECT_RECLAIM; + alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~(__GFP_RECLAIM|__GFP_NOFAIL); page = alloc_slab_page(s, alloc_gfp, node, oo); if (unlikely(!page)) { -- cgit v1.2.3 From b6e6edcfa40561e9c8abe5eecf1c96f8e5fd9c6f Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 17 Mar 2016 14:20:28 -0700 Subject: mm: memcontrol: reclaim and OOM kill when shrinking memory.max below usage Setting the original memory.limit_in_bytes hardlimit is subject to a race condition when the desired value is below the current usage. The code tries a few times to first reclaim and then see if the usage has dropped to where we would like it to be, but there is no locking, and the workload is free to continue making new charges up to the old limit. Thus, attempting to shrink a workload relies on pure luck and hope that the workload happens to cooperate. To fix this in the cgroup2 memory.max knob, do it the other way round: set the limit first, then try enforcement. And if reclaim is not able to succeed, trigger OOM kills in the group. Keep going until the new limit is met, we run out of OOM victims and there's only unreclaimable memory left, or the task writing to memory.max is killed. This allows users to shrink groups reliably, and the behavior is consistent with what happens when new charges are attempted in excess of memory.max. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Vladimir Davydov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroup-v2.txt | 6 ++++++ mm/memcontrol.c | 38 ++++++++++++++++++++++++++++++++++---- 2 files changed, 40 insertions(+), 4 deletions(-) (limited to 'Documentation') diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt index e2f4e7948a66..8f1329a5f700 100644 --- a/Documentation/cgroup-v2.txt +++ b/Documentation/cgroup-v2.txt @@ -1387,6 +1387,12 @@ system than killing the group. Otherwise, memory.max is there to limit this type of spillover and ultimately contain buggy or even malicious applications. +Setting the original memory.limit_in_bytes below the current usage was +subject to a race condition, where concurrent charges could cause the +limit setting to fail. memory.max on the other hand will first set the +limit to prevent new charges, and then reclaim and OOM kill until the +new limit is met - or the task writing to memory.max is killed. + The combined memory+swap accounting and limiting is replaced by real control over swap space. 
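To illustrate the new semantics from userspace (a hedged sketch, not part of the patch; the /sys/fs/cgroup mount point and the group name are assumptions): a management daemon shrinking a group should expect the write to block while reclaim and OOM kills proceed, and to fail with EINTR if the writer itself is signalled or killed.

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Shrink a cgroup2 group's memory.max; returns 0 or a negative errno. */
static int set_memory_max(const char *group, unsigned long long bytes)
{
	char path[256], val[32];
	int fd, ret = 0;

	/* /sys/fs/cgroup is the conventional cgroup2 mount point */
	snprintf(path, sizeof(path), "/sys/fs/cgroup/%s/memory.max", group);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -errno;

	snprintf(val, sizeof(val), "%llu", bytes);
	/*
	 * May block while the kernel reclaims and OOM kills; fails with
	 * -EINTR if this task is signalled before usage drops below the
	 * new limit.
	 */
	if (write(fd, val, strlen(val)) < 0)
		ret = -errno;
	close(fd);
	return ret;
}
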
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f7c9b4cbdf01..8614e0d750e5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1236,7 +1236,7 @@ static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) return limit; } -static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, +static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, int order) { struct oom_control oc = { @@ -1314,6 +1314,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, } unlock: mutex_unlock(&oom_lock); + return chosen; } #if MAX_NUMNODES > 1 @@ -5029,6 +5030,8 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES; + bool drained = false; unsigned long max; int err; @@ -5037,9 +5040,36 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, if (err) return err; - err = mem_cgroup_resize_limit(memcg, max); - if (err) - return err; + xchg(&memcg->memory.limit, max); + + for (;;) { + unsigned long nr_pages = page_counter_read(&memcg->memory); + + if (nr_pages <= max) + break; + + if (signal_pending(current)) { + err = -EINTR; + break; + } + + if (!drained) { + drain_all_stock(memcg); + drained = true; + continue; + } + + if (nr_reclaims) { + if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max, + GFP_KERNEL, true)) + nr_reclaims--; + continue; + } + + mem_cgroup_events(memcg, MEMCG_OOM, 1); + if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0)) + break; + } memcg_wb_domain_size_changed(memcg); return nbytes; -- cgit v1.2.3 From 5de23d435e88996b1efe0e2cebe242074ce67c9e Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 17 Mar 2016 14:20:54 -0700 Subject: proc: add /proc/<pid>/timerslack_ns interface This patch provides a proc/PID/timerslack_ns interface which exposes a task's timerslack value in nanoseconds and allows it to be changed. This allows power/performance management software to set timer slack for other threads according to its policy for the thread (such as when the thread is designated foreground vs. background activity). If the value written is non-zero, slack is set to that value. Otherwise, it is set to the default for the thread. This interface checks that the calling task has permission to use PTRACE_MODE_ATTACH_FSCREDS on the target task, so that we can ensure arbitrary apps do not change the timer slack for other apps. 
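For illustration (a hedged userspace sketch, not part of the patch; the 50us value is an arbitrary choice): given PTRACE_MODE_ATTACH_FSCREDS permission on the target, setting or resetting a task's slack through the new file could look like this.

#include <stdio.h>
#include <sys/types.h>

/* Write a slack value in nanoseconds; 0 restores the task's default. */
static int set_timerslack(pid_t pid, unsigned long long slack_ns)
{
	char path[64];
	FILE *f;

	snprintf(path, sizeof(path), "/proc/%d/timerslack_ns", (int)pid);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fprintf(f, "%llu", slack_ns);	/* parsed with kstrtoull, base 10 */
	return fclose(f) ? -1 : 0;
}

/* e.g. set_timerslack(pid, 50000ULL) lets the task's timers be deferred by up to 50us */
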
Signed-off-by: John Stultz Acked-by: Kees Cook Cc: Arjan van de Ven Cc: Thomas Gleixner Cc: Oren Laadan Cc: Ruchi Kandoi Cc: Rom Lemarchand Cc: Android Kernel Team Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 18 ++++++++++ fs/proc/base.c | 67 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+) (limited to 'Documentation') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 843b045b4069..7f5607a089b4 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -43,6 +43,7 @@ Table of Contents 3.7 /proc/<pid>/task/<tid>/children - Information about task children 3.8 /proc/<pid>/fdinfo/<fd> - Information about opened file 3.9 /proc/<pid>/map_files - Information about memory mapped files + 3.10 /proc/<pid>/timerslack_ns - Task timerslack value 4 Configuring procfs 4.1 Mount options @@ -1862,6 +1863,23 @@ time one can open(2) mappings from the listings of two processes and comparing their inode numbers to figure out which anonymous memory areas are actually shared. +3.10 /proc/<pid>/timerslack_ns - Task timerslack value +--------------------------------------------------------- +This file provides the value of the task's timerslack in nanoseconds. +This value specifies an amount of time that normal timers may be deferred +in order to coalesce timers and avoid unnecessary wakeups. + +This allows a task's interactivity vs power consumption trade-off to be +adjusted. + +Writing 0 to the file will set the task's timerslack to the default value. + +Valid values are from 0 to ULLONG_MAX. + +An application setting the value must have PTRACE_MODE_ATTACH_FSCREDS level +permissions on the task specified to change its timerslack_ns value. + + ------------------------------------------------------------------------------ Configuring procfs ------------------------------------------------------------------------------ diff --git a/fs/proc/base.c b/fs/proc/base.c index 4f764c2ac1a5..35f583ad7dbe 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2257,6 +2257,72 @@ static const struct file_operations proc_timers_operations = { .release = seq_release_private, }; +static ssize_t timerslack_ns_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + struct inode *inode = file_inode(file); + struct task_struct *p; + u64 slack_ns; + int err; + + err = kstrtoull_from_user(buf, count, 10, &slack_ns); + if (err < 0) + return err; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + if (ptrace_may_access(p, PTRACE_MODE_ATTACH_FSCREDS)) { + task_lock(p); + if (slack_ns == 0) + p->timer_slack_ns = p->default_timer_slack_ns; + else + p->timer_slack_ns = slack_ns; + task_unlock(p); + } else + count = -EPERM; + + put_task_struct(p); + + return count; +} + +static int timerslack_ns_show(struct seq_file *m, void *v) +{ + struct inode *inode = m->private; + struct task_struct *p; + int err = 0; + + p = get_proc_task(inode); + if (!p) + return -ESRCH; + + if (ptrace_may_access(p, PTRACE_MODE_ATTACH_FSCREDS)) { + task_lock(p); + seq_printf(m, "%llu\n", p->timer_slack_ns); + task_unlock(p); + } else + err = -EPERM; + + put_task_struct(p); + + return err; +} + +static int timerslack_ns_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, timerslack_ns_show, inode); +} + +static const struct file_operations proc_pid_set_timerslack_ns_operations = { + .open = timerslack_ns_open, + .read = seq_read, + .write = timerslack_ns_write, + .llseek = seq_lseek, 
.release = single_release, +}; + static int proc_pident_instantiate(struct inode *dir, struct dentry *dentry, struct task_struct *task, const void *ptr) { @@ -2831,6 +2897,7 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_CHECKPOINT_RESTORE REG("timers", S_IRUGO, proc_timers_operations), #endif + REG("timerslack_ns", S_IRUGO|S_IWUGO, proc_pid_set_timerslack_ns_operations), }; static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx) -- cgit v1.2.3 From 93e205a728e6cb8d7d11f6836e289798a1de25e2 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 17 Mar 2016 14:21:15 -0700 Subject: fix Christoph's email addresses There are various email addresses for me throughout the kernel. Use the one that will always be valid. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroup-v1/cgroups.txt | 2 +- Documentation/cgroup-v1/cpusets.txt | 2 +- MAINTAINERS | 2 +- arch/ia64/include/asm/rwsem.h | 2 +- include/linux/quicklist.h | 2 +- mm/mmu_notifier.c | 2 +- mm/quicklist.c | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) (limited to 'Documentation') diff --git a/Documentation/cgroup-v1/cgroups.txt b/Documentation/cgroup-v1/cgroups.txt index c6256ae9885b..947e6fe31ef9 100644 --- a/Documentation/cgroup-v1/cgroups.txt +++ b/Documentation/cgroup-v1/cgroups.txt @@ -8,7 +8,7 @@ Original copyright statements from cpusets.txt: Portions Copyright (C) 2004 BULL SA. Portions Copyright (c) 2004-2006 Silicon Graphics, Inc. Modified by Paul Jackson -Modified by Christoph Lameter +Modified by Christoph Lameter CONTENTS: ========= diff --git a/Documentation/cgroup-v1/cpusets.txt b/Documentation/cgroup-v1/cpusets.txt index fdf7dff3f607..e5cdcd445615 100644 --- a/Documentation/cgroup-v1/cpusets.txt +++ b/Documentation/cgroup-v1/cpusets.txt @@ -6,7 +6,7 @@ Written by Simon.Derr@bull.net Portions Copyright (c) 2004-2006 Silicon Graphics, Inc. Modified by Paul Jackson -Modified by Christoph Lameter +Modified by Christoph Lameter Modified by Paul Menage Modified by Hidetoshi Seto diff --git a/MAINTAINERS b/MAINTAINERS index 99bd725affc6..67d34bb7335c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8473,7 +8473,7 @@ F: include/crypto/pcrypt.h PER-CPU MEMORY ALLOCATOR M: Tejun Heo -M: Christoph Lameter +M: Christoph Lameter T: git git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu.git S: Maintained F: include/linux/percpu*.h diff --git a/arch/ia64/include/asm/rwsem.h b/arch/ia64/include/asm/rwsem.h index 3027e7516d85..ce112472bdd6 100644 --- a/arch/ia64/include/asm/rwsem.h +++ b/arch/ia64/include/asm/rwsem.h @@ -3,7 +3,7 @@ * * Copyright (C) 2003 Ken Chen * Copyright (C) 2003 Asit Mallick - * Copyright (C) 2005 Christoph Lameter + * Copyright (C) 2005 Christoph Lameter * * Based on asm-i386/rwsem.h and other architecture implementation. * diff --git a/include/linux/quicklist.h b/include/linux/quicklist.h index bd466439c588..3bdfa70bc642 100644 --- a/include/linux/quicklist.h +++ b/include/linux/quicklist.h @@ -5,7 +5,7 @@ * as needed after allocation when they are freed. Per cpu lists of pages * are kept that only contain node local pages. * - * (C) 2007, SGI. Christoph Lameter + * (C) 2007, SGI. Christoph Lameter */ #include #include diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 5fbdd367bbed..f4259e496f83 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -3,7 +3,7 @@ * * Copyright (C) 2008 Qumranet, Inc. 
* Copyright (C) 2008 SGI - * Christoph Lameter + * Christoph Lameter * * This work is licensed under the terms of the GNU GPL, version 2. See * the COPYING file in the top-level directory. diff --git a/mm/quicklist.c b/mm/quicklist.c index 942212970529..daf6ff6e199a 100644 --- a/mm/quicklist.c +++ b/mm/quicklist.c @@ -8,7 +8,7 @@ * improved on it. * * Copyright (C) 2007 SGI, - * Christoph Lameter + * Christoph Lameter * Generalized, added support for multiple lists and * constructors / destructors. */ -- cgit v1.2.3