From a725356b6659469d182d662f22d770d83d3bc7b5 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Tue, 18 Sep 2018 16:34:34 +0300
Subject: vfs: swap names of {do,vfs}_clone_file_range()

Commit 031a072a0b8a ("vfs: call vfs_clone_file_range() under freeze
protection") created a wrapper do_clone_file_range() around
vfs_clone_file_range() moving the freeze protection to former, so
overlayfs could call the latter.

The more common vfs practice is to call do_xxx helpers from vfs_xxx
helpers, where freeze protecction is taken in the vfs_xxx helper, so
this anomality could be a source of confusion.

It seems that commit 8ede205541ff ("ovl: add reflink/copyfile/dedup
support") may have fallen a victim to this confusion -
ovl_clone_file_range() calls the vfs_clone_file_range() helper in the
hope of getting freeze protection on upper fs, but in fact results in
overlayfs allowing to bypass upper fs freeze protection.

Swap the names of the two helpers to conform to common vfs practice
and call the correct helpers from overlayfs and nfsd.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
---
 include/linux/fs.h | 17 +++--------------
 1 file changed, 3 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 6c0b4a1c22ff..897eae8faee1 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1828,8 +1828,10 @@ extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
 extern int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in,
 				      struct inode *inode_out, loff_t pos_out,
 				      u64 *len, bool is_dedupe);
+extern int do_clone_file_range(struct file *file_in, loff_t pos_in,
+			       struct file *file_out, loff_t pos_out, u64 len);
 extern int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
-		struct file *file_out, loff_t pos_out, u64 len);
+				struct file *file_out, loff_t pos_out, u64 len);
 extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
 					 struct inode *dest, loff_t destoff,
 					 loff_t len, bool *is_same);
@@ -2773,19 +2775,6 @@ static inline void file_end_write(struct file *file)
 	__sb_end_write(file_inode(file)->i_sb, SB_FREEZE_WRITE);
 }
 
-static inline int do_clone_file_range(struct file *file_in, loff_t pos_in,
-				      struct file *file_out, loff_t pos_out,
-				      u64 len)
-{
-	int ret;
-
-	file_start_write(file_out);
-	ret = vfs_clone_file_range(file_in, pos_in, file_out, pos_out, len);
-	file_end_write(file_out);
-
-	return ret;
-}
-
 /*
  * get_write_access() gets write permission for a file.
  * put_write_access() releases this write permission.
-- 
cgit v1.2.3


From efaffc5e40aeced0bcb497ed7a0a5b8c14abfcdf Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Mon, 1 Oct 2018 11:05:24 +0100
Subject: mm, sched/numa: Remove rate-limiting of automatic NUMA balancing
 migration

Rate limiting of page migrations due to automatic NUMA balancing was
introduced to mitigate the worst-case scenario of migrating at high
frequency due to false sharing or slowly ping-ponging between nodes.
Since then, a lot of effort was spent on correctly identifying these
pages and avoiding unnecessary migrations and the safety net may no longer
be required.

Jirka Hladky reported a regression in 4.17 due to a scheduler patch that
avoids spreading STREAM tasks wide prematurely. However, once the task
was properly placed, it delayed migrating the memory due to rate limiting.
Increasing the limit fixed the problem for him.

Currently, the limit is hard-coded and does not account for the real
capabilities of the hardware. Even if an estimate was attempted, it would
not properly account for the number of memory controllers and it could
not account for the amount of bandwidth used for normal accesses. Rather
than fudging, this patch simply eliminates the rate limiting.

However, Jirka reports that a STREAM configuration using multiple
processes achieved similar performance to 4.16. In local tests, this patch
improved performance of STREAM relative to the baseline but it is somewhat
machine-dependent. Most workloads show little or not performance difference
implying that there is not a heavily reliance on the throttling mechanism
and it is safe to remove.

STREAM on 2-socket machine
                         4.19.0-rc5             4.19.0-rc5
                         numab-v1r1       noratelimit-v1r1
MB/sec copy     43298.52 (   0.00%)    44673.38 (   3.18%)
MB/sec scale    30115.06 (   0.00%)    31293.06 (   3.91%)
MB/sec add      32825.12 (   0.00%)    34883.62 (   6.27%)
MB/sec triad    32549.52 (   0.00%)    34906.60 (   7.24%

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Reviewed-by: Rik van Riel <riel@surriel.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Jirka Hladky <jhladky@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Linux-MM <linux-mm@kvack.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20181001100525.29789-2-mgorman@techsingularity.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/mmzone.h         |  6 ----
 include/trace/events/migrate.h | 27 ------------------
 mm/migrate.c                   | 65 ------------------------------------------
 mm/page_alloc.c                |  2 --
 4 files changed, 100 deletions(-)

(limited to 'include')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 1e22d96734e0..3f4c0b167333 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -671,12 +671,6 @@ typedef struct pglist_data {
 #ifdef CONFIG_NUMA_BALANCING
 	/* Lock serializing the migrate rate limiting window */
 	spinlock_t numabalancing_migrate_lock;
-
-	/* Rate limiting time interval */
-	unsigned long numabalancing_migrate_next_window;
-
-	/* Number of pages migrated during the rate limiting time interval */
-	unsigned long numabalancing_migrate_nr_pages;
 #endif
 	/*
 	 * This is a per-node reserve of pages that are not available
diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h
index 711372845945..705b33d1e395 100644
--- a/include/trace/events/migrate.h
+++ b/include/trace/events/migrate.h
@@ -70,33 +70,6 @@ TRACE_EVENT(mm_migrate_pages,
 		__print_symbolic(__entry->mode, MIGRATE_MODE),
 		__print_symbolic(__entry->reason, MIGRATE_REASON))
 );
-
-TRACE_EVENT(mm_numa_migrate_ratelimit,
-
-	TP_PROTO(struct task_struct *p, int dst_nid, unsigned long nr_pages),
-
-	TP_ARGS(p, dst_nid, nr_pages),
-
-	TP_STRUCT__entry(
-		__array(	char,		comm,	TASK_COMM_LEN)
-		__field(	pid_t,		pid)
-		__field(	int,		dst_nid)
-		__field(	unsigned long,	nr_pages)
-	),
-
-	TP_fast_assign(
-		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
-		__entry->pid		= p->pid;
-		__entry->dst_nid	= dst_nid;
-		__entry->nr_pages	= nr_pages;
-	),
-
-	TP_printk("comm=%s pid=%d dst_nid=%d nr_pages=%lu",
-		__entry->comm,
-		__entry->pid,
-		__entry->dst_nid,
-		__entry->nr_pages)
-);
 #endif /* _TRACE_MIGRATE_H */
 
 /* This part must be outside protection */
diff --git a/mm/migrate.c b/mm/migrate.c
index 4f1d894835b5..5e285c1249a0 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1855,54 +1855,6 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
 	return newpage;
 }
 
-/*
- * page migration rate limiting control.
- * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs
- * window of time. Default here says do not migrate more than 1280M per second.
- */
-static unsigned int migrate_interval_millisecs __read_mostly = 100;
-static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT);
-
-/* Returns true if the node is migrate rate-limited after the update */
-static bool numamigrate_update_ratelimit(pg_data_t *pgdat,
-					unsigned long nr_pages)
-{
-	unsigned long next_window, interval;
-
-	next_window = READ_ONCE(pgdat->numabalancing_migrate_next_window);
-	interval = msecs_to_jiffies(migrate_interval_millisecs);
-
-	/*
-	 * Rate-limit the amount of data that is being migrated to a node.
-	 * Optimal placement is no good if the memory bus is saturated and
-	 * all the time is being spent migrating!
-	 */
-	if (time_after(jiffies, next_window) &&
-			spin_trylock(&pgdat->numabalancing_migrate_lock)) {
-		pgdat->numabalancing_migrate_nr_pages = 0;
-		do {
-			next_window += interval;
-		} while (unlikely(time_after(jiffies, next_window)));
-
-		WRITE_ONCE(pgdat->numabalancing_migrate_next_window, next_window);
-		spin_unlock(&pgdat->numabalancing_migrate_lock);
-	}
-	if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) {
-		trace_mm_numa_migrate_ratelimit(current, pgdat->node_id,
-								nr_pages);
-		return true;
-	}
-
-	/*
-	 * This is an unlocked non-atomic update so errors are possible.
-	 * The consequences are failing to migrate when we potentiall should
-	 * have which is not severe enough to warrant locking. If it is ever
-	 * a problem, it can be converted to a per-cpu counter.
-	 */
-	pgdat->numabalancing_migrate_nr_pages += nr_pages;
-	return false;
-}
-
 static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
 {
 	int page_lru;
@@ -1975,14 +1927,6 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
 	if (page_is_file_cache(page) && PageDirty(page))
 		goto out;
 
-	/*
-	 * Rate-limit the amount of data that is being migrated to a node.
-	 * Optimal placement is no good if the memory bus is saturated and
-	 * all the time is being spent migrating!
-	 */
-	if (numamigrate_update_ratelimit(pgdat, 1))
-		goto out;
-
 	isolated = numamigrate_isolate_page(pgdat, page);
 	if (!isolated)
 		goto out;
@@ -2029,14 +1973,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 	unsigned long mmun_start = address & HPAGE_PMD_MASK;
 	unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE;
 
-	/*
-	 * Rate-limit the amount of data that is being migrated to a node.
-	 * Optimal placement is no good if the memory bus is saturated and
-	 * all the time is being spent migrating!
-	 */
-	if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR))
-		goto out_dropref;
-
 	new_page = alloc_pages_node(node,
 		(GFP_TRANSHUGE_LIGHT | __GFP_THISNODE),
 		HPAGE_PMD_ORDER);
@@ -2133,7 +2069,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 
 out_fail:
 	count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
-out_dropref:
 	ptl = pmd_lock(mm, pmd);
 	if (pmd_same(*pmd, entry)) {
 		entry = pmd_modify(entry, vma->vm_page_prot);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 89d2a2ab3fe6..706a738c0aee 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6197,8 +6197,6 @@ static unsigned long __init calc_memmap_size(unsigned long spanned_pages,
 static void pgdat_init_numabalancing(struct pglist_data *pgdat)
 {
 	spin_lock_init(&pgdat->numabalancing_migrate_lock);
-	pgdat->numabalancing_migrate_nr_pages = 0;
-	pgdat->numabalancing_migrate_next_window = jiffies;
 }
 #else
 static void pgdat_init_numabalancing(struct pglist_data *pgdat) {}
-- 
cgit v1.2.3


From 4d4c2d89913e2d891bd6a34b12050a2576e60525 Mon Sep 17 00:00:00 2001
From: Noralf Trønnes <noralf@tronnes.org>
Date: Mon, 1 Oct 2018 21:45:36 +0200
Subject: drm/cma-helper: Fix crash in fbdev error path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Sergey Suloev reported a crash happening in drm_client_dev_hotplug()
when fbdev had failed to register.

[    9.124598] vc4_hdmi 3f902000.hdmi: ASoC: Failed to create component debugfs directory
[    9.147667] vc4_hdmi 3f902000.hdmi: vc4-hdmi-hifi <-> 3f902000.hdmi mapping ok
[    9.155184] vc4_hdmi 3f902000.hdmi: ASoC: no DMI vendor name!
[    9.166544] vc4-drm soc:gpu: bound 3f902000.hdmi (ops vc4_hdmi_ops [vc4])
[    9.173840] vc4-drm soc:gpu: bound 3f806000.vec (ops vc4_vec_ops [vc4])
[    9.181029] vc4-drm soc:gpu: bound 3f004000.txp (ops vc4_txp_ops [vc4])
[    9.188519] vc4-drm soc:gpu: bound 3f400000.hvs (ops vc4_hvs_ops [vc4])
[    9.195690] vc4-drm soc:gpu: bound 3f206000.pixelvalve (ops vc4_crtc_ops [vc4])
[    9.203523] vc4-drm soc:gpu: bound 3f207000.pixelvalve (ops vc4_crtc_ops [vc4])
[    9.215032] vc4-drm soc:gpu: bound 3f807000.pixelvalve (ops vc4_crtc_ops [vc4])
[    9.274785] vc4-drm soc:gpu: bound 3fc00000.v3d (ops vc4_v3d_ops [vc4])
[    9.290246] [drm] Initialized vc4 0.0.0 20140616 for soc:gpu on minor 0
[    9.297464] [drm] Supports vblank timestamp caching Rev 2 (21.10.2013).
[    9.304600] [drm] Driver supports precise vblank timestamp query.
[    9.382856] vc4-drm soc:gpu: [drm:drm_fb_helper_fbdev_setup [drm_kms_helper]] *ERROR* Failed to set fbdev configuration
[   10.404937] Unable to handle kernel paging request at virtual address 00330a656369768a
[   10.441620] [00330a656369768a] address between user and kernel address ranges
[   10.449087] Internal error: Oops: 96000004 [#1] PREEMPT SMP
[   10.454762] Modules linked in: brcmfmac vc4 drm_kms_helper cfg80211 drm rfkill smsc95xx brcmutil usbnet drm_panel_orientation_quirks raspberrypi_hwmon bcm2835_dma crc32_ce pwm_bcm2835 bcm2835_rng virt_dma rng_core i2c_bcm2835 ip_tables x_tables ipv6
[   10.477296] CPU: 2 PID: 45 Comm: kworker/2:1 Not tainted 4.19.0-rc5 #3
[   10.483934] Hardware name: Raspberry Pi 3 Model B Rev 1.2 (DT)
[   10.489966] Workqueue: events output_poll_execute [drm_kms_helper]
[   10.596515] Process kworker/2:1 (pid: 45, stack limit = 0x000000007e8924dc)
[   10.603590] Call trace:
[   10.606259]  drm_client_dev_hotplug+0x5c/0xb0 [drm]
[   10.611303]  drm_kms_helper_hotplug_event+0x30/0x40 [drm_kms_helper]
[   10.617849]  output_poll_execute+0xc4/0x1e0 [drm_kms_helper]
[   10.623616]  process_one_work+0x1c8/0x318
[   10.627695]  worker_thread+0x48/0x428
[   10.631420]  kthread+0xf8/0x128
[   10.634615]  ret_from_fork+0x10/0x18
[   10.638255] Code: 54000220 f9401261 aa1303e0 b4000141 (f9400c21)
[   10.644456] ---[ end trace c75b4a4b0e141908 ]---

The reason for this is that drm_fbdev_cma_init() removes the drm_client
when fbdev registration fails, but it doesn't remove the client from the
drm_device client list. So the client list now has a pointer that points
into the unknown and we have a 'use after free' situation.

Split drm_client_new() into drm_client_init() and drm_client_add() to fix
removal in the error path.

Fixes: 894a677f4b3e ("drm/cma-helper: Use the generic fbdev emulation")
Reported-by: Sergey Suloev <ssuloev@orpaltech.com>
Cc: Stefan Wahren <stefan.wahren@i2se.com>
Cc: Eric Anholt <eric@anholt.net>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Signed-off-by: Noralf Trønnes <noralf@tronnes.org>
Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/20181001194536.57756-1-noralf@tronnes.org
---
 drivers/gpu/drm/drm_client.c        | 35 ++++++++++++++++++++++++++---------
 drivers/gpu/drm/drm_fb_cma_helper.c |  4 +++-
 drivers/gpu/drm/drm_fb_helper.c     |  4 +++-
 include/drm/drm_client.h            |  5 +++--
 4 files changed, 35 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_client.c b/drivers/gpu/drm/drm_client.c
index baff50a4c234..df31c3815092 100644
--- a/drivers/gpu/drm/drm_client.c
+++ b/drivers/gpu/drm/drm_client.c
@@ -63,20 +63,21 @@ static void drm_client_close(struct drm_client_dev *client)
 EXPORT_SYMBOL(drm_client_close);
 
 /**
- * drm_client_new - Create a DRM client
+ * drm_client_init - Initialise a DRM client
  * @dev: DRM device
  * @client: DRM client
  * @name: Client name
  * @funcs: DRM client functions (optional)
  *
+ * This initialises the client and opens a &drm_file. Use drm_client_add() to complete the process.
  * The caller needs to hold a reference on @dev before calling this function.
  * The client is freed when the &drm_device is unregistered. See drm_client_release().
  *
  * Returns:
  * Zero on success or negative error code on failure.
  */
-int drm_client_new(struct drm_device *dev, struct drm_client_dev *client,
-		   const char *name, const struct drm_client_funcs *funcs)
+int drm_client_init(struct drm_device *dev, struct drm_client_dev *client,
+		    const char *name, const struct drm_client_funcs *funcs)
 {
 	int ret;
 
@@ -95,10 +96,6 @@ int drm_client_new(struct drm_device *dev, struct drm_client_dev *client,
 	if (ret)
 		goto err_put_module;
 
-	mutex_lock(&dev->clientlist_mutex);
-	list_add(&client->list, &dev->clientlist);
-	mutex_unlock(&dev->clientlist_mutex);
-
 	drm_dev_get(dev);
 
 	return 0;
@@ -109,13 +106,33 @@ err_put_module:
 
 	return ret;
 }
-EXPORT_SYMBOL(drm_client_new);
+EXPORT_SYMBOL(drm_client_init);
+
+/**
+ * drm_client_add - Add client to the device list
+ * @client: DRM client
+ *
+ * Add the client to the &drm_device client list to activate its callbacks.
+ * @client must be initialized by a call to drm_client_init(). After
+ * drm_client_add() it is no longer permissible to call drm_client_release()
+ * directly (outside the unregister callback), instead cleanup will happen
+ * automatically on driver unload.
+ */
+void drm_client_add(struct drm_client_dev *client)
+{
+	struct drm_device *dev = client->dev;
+
+	mutex_lock(&dev->clientlist_mutex);
+	list_add(&client->list, &dev->clientlist);
+	mutex_unlock(&dev->clientlist_mutex);
+}
+EXPORT_SYMBOL(drm_client_add);
 
 /**
  * drm_client_release - Release DRM client resources
  * @client: DRM client
  *
- * Releases resources by closing the &drm_file that was opened by drm_client_new().
+ * Releases resources by closing the &drm_file that was opened by drm_client_init().
  * It is called automatically if the &drm_client_funcs.unregister callback is _not_ set.
  *
  * This function should only be called from the unregister callback. An exception
diff --git a/drivers/gpu/drm/drm_fb_cma_helper.c b/drivers/gpu/drm/drm_fb_cma_helper.c
index 9da36a6271d3..9ac1f2e0f064 100644
--- a/drivers/gpu/drm/drm_fb_cma_helper.c
+++ b/drivers/gpu/drm/drm_fb_cma_helper.c
@@ -160,7 +160,7 @@ struct drm_fbdev_cma *drm_fbdev_cma_init(struct drm_device *dev,
 
 	fb_helper = &fbdev_cma->fb_helper;
 
-	ret = drm_client_new(dev, &fb_helper->client, "fbdev", NULL);
+	ret = drm_client_init(dev, &fb_helper->client, "fbdev", NULL);
 	if (ret)
 		goto err_free;
 
@@ -169,6 +169,8 @@ struct drm_fbdev_cma *drm_fbdev_cma_init(struct drm_device *dev,
 	if (ret)
 		goto err_client_put;
 
+	drm_client_add(&fb_helper->client);
+
 	return fbdev_cma;
 
 err_client_put:
diff --git a/drivers/gpu/drm/drm_fb_helper.c b/drivers/gpu/drm/drm_fb_helper.c
index 16ec93b75dbf..515a7aec57ac 100644
--- a/drivers/gpu/drm/drm_fb_helper.c
+++ b/drivers/gpu/drm/drm_fb_helper.c
@@ -3218,12 +3218,14 @@ int drm_fbdev_generic_setup(struct drm_device *dev, unsigned int preferred_bpp)
 	if (!fb_helper)
 		return -ENOMEM;
 
-	ret = drm_client_new(dev, &fb_helper->client, "fbdev", &drm_fbdev_client_funcs);
+	ret = drm_client_init(dev, &fb_helper->client, "fbdev", &drm_fbdev_client_funcs);
 	if (ret) {
 		kfree(fb_helper);
 		return ret;
 	}
 
+	drm_client_add(&fb_helper->client);
+
 	fb_helper->preferred_bpp = preferred_bpp;
 
 	drm_fbdev_client_hotplug(&fb_helper->client);
diff --git a/include/drm/drm_client.h b/include/drm/drm_client.h
index 989f8e52864d..971bb7853776 100644
--- a/include/drm/drm_client.h
+++ b/include/drm/drm_client.h
@@ -87,9 +87,10 @@ struct drm_client_dev {
 	struct drm_file *file;
 };
 
-int drm_client_new(struct drm_device *dev, struct drm_client_dev *client,
-		   const char *name, const struct drm_client_funcs *funcs);
+int drm_client_init(struct drm_device *dev, struct drm_client_dev *client,
+		    const char *name, const struct drm_client_funcs *funcs);
 void drm_client_release(struct drm_client_dev *client);
+void drm_client_add(struct drm_client_dev *client);
 
 void drm_client_dev_unregister(struct drm_device *dev);
 void drm_client_dev_hotplug(struct drm_device *dev);
-- 
cgit v1.2.3


From 9d2f67e43b73e8af7438be219b66a5de0cfa8bd9 Mon Sep 17 00:00:00 2001
From: Jianfeng Tan <jianfeng.tan@linux.alibaba.com>
Date: Sat, 29 Sep 2018 15:41:27 +0000
Subject: net/packet: fix packet drop as of virtio gso

When we use raw socket as the vhost backend, a packet from virito with
gso offloading information, cannot be sent out in later validaton at
xmit path, as we did not set correct skb->protocol which is further used
for looking up the gso function.

To fix this, we set this field according to virito hdr information.

Fixes: e858fae2b0b8f4 ("virtio_net: use common code for virtio_net_hdr and skb GSO conversion")
Signed-off-by: Jianfeng Tan <jianfeng.tan@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/virtio_net.h | 18 ++++++++++++++++++
 net/packet/af_packet.c     | 11 +++++++----
 2 files changed, 25 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
index 9397628a1967..cb462f9ab7dd 100644
--- a/include/linux/virtio_net.h
+++ b/include/linux/virtio_net.h
@@ -5,6 +5,24 @@
 #include <linux/if_vlan.h>
 #include <uapi/linux/virtio_net.h>
 
+static inline int virtio_net_hdr_set_proto(struct sk_buff *skb,
+					   const struct virtio_net_hdr *hdr)
+{
+	switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
+	case VIRTIO_NET_HDR_GSO_TCPV4:
+	case VIRTIO_NET_HDR_GSO_UDP:
+		skb->protocol = cpu_to_be16(ETH_P_IP);
+		break;
+	case VIRTIO_NET_HDR_GSO_TCPV6:
+		skb->protocol = cpu_to_be16(ETH_P_IPV6);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static inline int virtio_net_hdr_to_skb(struct sk_buff *skb,
 					const struct virtio_net_hdr *hdr,
 					bool little_endian)
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 75c92a87e7b2..d6e94dc7e290 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2715,10 +2715,12 @@ tpacket_error:
 			}
 		}
 
-		if (po->has_vnet_hdr && virtio_net_hdr_to_skb(skb, vnet_hdr,
-							      vio_le())) {
-			tp_len = -EINVAL;
-			goto tpacket_error;
+		if (po->has_vnet_hdr) {
+			if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) {
+				tp_len = -EINVAL;
+				goto tpacket_error;
+			}
+			virtio_net_hdr_set_proto(skb, vnet_hdr);
 		}
 
 		skb->destructor = tpacket_destruct_skb;
@@ -2915,6 +2917,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 		if (err)
 			goto out_free;
 		len += sizeof(vnet_hdr);
+		virtio_net_hdr_set_proto(skb, &vnet_hdr);
 	}
 
 	skb_probe_transport_header(skb, reserve);
-- 
cgit v1.2.3


From 017b1660df89f5fb4bfe66c34e35f7d2031100c7 Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Fri, 5 Oct 2018 15:51:29 -0700
Subject: mm: migration: fix migration of huge PMD shared pages

The page migration code employs try_to_unmap() to try and unmap the source
page.  This is accomplished by using rmap_walk to find all vmas where the
page is mapped.  This search stops when page mapcount is zero.  For shared
PMD huge pages, the page map count is always 1 no matter the number of
mappings.  Shared mappings are tracked via the reference count of the PMD
page.  Therefore, try_to_unmap stops prematurely and does not completely
unmap all mappings of the source page.

This problem can result is data corruption as writes to the original
source page can happen after contents of the page are copied to the target
page.  Hence, data is lost.

This problem was originally seen as DB corruption of shared global areas
after a huge page was soft offlined due to ECC memory errors.  DB
developers noticed they could reproduce the issue by (hotplug) offlining
memory used to back huge pages.  A simple testcase can reproduce the
problem by creating a shared PMD mapping (note that this must be at least
PUD_SIZE in size and PUD_SIZE aligned (1GB on x86)), and using
migrate_pages() to migrate process pages between nodes while continually
writing to the huge pages being migrated.

To fix, have the try_to_unmap_one routine check for huge PMD sharing by
calling huge_pmd_unshare for hugetlbfs huge pages.  If it is a shared
mapping it will be 'unshared' which removes the page table entry and drops
the reference on the PMD page.  After this, flush caches and TLB.

mmu notifiers are called before locking page tables, but we can not be
sure of PMD sharing until page tables are locked.  Therefore, check for
the possibility of PMD sharing before locking so that notifiers can
prepare for the worst possible case.

Link: http://lkml.kernel.org/r/20180823205917.16297-2-mike.kravetz@oracle.com
[mike.kravetz@oracle.com: make _range_in_vma() a static inline]
  Link: http://lkml.kernel.org/r/6063f215-a5c8-2f0c-465a-2c515ddc952d@oracle.com
Fixes: 39dde65c9940 ("shared page table for hugetlb page")
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/hugetlb.h | 14 ++++++++++++++
 include/linux/mm.h      |  6 ++++++
 mm/hugetlb.c            | 37 +++++++++++++++++++++++++++++++++++--
 mm/rmap.c               | 42 +++++++++++++++++++++++++++++++++++++++---
 4 files changed, 94 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 6b68e345f0ca..087fd5f48c91 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -140,6 +140,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
 pte_t *huge_pte_offset(struct mm_struct *mm,
 		       unsigned long addr, unsigned long sz);
 int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep);
+void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
+				unsigned long *start, unsigned long *end);
 struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
 			      int write);
 struct page *follow_huge_pd(struct vm_area_struct *vma,
@@ -170,6 +172,18 @@ static inline unsigned long hugetlb_total_pages(void)
 	return 0;
 }
 
+static inline int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr,
+					pte_t *ptep)
+{
+	return 0;
+}
+
+static inline void adjust_range_if_pmd_sharing_possible(
+				struct vm_area_struct *vma,
+				unsigned long *start, unsigned long *end)
+{
+}
+
 #define follow_hugetlb_page(m,v,p,vs,a,b,i,w,n)	({ BUG(); 0; })
 #define follow_huge_addr(mm, addr, write)	ERR_PTR(-EINVAL)
 #define copy_hugetlb_page_range(src, dst, vma)	({ BUG(); 0; })
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a61ebe8ad4ca..0416a7204be3 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2455,6 +2455,12 @@ static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm,
 	return vma;
 }
 
+static inline bool range_in_vma(struct vm_area_struct *vma,
+				unsigned long start, unsigned long end)
+{
+	return (vma && vma->vm_start <= start && end <= vma->vm_end);
+}
+
 #ifdef CONFIG_MMU
 pgprot_t vm_get_page_prot(unsigned long vm_flags);
 void vma_set_page_prot(struct vm_area_struct *vma);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3c21775f196b..b903d746e132 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4545,12 +4545,40 @@ static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
 	/*
 	 * check on proper vm_flags and page table alignment
 	 */
-	if (vma->vm_flags & VM_MAYSHARE &&
-	    vma->vm_start <= base && end <= vma->vm_end)
+	if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, base, end))
 		return true;
 	return false;
 }
 
+/*
+ * Determine if start,end range within vma could be mapped by shared pmd.
+ * If yes, adjust start and end to cover range associated with possible
+ * shared pmd mappings.
+ */
+void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
+				unsigned long *start, unsigned long *end)
+{
+	unsigned long check_addr = *start;
+
+	if (!(vma->vm_flags & VM_MAYSHARE))
+		return;
+
+	for (check_addr = *start; check_addr < *end; check_addr += PUD_SIZE) {
+		unsigned long a_start = check_addr & PUD_MASK;
+		unsigned long a_end = a_start + PUD_SIZE;
+
+		/*
+		 * If sharing is possible, adjust start/end if necessary.
+		 */
+		if (range_in_vma(vma, a_start, a_end)) {
+			if (a_start < *start)
+				*start = a_start;
+			if (a_end > *end)
+				*end = a_end;
+		}
+	}
+}
+
 /*
  * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
  * and returns the corresponding pte. While this is not necessary for the
@@ -4648,6 +4676,11 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
 {
 	return 0;
 }
+
+void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
+				unsigned long *start, unsigned long *end)
+{
+}
 #define want_pmd_share()	(0)
 #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
 
diff --git a/mm/rmap.c b/mm/rmap.c
index eb477809a5c0..1e79fac3186b 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1362,11 +1362,21 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 	}
 
 	/*
-	 * We have to assume the worse case ie pmd for invalidation. Note that
-	 * the page can not be free in this function as call of try_to_unmap()
-	 * must hold a reference on the page.
+	 * For THP, we have to assume the worse case ie pmd for invalidation.
+	 * For hugetlb, it could be much worse if we need to do pud
+	 * invalidation in the case of pmd sharing.
+	 *
+	 * Note that the page can not be free in this function as call of
+	 * try_to_unmap() must hold a reference on the page.
 	 */
 	end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page)));
+	if (PageHuge(page)) {
+		/*
+		 * If sharing is possible, start and end will be adjusted
+		 * accordingly.
+		 */
+		adjust_range_if_pmd_sharing_possible(vma, &start, &end);
+	}
 	mmu_notifier_invalidate_range_start(vma->vm_mm, start, end);
 
 	while (page_vma_mapped_walk(&pvmw)) {
@@ -1409,6 +1419,32 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 		subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
 		address = pvmw.address;
 
+		if (PageHuge(page)) {
+			if (huge_pmd_unshare(mm, &address, pvmw.pte)) {
+				/*
+				 * huge_pmd_unshare unmapped an entire PMD
+				 * page.  There is no way of knowing exactly
+				 * which PMDs may be cached for this mm, so
+				 * we must flush them all.  start/end were
+				 * already adjusted above to cover this range.
+				 */
+				flush_cache_range(vma, start, end);
+				flush_tlb_range(vma, start, end);
+				mmu_notifier_invalidate_range(mm, start, end);
+
+				/*
+				 * The ref count of the PMD page was dropped
+				 * which is part of the way map counting
+				 * is done for shared PMDs.  Return 'true'
+				 * here.  When there is no other sharing,
+				 * huge_pmd_unshare returns false and we will
+				 * unmap the actual page and drop map count
+				 * to zero.
+				 */
+				page_vma_mapped_walk_done(&pvmw);
+				break;
+			}
+		}
 
 		if (IS_ENABLED(CONFIG_MIGRATION) &&
 		    (flags & TTU_MIGRATION) &&
-- 
cgit v1.2.3


From 20916d4636a9b3c1bf562b305f91d126771edaf9 Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <anshuman.khandual@arm.com>
Date: Fri, 5 Oct 2018 15:51:54 -0700
Subject: mm/hugetlb: add mmap() encodings for 32MB and 512MB page sizes

ARM64 architecture also supports 32MB and 512MB HugeTLB page sizes.  This
just adds mmap() system call argument encoding for them.

Link: http://lkml.kernel.org/r/1537841300-6979-1-git-send-email-anshuman.khandual@arm.com
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
Acked-by: Punit Agrawal <punit.agrawal@arm.com>
Acked-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/uapi/asm-generic/hugetlb_encode.h | 2 ++
 include/uapi/linux/memfd.h                | 2 ++
 include/uapi/linux/mman.h                 | 2 ++
 include/uapi/linux/shm.h                  | 2 ++
 4 files changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/uapi/asm-generic/hugetlb_encode.h b/include/uapi/asm-generic/hugetlb_encode.h
index e4732d3c2998..b0f8e87235bd 100644
--- a/include/uapi/asm-generic/hugetlb_encode.h
+++ b/include/uapi/asm-generic/hugetlb_encode.h
@@ -26,7 +26,9 @@
 #define HUGETLB_FLAG_ENCODE_2MB		(21 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_8MB		(23 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_16MB	(24 << HUGETLB_FLAG_ENCODE_SHIFT)
+#define HUGETLB_FLAG_ENCODE_32MB	(25 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_256MB	(28 << HUGETLB_FLAG_ENCODE_SHIFT)
+#define HUGETLB_FLAG_ENCODE_512MB	(29 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_1GB		(30 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_2GB		(31 << HUGETLB_FLAG_ENCODE_SHIFT)
 #define HUGETLB_FLAG_ENCODE_16GB	(34 << HUGETLB_FLAG_ENCODE_SHIFT)
diff --git a/include/uapi/linux/memfd.h b/include/uapi/linux/memfd.h
index 015a4c0bbb47..7a8a26751c23 100644
--- a/include/uapi/linux/memfd.h
+++ b/include/uapi/linux/memfd.h
@@ -25,7 +25,9 @@
 #define MFD_HUGE_2MB	HUGETLB_FLAG_ENCODE_2MB
 #define MFD_HUGE_8MB	HUGETLB_FLAG_ENCODE_8MB
 #define MFD_HUGE_16MB	HUGETLB_FLAG_ENCODE_16MB
+#define MFD_HUGE_32MB	HUGETLB_FLAG_ENCODE_32MB
 #define MFD_HUGE_256MB	HUGETLB_FLAG_ENCODE_256MB
+#define MFD_HUGE_512MB	HUGETLB_FLAG_ENCODE_512MB
 #define MFD_HUGE_1GB	HUGETLB_FLAG_ENCODE_1GB
 #define MFD_HUGE_2GB	HUGETLB_FLAG_ENCODE_2GB
 #define MFD_HUGE_16GB	HUGETLB_FLAG_ENCODE_16GB
diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h
index bfd5938fede6..d0f515d53299 100644
--- a/include/uapi/linux/mman.h
+++ b/include/uapi/linux/mman.h
@@ -28,7 +28,9 @@
 #define MAP_HUGE_2MB	HUGETLB_FLAG_ENCODE_2MB
 #define MAP_HUGE_8MB	HUGETLB_FLAG_ENCODE_8MB
 #define MAP_HUGE_16MB	HUGETLB_FLAG_ENCODE_16MB
+#define MAP_HUGE_32MB	HUGETLB_FLAG_ENCODE_32MB
 #define MAP_HUGE_256MB	HUGETLB_FLAG_ENCODE_256MB
+#define MAP_HUGE_512MB	HUGETLB_FLAG_ENCODE_512MB
 #define MAP_HUGE_1GB	HUGETLB_FLAG_ENCODE_1GB
 #define MAP_HUGE_2GB	HUGETLB_FLAG_ENCODE_2GB
 #define MAP_HUGE_16GB	HUGETLB_FLAG_ENCODE_16GB
diff --git a/include/uapi/linux/shm.h b/include/uapi/linux/shm.h
index dde1344f047c..6507ad0afc81 100644
--- a/include/uapi/linux/shm.h
+++ b/include/uapi/linux/shm.h
@@ -65,7 +65,9 @@ struct shmid_ds {
 #define SHM_HUGE_2MB	HUGETLB_FLAG_ENCODE_2MB
 #define SHM_HUGE_8MB	HUGETLB_FLAG_ENCODE_8MB
 #define SHM_HUGE_16MB	HUGETLB_FLAG_ENCODE_16MB
+#define SHM_HUGE_32MB	HUGETLB_FLAG_ENCODE_32MB
 #define SHM_HUGE_256MB	HUGETLB_FLAG_ENCODE_256MB
+#define SHM_HUGE_512MB	HUGETLB_FLAG_ENCODE_512MB
 #define SHM_HUGE_1GB	HUGETLB_FLAG_ENCODE_1GB
 #define SHM_HUGE_2GB	HUGETLB_FLAG_ENCODE_2GB
 #define SHM_HUGE_16GB	HUGETLB_FLAG_ENCODE_16GB
-- 
cgit v1.2.3