From a725356b6659469d182d662f22d770d83d3bc7b5 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Tue, 18 Sep 2018 16:34:34 +0300 Subject: vfs: swap names of {do,vfs}_clone_file_range() Commit 031a072a0b8a ("vfs: call vfs_clone_file_range() under freeze protection") created a wrapper do_clone_file_range() around vfs_clone_file_range() moving the freeze protection to former, so overlayfs could call the latter. The more common vfs practice is to call do_xxx helpers from vfs_xxx helpers, where freeze protecction is taken in the vfs_xxx helper, so this anomality could be a source of confusion. It seems that commit 8ede205541ff ("ovl: add reflink/copyfile/dedup support") may have fallen a victim to this confusion - ovl_clone_file_range() calls the vfs_clone_file_range() helper in the hope of getting freeze protection on upper fs, but in fact results in overlayfs allowing to bypass upper fs freeze protection. Swap the names of the two helpers to conform to common vfs practice and call the correct helpers from overlayfs and nfsd. Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- include/linux/fs.h | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 6c0b4a1c22ff..897eae8faee1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1828,8 +1828,10 @@ extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *, extern int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in, struct inode *inode_out, loff_t pos_out, u64 *len, bool is_dedupe); +extern int do_clone_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, u64 len); extern int vfs_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, u64 len); + struct file *file_out, loff_t pos_out, u64 len); extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, struct inode *dest, loff_t destoff, loff_t len, bool *is_same); @@ -2773,19 +2775,6 @@ static inline void file_end_write(struct file *file) __sb_end_write(file_inode(file)->i_sb, SB_FREEZE_WRITE); } -static inline int do_clone_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - u64 len) -{ - int ret; - - file_start_write(file_out); - ret = vfs_clone_file_range(file_in, pos_in, file_out, pos_out, len); - file_end_write(file_out); - - return ret; -} - /* * get_write_access() gets write permission for a file. * put_write_access() releases this write permission. -- cgit v1.2.3 From efaffc5e40aeced0bcb497ed7a0a5b8c14abfcdf Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 1 Oct 2018 11:05:24 +0100 Subject: mm, sched/numa: Remove rate-limiting of automatic NUMA balancing migration Rate limiting of page migrations due to automatic NUMA balancing was introduced to mitigate the worst-case scenario of migrating at high frequency due to false sharing or slowly ping-ponging between nodes. Since then, a lot of effort was spent on correctly identifying these pages and avoiding unnecessary migrations and the safety net may no longer be required. Jirka Hladky reported a regression in 4.17 due to a scheduler patch that avoids spreading STREAM tasks wide prematurely. However, once the task was properly placed, it delayed migrating the memory due to rate limiting. Increasing the limit fixed the problem for him. Currently, the limit is hard-coded and does not account for the real capabilities of the hardware. Even if an estimate was attempted, it would not properly account for the number of memory controllers and it could not account for the amount of bandwidth used for normal accesses. Rather than fudging, this patch simply eliminates the rate limiting. However, Jirka reports that a STREAM configuration using multiple processes achieved similar performance to 4.16. In local tests, this patch improved performance of STREAM relative to the baseline but it is somewhat machine-dependent. Most workloads show little or not performance difference implying that there is not a heavily reliance on the throttling mechanism and it is safe to remove. STREAM on 2-socket machine 4.19.0-rc5 4.19.0-rc5 numab-v1r1 noratelimit-v1r1 MB/sec copy 43298.52 ( 0.00%) 44673.38 ( 3.18%) MB/sec scale 30115.06 ( 0.00%) 31293.06 ( 3.91%) MB/sec add 32825.12 ( 0.00%) 34883.62 ( 6.27%) MB/sec triad 32549.52 ( 0.00%) 34906.60 ( 7.24% Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Acked-by: Peter Zijlstra Cc: Jirka Hladky Cc: Linus Torvalds Cc: Linux-MM Cc: Srikar Dronamraju Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20181001100525.29789-2-mgorman@techsingularity.net Signed-off-by: Ingo Molnar --- include/linux/mmzone.h | 6 ---- include/trace/events/migrate.h | 27 ------------------ mm/migrate.c | 65 ------------------------------------------ mm/page_alloc.c | 2 -- 4 files changed, 100 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 1e22d96734e0..3f4c0b167333 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -671,12 +671,6 @@ typedef struct pglist_data { #ifdef CONFIG_NUMA_BALANCING /* Lock serializing the migrate rate limiting window */ spinlock_t numabalancing_migrate_lock; - - /* Rate limiting time interval */ - unsigned long numabalancing_migrate_next_window; - - /* Number of pages migrated during the rate limiting time interval */ - unsigned long numabalancing_migrate_nr_pages; #endif /* * This is a per-node reserve of pages that are not available diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h index 711372845945..705b33d1e395 100644 --- a/include/trace/events/migrate.h +++ b/include/trace/events/migrate.h @@ -70,33 +70,6 @@ TRACE_EVENT(mm_migrate_pages, __print_symbolic(__entry->mode, MIGRATE_MODE), __print_symbolic(__entry->reason, MIGRATE_REASON)) ); - -TRACE_EVENT(mm_numa_migrate_ratelimit, - - TP_PROTO(struct task_struct *p, int dst_nid, unsigned long nr_pages), - - TP_ARGS(p, dst_nid, nr_pages), - - TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN) - __field( pid_t, pid) - __field( int, dst_nid) - __field( unsigned long, nr_pages) - ), - - TP_fast_assign( - memcpy(__entry->comm, p->comm, TASK_COMM_LEN); - __entry->pid = p->pid; - __entry->dst_nid = dst_nid; - __entry->nr_pages = nr_pages; - ), - - TP_printk("comm=%s pid=%d dst_nid=%d nr_pages=%lu", - __entry->comm, - __entry->pid, - __entry->dst_nid, - __entry->nr_pages) -); #endif /* _TRACE_MIGRATE_H */ /* This part must be outside protection */ diff --git a/mm/migrate.c b/mm/migrate.c index 4f1d894835b5..5e285c1249a0 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1855,54 +1855,6 @@ static struct page *alloc_misplaced_dst_page(struct page *page, return newpage; } -/* - * page migration rate limiting control. - * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs - * window of time. Default here says do not migrate more than 1280M per second. - */ -static unsigned int migrate_interval_millisecs __read_mostly = 100; -static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT); - -/* Returns true if the node is migrate rate-limited after the update */ -static bool numamigrate_update_ratelimit(pg_data_t *pgdat, - unsigned long nr_pages) -{ - unsigned long next_window, interval; - - next_window = READ_ONCE(pgdat->numabalancing_migrate_next_window); - interval = msecs_to_jiffies(migrate_interval_millisecs); - - /* - * Rate-limit the amount of data that is being migrated to a node. - * Optimal placement is no good if the memory bus is saturated and - * all the time is being spent migrating! - */ - if (time_after(jiffies, next_window) && - spin_trylock(&pgdat->numabalancing_migrate_lock)) { - pgdat->numabalancing_migrate_nr_pages = 0; - do { - next_window += interval; - } while (unlikely(time_after(jiffies, next_window))); - - WRITE_ONCE(pgdat->numabalancing_migrate_next_window, next_window); - spin_unlock(&pgdat->numabalancing_migrate_lock); - } - if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) { - trace_mm_numa_migrate_ratelimit(current, pgdat->node_id, - nr_pages); - return true; - } - - /* - * This is an unlocked non-atomic update so errors are possible. - * The consequences are failing to migrate when we potentiall should - * have which is not severe enough to warrant locking. If it is ever - * a problem, it can be converted to a per-cpu counter. - */ - pgdat->numabalancing_migrate_nr_pages += nr_pages; - return false; -} - static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) { int page_lru; @@ -1975,14 +1927,6 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, if (page_is_file_cache(page) && PageDirty(page)) goto out; - /* - * Rate-limit the amount of data that is being migrated to a node. - * Optimal placement is no good if the memory bus is saturated and - * all the time is being spent migrating! - */ - if (numamigrate_update_ratelimit(pgdat, 1)) - goto out; - isolated = numamigrate_isolate_page(pgdat, page); if (!isolated) goto out; @@ -2029,14 +1973,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, unsigned long mmun_start = address & HPAGE_PMD_MASK; unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE; - /* - * Rate-limit the amount of data that is being migrated to a node. - * Optimal placement is no good if the memory bus is saturated and - * all the time is being spent migrating! - */ - if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR)) - goto out_dropref; - new_page = alloc_pages_node(node, (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE), HPAGE_PMD_ORDER); @@ -2133,7 +2069,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, out_fail: count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); -out_dropref: ptl = pmd_lock(mm, pmd); if (pmd_same(*pmd, entry)) { entry = pmd_modify(entry, vma->vm_page_prot); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 89d2a2ab3fe6..706a738c0aee 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6197,8 +6197,6 @@ static unsigned long __init calc_memmap_size(unsigned long spanned_pages, static void pgdat_init_numabalancing(struct pglist_data *pgdat) { spin_lock_init(&pgdat->numabalancing_migrate_lock); - pgdat->numabalancing_migrate_nr_pages = 0; - pgdat->numabalancing_migrate_next_window = jiffies; } #else static void pgdat_init_numabalancing(struct pglist_data *pgdat) {} -- cgit v1.2.3 From 4d4c2d89913e2d891bd6a34b12050a2576e60525 Mon Sep 17 00:00:00 2001 From: Noralf Trønnes Date: Mon, 1 Oct 2018 21:45:36 +0200 Subject: drm/cma-helper: Fix crash in fbdev error path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sergey Suloev reported a crash happening in drm_client_dev_hotplug() when fbdev had failed to register. [ 9.124598] vc4_hdmi 3f902000.hdmi: ASoC: Failed to create component debugfs directory [ 9.147667] vc4_hdmi 3f902000.hdmi: vc4-hdmi-hifi <-> 3f902000.hdmi mapping ok [ 9.155184] vc4_hdmi 3f902000.hdmi: ASoC: no DMI vendor name! [ 9.166544] vc4-drm soc:gpu: bound 3f902000.hdmi (ops vc4_hdmi_ops [vc4]) [ 9.173840] vc4-drm soc:gpu: bound 3f806000.vec (ops vc4_vec_ops [vc4]) [ 9.181029] vc4-drm soc:gpu: bound 3f004000.txp (ops vc4_txp_ops [vc4]) [ 9.188519] vc4-drm soc:gpu: bound 3f400000.hvs (ops vc4_hvs_ops [vc4]) [ 9.195690] vc4-drm soc:gpu: bound 3f206000.pixelvalve (ops vc4_crtc_ops [vc4]) [ 9.203523] vc4-drm soc:gpu: bound 3f207000.pixelvalve (ops vc4_crtc_ops [vc4]) [ 9.215032] vc4-drm soc:gpu: bound 3f807000.pixelvalve (ops vc4_crtc_ops [vc4]) [ 9.274785] vc4-drm soc:gpu: bound 3fc00000.v3d (ops vc4_v3d_ops [vc4]) [ 9.290246] [drm] Initialized vc4 0.0.0 20140616 for soc:gpu on minor 0 [ 9.297464] [drm] Supports vblank timestamp caching Rev 2 (21.10.2013). [ 9.304600] [drm] Driver supports precise vblank timestamp query. [ 9.382856] vc4-drm soc:gpu: [drm:drm_fb_helper_fbdev_setup [drm_kms_helper]] *ERROR* Failed to set fbdev configuration [ 10.404937] Unable to handle kernel paging request at virtual address 00330a656369768a [ 10.441620] [00330a656369768a] address between user and kernel address ranges [ 10.449087] Internal error: Oops: 96000004 [#1] PREEMPT SMP [ 10.454762] Modules linked in: brcmfmac vc4 drm_kms_helper cfg80211 drm rfkill smsc95xx brcmutil usbnet drm_panel_orientation_quirks raspberrypi_hwmon bcm2835_dma crc32_ce pwm_bcm2835 bcm2835_rng virt_dma rng_core i2c_bcm2835 ip_tables x_tables ipv6 [ 10.477296] CPU: 2 PID: 45 Comm: kworker/2:1 Not tainted 4.19.0-rc5 #3 [ 10.483934] Hardware name: Raspberry Pi 3 Model B Rev 1.2 (DT) [ 10.489966] Workqueue: events output_poll_execute [drm_kms_helper] [ 10.596515] Process kworker/2:1 (pid: 45, stack limit = 0x000000007e8924dc) [ 10.603590] Call trace: [ 10.606259] drm_client_dev_hotplug+0x5c/0xb0 [drm] [ 10.611303] drm_kms_helper_hotplug_event+0x30/0x40 [drm_kms_helper] [ 10.617849] output_poll_execute+0xc4/0x1e0 [drm_kms_helper] [ 10.623616] process_one_work+0x1c8/0x318 [ 10.627695] worker_thread+0x48/0x428 [ 10.631420] kthread+0xf8/0x128 [ 10.634615] ret_from_fork+0x10/0x18 [ 10.638255] Code: 54000220 f9401261 aa1303e0 b4000141 (f9400c21) [ 10.644456] ---[ end trace c75b4a4b0e141908 ]--- The reason for this is that drm_fbdev_cma_init() removes the drm_client when fbdev registration fails, but it doesn't remove the client from the drm_device client list. So the client list now has a pointer that points into the unknown and we have a 'use after free' situation. Split drm_client_new() into drm_client_init() and drm_client_add() to fix removal in the error path. Fixes: 894a677f4b3e ("drm/cma-helper: Use the generic fbdev emulation") Reported-by: Sergey Suloev Cc: Stefan Wahren Cc: Eric Anholt Cc: Daniel Vetter Signed-off-by: Noralf Trønnes Reviewed-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20181001194536.57756-1-noralf@tronnes.org --- drivers/gpu/drm/drm_client.c | 35 ++++++++++++++++++++++++++--------- drivers/gpu/drm/drm_fb_cma_helper.c | 4 +++- drivers/gpu/drm/drm_fb_helper.c | 4 +++- include/drm/drm_client.h | 5 +++-- 4 files changed, 35 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/drm_client.c b/drivers/gpu/drm/drm_client.c index baff50a4c234..df31c3815092 100644 --- a/drivers/gpu/drm/drm_client.c +++ b/drivers/gpu/drm/drm_client.c @@ -63,20 +63,21 @@ static void drm_client_close(struct drm_client_dev *client) EXPORT_SYMBOL(drm_client_close); /** - * drm_client_new - Create a DRM client + * drm_client_init - Initialise a DRM client * @dev: DRM device * @client: DRM client * @name: Client name * @funcs: DRM client functions (optional) * + * This initialises the client and opens a &drm_file. Use drm_client_add() to complete the process. * The caller needs to hold a reference on @dev before calling this function. * The client is freed when the &drm_device is unregistered. See drm_client_release(). * * Returns: * Zero on success or negative error code on failure. */ -int drm_client_new(struct drm_device *dev, struct drm_client_dev *client, - const char *name, const struct drm_client_funcs *funcs) +int drm_client_init(struct drm_device *dev, struct drm_client_dev *client, + const char *name, const struct drm_client_funcs *funcs) { int ret; @@ -95,10 +96,6 @@ int drm_client_new(struct drm_device *dev, struct drm_client_dev *client, if (ret) goto err_put_module; - mutex_lock(&dev->clientlist_mutex); - list_add(&client->list, &dev->clientlist); - mutex_unlock(&dev->clientlist_mutex); - drm_dev_get(dev); return 0; @@ -109,13 +106,33 @@ err_put_module: return ret; } -EXPORT_SYMBOL(drm_client_new); +EXPORT_SYMBOL(drm_client_init); + +/** + * drm_client_add - Add client to the device list + * @client: DRM client + * + * Add the client to the &drm_device client list to activate its callbacks. + * @client must be initialized by a call to drm_client_init(). After + * drm_client_add() it is no longer permissible to call drm_client_release() + * directly (outside the unregister callback), instead cleanup will happen + * automatically on driver unload. + */ +void drm_client_add(struct drm_client_dev *client) +{ + struct drm_device *dev = client->dev; + + mutex_lock(&dev->clientlist_mutex); + list_add(&client->list, &dev->clientlist); + mutex_unlock(&dev->clientlist_mutex); +} +EXPORT_SYMBOL(drm_client_add); /** * drm_client_release - Release DRM client resources * @client: DRM client * - * Releases resources by closing the &drm_file that was opened by drm_client_new(). + * Releases resources by closing the &drm_file that was opened by drm_client_init(). * It is called automatically if the &drm_client_funcs.unregister callback is _not_ set. * * This function should only be called from the unregister callback. An exception diff --git a/drivers/gpu/drm/drm_fb_cma_helper.c b/drivers/gpu/drm/drm_fb_cma_helper.c index 9da36a6271d3..9ac1f2e0f064 100644 --- a/drivers/gpu/drm/drm_fb_cma_helper.c +++ b/drivers/gpu/drm/drm_fb_cma_helper.c @@ -160,7 +160,7 @@ struct drm_fbdev_cma *drm_fbdev_cma_init(struct drm_device *dev, fb_helper = &fbdev_cma->fb_helper; - ret = drm_client_new(dev, &fb_helper->client, "fbdev", NULL); + ret = drm_client_init(dev, &fb_helper->client, "fbdev", NULL); if (ret) goto err_free; @@ -169,6 +169,8 @@ struct drm_fbdev_cma *drm_fbdev_cma_init(struct drm_device *dev, if (ret) goto err_client_put; + drm_client_add(&fb_helper->client); + return fbdev_cma; err_client_put: diff --git a/drivers/gpu/drm/drm_fb_helper.c b/drivers/gpu/drm/drm_fb_helper.c index 16ec93b75dbf..515a7aec57ac 100644 --- a/drivers/gpu/drm/drm_fb_helper.c +++ b/drivers/gpu/drm/drm_fb_helper.c @@ -3218,12 +3218,14 @@ int drm_fbdev_generic_setup(struct drm_device *dev, unsigned int preferred_bpp) if (!fb_helper) return -ENOMEM; - ret = drm_client_new(dev, &fb_helper->client, "fbdev", &drm_fbdev_client_funcs); + ret = drm_client_init(dev, &fb_helper->client, "fbdev", &drm_fbdev_client_funcs); if (ret) { kfree(fb_helper); return ret; } + drm_client_add(&fb_helper->client); + fb_helper->preferred_bpp = preferred_bpp; drm_fbdev_client_hotplug(&fb_helper->client); diff --git a/include/drm/drm_client.h b/include/drm/drm_client.h index 989f8e52864d..971bb7853776 100644 --- a/include/drm/drm_client.h +++ b/include/drm/drm_client.h @@ -87,9 +87,10 @@ struct drm_client_dev { struct drm_file *file; }; -int drm_client_new(struct drm_device *dev, struct drm_client_dev *client, - const char *name, const struct drm_client_funcs *funcs); +int drm_client_init(struct drm_device *dev, struct drm_client_dev *client, + const char *name, const struct drm_client_funcs *funcs); void drm_client_release(struct drm_client_dev *client); +void drm_client_add(struct drm_client_dev *client); void drm_client_dev_unregister(struct drm_device *dev); void drm_client_dev_hotplug(struct drm_device *dev); -- cgit v1.2.3 From 9d2f67e43b73e8af7438be219b66a5de0cfa8bd9 Mon Sep 17 00:00:00 2001 From: Jianfeng Tan Date: Sat, 29 Sep 2018 15:41:27 +0000 Subject: net/packet: fix packet drop as of virtio gso When we use raw socket as the vhost backend, a packet from virito with gso offloading information, cannot be sent out in later validaton at xmit path, as we did not set correct skb->protocol which is further used for looking up the gso function. To fix this, we set this field according to virito hdr information. Fixes: e858fae2b0b8f4 ("virtio_net: use common code for virtio_net_hdr and skb GSO conversion") Signed-off-by: Jianfeng Tan Signed-off-by: David S. Miller --- include/linux/virtio_net.h | 18 ++++++++++++++++++ net/packet/af_packet.c | 11 +++++++---- 2 files changed, 25 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 9397628a1967..cb462f9ab7dd 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -5,6 +5,24 @@ #include #include +static inline int virtio_net_hdr_set_proto(struct sk_buff *skb, + const struct virtio_net_hdr *hdr) +{ + switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { + case VIRTIO_NET_HDR_GSO_TCPV4: + case VIRTIO_NET_HDR_GSO_UDP: + skb->protocol = cpu_to_be16(ETH_P_IP); + break; + case VIRTIO_NET_HDR_GSO_TCPV6: + skb->protocol = cpu_to_be16(ETH_P_IPV6); + break; + default: + return -EINVAL; + } + + return 0; +} + static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, const struct virtio_net_hdr *hdr, bool little_endian) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 75c92a87e7b2..d6e94dc7e290 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2715,10 +2715,12 @@ tpacket_error: } } - if (po->has_vnet_hdr && virtio_net_hdr_to_skb(skb, vnet_hdr, - vio_le())) { - tp_len = -EINVAL; - goto tpacket_error; + if (po->has_vnet_hdr) { + if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) { + tp_len = -EINVAL; + goto tpacket_error; + } + virtio_net_hdr_set_proto(skb, vnet_hdr); } skb->destructor = tpacket_destruct_skb; @@ -2915,6 +2917,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) if (err) goto out_free; len += sizeof(vnet_hdr); + virtio_net_hdr_set_proto(skb, &vnet_hdr); } skb_probe_transport_header(skb, reserve); -- cgit v1.2.3 From 017b1660df89f5fb4bfe66c34e35f7d2031100c7 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Fri, 5 Oct 2018 15:51:29 -0700 Subject: mm: migration: fix migration of huge PMD shared pages The page migration code employs try_to_unmap() to try and unmap the source page. This is accomplished by using rmap_walk to find all vmas where the page is mapped. This search stops when page mapcount is zero. For shared PMD huge pages, the page map count is always 1 no matter the number of mappings. Shared mappings are tracked via the reference count of the PMD page. Therefore, try_to_unmap stops prematurely and does not completely unmap all mappings of the source page. This problem can result is data corruption as writes to the original source page can happen after contents of the page are copied to the target page. Hence, data is lost. This problem was originally seen as DB corruption of shared global areas after a huge page was soft offlined due to ECC memory errors. DB developers noticed they could reproduce the issue by (hotplug) offlining memory used to back huge pages. A simple testcase can reproduce the problem by creating a shared PMD mapping (note that this must be at least PUD_SIZE in size and PUD_SIZE aligned (1GB on x86)), and using migrate_pages() to migrate process pages between nodes while continually writing to the huge pages being migrated. To fix, have the try_to_unmap_one routine check for huge PMD sharing by calling huge_pmd_unshare for hugetlbfs huge pages. If it is a shared mapping it will be 'unshared' which removes the page table entry and drops the reference on the PMD page. After this, flush caches and TLB. mmu notifiers are called before locking page tables, but we can not be sure of PMD sharing until page tables are locked. Therefore, check for the possibility of PMD sharing before locking so that notifiers can prepare for the worst possible case. Link: http://lkml.kernel.org/r/20180823205917.16297-2-mike.kravetz@oracle.com [mike.kravetz@oracle.com: make _range_in_vma() a static inline] Link: http://lkml.kernel.org/r/6063f215-a5c8-2f0c-465a-2c515ddc952d@oracle.com Fixes: 39dde65c9940 ("shared page table for hugetlb page") Signed-off-by: Mike Kravetz Acked-by: Kirill A. Shutemov Reviewed-by: Naoya Horiguchi Acked-by: Michal Hocko Cc: Vlastimil Babka Cc: Davidlohr Bueso Cc: Jerome Glisse Cc: Mike Kravetz Cc: Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- include/linux/hugetlb.h | 14 ++++++++++++++ include/linux/mm.h | 6 ++++++ mm/hugetlb.c | 37 +++++++++++++++++++++++++++++++++++-- mm/rmap.c | 42 +++++++++++++++++++++++++++++++++++++++--- 4 files changed, 94 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 6b68e345f0ca..087fd5f48c91 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -140,6 +140,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz); int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep); +void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, + unsigned long *start, unsigned long *end); struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, int write); struct page *follow_huge_pd(struct vm_area_struct *vma, @@ -170,6 +172,18 @@ static inline unsigned long hugetlb_total_pages(void) return 0; } +static inline int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, + pte_t *ptep) +{ + return 0; +} + +static inline void adjust_range_if_pmd_sharing_possible( + struct vm_area_struct *vma, + unsigned long *start, unsigned long *end) +{ +} + #define follow_hugetlb_page(m,v,p,vs,a,b,i,w,n) ({ BUG(); 0; }) #define follow_huge_addr(mm, addr, write) ERR_PTR(-EINVAL) #define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; }) diff --git a/include/linux/mm.h b/include/linux/mm.h index a61ebe8ad4ca..0416a7204be3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2455,6 +2455,12 @@ static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, return vma; } +static inline bool range_in_vma(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + return (vma && vma->vm_start <= start && end <= vma->vm_end); +} + #ifdef CONFIG_MMU pgprot_t vm_get_page_prot(unsigned long vm_flags); void vma_set_page_prot(struct vm_area_struct *vma); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3c21775f196b..b903d746e132 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4545,12 +4545,40 @@ static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr) /* * check on proper vm_flags and page table alignment */ - if (vma->vm_flags & VM_MAYSHARE && - vma->vm_start <= base && end <= vma->vm_end) + if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, base, end)) return true; return false; } +/* + * Determine if start,end range within vma could be mapped by shared pmd. + * If yes, adjust start and end to cover range associated with possible + * shared pmd mappings. + */ +void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, + unsigned long *start, unsigned long *end) +{ + unsigned long check_addr = *start; + + if (!(vma->vm_flags & VM_MAYSHARE)) + return; + + for (check_addr = *start; check_addr < *end; check_addr += PUD_SIZE) { + unsigned long a_start = check_addr & PUD_MASK; + unsigned long a_end = a_start + PUD_SIZE; + + /* + * If sharing is possible, adjust start/end if necessary. + */ + if (range_in_vma(vma, a_start, a_end)) { + if (a_start < *start) + *start = a_start; + if (a_end > *end) + *end = a_end; + } + } +} + /* * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() * and returns the corresponding pte. While this is not necessary for the @@ -4648,6 +4676,11 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) { return 0; } + +void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, + unsigned long *start, unsigned long *end) +{ +} #define want_pmd_share() (0) #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ diff --git a/mm/rmap.c b/mm/rmap.c index eb477809a5c0..1e79fac3186b 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1362,11 +1362,21 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, } /* - * We have to assume the worse case ie pmd for invalidation. Note that - * the page can not be free in this function as call of try_to_unmap() - * must hold a reference on the page. + * For THP, we have to assume the worse case ie pmd for invalidation. + * For hugetlb, it could be much worse if we need to do pud + * invalidation in the case of pmd sharing. + * + * Note that the page can not be free in this function as call of + * try_to_unmap() must hold a reference on the page. */ end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page))); + if (PageHuge(page)) { + /* + * If sharing is possible, start and end will be adjusted + * accordingly. + */ + adjust_range_if_pmd_sharing_possible(vma, &start, &end); + } mmu_notifier_invalidate_range_start(vma->vm_mm, start, end); while (page_vma_mapped_walk(&pvmw)) { @@ -1409,6 +1419,32 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte); address = pvmw.address; + if (PageHuge(page)) { + if (huge_pmd_unshare(mm, &address, pvmw.pte)) { + /* + * huge_pmd_unshare unmapped an entire PMD + * page. There is no way of knowing exactly + * which PMDs may be cached for this mm, so + * we must flush them all. start/end were + * already adjusted above to cover this range. + */ + flush_cache_range(vma, start, end); + flush_tlb_range(vma, start, end); + mmu_notifier_invalidate_range(mm, start, end); + + /* + * The ref count of the PMD page was dropped + * which is part of the way map counting + * is done for shared PMDs. Return 'true' + * here. When there is no other sharing, + * huge_pmd_unshare returns false and we will + * unmap the actual page and drop map count + * to zero. + */ + page_vma_mapped_walk_done(&pvmw); + break; + } + } if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) && -- cgit v1.2.3 From 20916d4636a9b3c1bf562b305f91d126771edaf9 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Fri, 5 Oct 2018 15:51:54 -0700 Subject: mm/hugetlb: add mmap() encodings for 32MB and 512MB page sizes ARM64 architecture also supports 32MB and 512MB HugeTLB page sizes. This just adds mmap() system call argument encoding for them. Link: http://lkml.kernel.org/r/1537841300-6979-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: Punit Agrawal Acked-by: Mike Kravetz Cc: Michal Hocko Cc: Will Deacon Cc: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- include/uapi/asm-generic/hugetlb_encode.h | 2 ++ include/uapi/linux/memfd.h | 2 ++ include/uapi/linux/mman.h | 2 ++ include/uapi/linux/shm.h | 2 ++ 4 files changed, 8 insertions(+) (limited to 'include') diff --git a/include/uapi/asm-generic/hugetlb_encode.h b/include/uapi/asm-generic/hugetlb_encode.h index e4732d3c2998..b0f8e87235bd 100644 --- a/include/uapi/asm-generic/hugetlb_encode.h +++ b/include/uapi/asm-generic/hugetlb_encode.h @@ -26,7 +26,9 @@ #define HUGETLB_FLAG_ENCODE_2MB (21 << HUGETLB_FLAG_ENCODE_SHIFT) #define HUGETLB_FLAG_ENCODE_8MB (23 << HUGETLB_FLAG_ENCODE_SHIFT) #define HUGETLB_FLAG_ENCODE_16MB (24 << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_32MB (25 << HUGETLB_FLAG_ENCODE_SHIFT) #define HUGETLB_FLAG_ENCODE_256MB (28 << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_512MB (29 << HUGETLB_FLAG_ENCODE_SHIFT) #define HUGETLB_FLAG_ENCODE_1GB (30 << HUGETLB_FLAG_ENCODE_SHIFT) #define HUGETLB_FLAG_ENCODE_2GB (31 << HUGETLB_FLAG_ENCODE_SHIFT) #define HUGETLB_FLAG_ENCODE_16GB (34 << HUGETLB_FLAG_ENCODE_SHIFT) diff --git a/include/uapi/linux/memfd.h b/include/uapi/linux/memfd.h index 015a4c0bbb47..7a8a26751c23 100644 --- a/include/uapi/linux/memfd.h +++ b/include/uapi/linux/memfd.h @@ -25,7 +25,9 @@ #define MFD_HUGE_2MB HUGETLB_FLAG_ENCODE_2MB #define MFD_HUGE_8MB HUGETLB_FLAG_ENCODE_8MB #define MFD_HUGE_16MB HUGETLB_FLAG_ENCODE_16MB +#define MFD_HUGE_32MB HUGETLB_FLAG_ENCODE_32MB #define MFD_HUGE_256MB HUGETLB_FLAG_ENCODE_256MB +#define MFD_HUGE_512MB HUGETLB_FLAG_ENCODE_512MB #define MFD_HUGE_1GB HUGETLB_FLAG_ENCODE_1GB #define MFD_HUGE_2GB HUGETLB_FLAG_ENCODE_2GB #define MFD_HUGE_16GB HUGETLB_FLAG_ENCODE_16GB diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h index bfd5938fede6..d0f515d53299 100644 --- a/include/uapi/linux/mman.h +++ b/include/uapi/linux/mman.h @@ -28,7 +28,9 @@ #define MAP_HUGE_2MB HUGETLB_FLAG_ENCODE_2MB #define MAP_HUGE_8MB HUGETLB_FLAG_ENCODE_8MB #define MAP_HUGE_16MB HUGETLB_FLAG_ENCODE_16MB +#define MAP_HUGE_32MB HUGETLB_FLAG_ENCODE_32MB #define MAP_HUGE_256MB HUGETLB_FLAG_ENCODE_256MB +#define MAP_HUGE_512MB HUGETLB_FLAG_ENCODE_512MB #define MAP_HUGE_1GB HUGETLB_FLAG_ENCODE_1GB #define MAP_HUGE_2GB HUGETLB_FLAG_ENCODE_2GB #define MAP_HUGE_16GB HUGETLB_FLAG_ENCODE_16GB diff --git a/include/uapi/linux/shm.h b/include/uapi/linux/shm.h index dde1344f047c..6507ad0afc81 100644 --- a/include/uapi/linux/shm.h +++ b/include/uapi/linux/shm.h @@ -65,7 +65,9 @@ struct shmid_ds { #define SHM_HUGE_2MB HUGETLB_FLAG_ENCODE_2MB #define SHM_HUGE_8MB HUGETLB_FLAG_ENCODE_8MB #define SHM_HUGE_16MB HUGETLB_FLAG_ENCODE_16MB +#define SHM_HUGE_32MB HUGETLB_FLAG_ENCODE_32MB #define SHM_HUGE_256MB HUGETLB_FLAG_ENCODE_256MB +#define SHM_HUGE_512MB HUGETLB_FLAG_ENCODE_512MB #define SHM_HUGE_1GB HUGETLB_FLAG_ENCODE_1GB #define SHM_HUGE_2GB HUGETLB_FLAG_ENCODE_2GB #define SHM_HUGE_16GB HUGETLB_FLAG_ENCODE_16GB -- cgit v1.2.3