From cc725ef3cb202ef2019a3c67c8913efa05c3cce6 Mon Sep 17 00:00:00 2001 From: Jia Guo Date: Tue, 5 Mar 2019 15:41:41 -0800 Subject: ocfs2: fix a panic problem caused by o2cb_ctl In the process of creating a node, it will cause NULL pointer dereference in kernel if o2cb_ctl failed in the interval (mkdir, o2cb_set_node_attribute(node_num)] in function o2cb_add_node. The node num is initialized to 0 in function o2nm_node_group_make_item, o2nm_node_group_drop_item will mistake the node number 0 for a valid node number when we delete the node before the node number is set correctly. If the local node number of the current host happens to be 0, cluster->cl_local_node will be set to O2NM_INVALID_NODE_NUM while o2hb_thread is still running. The panic stack is generated as follows: o2hb_thread \-o2hb_do_disk_heartbeat \-o2hb_check_own_slot |-slot = &reg->hr_slots[o2nm_this_node()]; //o2nm_this_node() return O2NM_INVALID_NODE_NUM We need to check whether the node number is set when we delete the node. Link: http://lkml.kernel.org/r/133d8045-72cc-863e-8eae-5013f9f6bc51@huawei.com Signed-off-by: Jia Guo Reviewed-by: Joseph Qi Acked-by: Jun Piao Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/nodemanager.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index 0e4166cc23a0..4ac775e32240 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -621,13 +621,15 @@ static void o2nm_node_group_drop_item(struct config_group *group, struct o2nm_node *node = to_o2nm_node(item); struct o2nm_cluster *cluster = to_o2nm_cluster(group->cg_item.ci_parent); - o2net_disconnect_node(node); + if (cluster->cl_nodes[node->nd_num] == node) { + o2net_disconnect_node(node); - if (cluster->cl_has_local && - (cluster->cl_local_node == node->nd_num)) { - cluster->cl_has_local = 
0; - cluster->cl_local_node = O2NM_INVALID_NODE_NUM; - o2net_stop_listening(node); + if (cluster->cl_has_local && + (cluster->cl_local_node == node->nd_num)) { + cluster->cl_has_local = 0; + cluster->cl_local_node = O2NM_INVALID_NODE_NUM; + o2net_stop_listening(node); + } } /* XXX call into net to stop this node from trading messages */ -- cgit v1.2.3 From 5500ab4ed3b8f0749ec584d8c5e2738bc01ea52e Mon Sep 17 00:00:00 2001 From: Gang He Date: Tue, 5 Mar 2019 15:41:45 -0800 Subject: ocfs2: fix the application IO timeout when fstrim is running The user reported this problem, the upper application IO was timeout when fstrim was running on this ocfs2 partition. the application monitoring resource agent considered that this application did not work, then this node was fenced by the cluster brain (e.g. pacemaker). The root cause is that fstrim thread always holds main_bm meta-file related locks until all the cluster groups are trimmed. This patch will make fstrim thread release main_bm meta-file related locks when each cluster group is trimmed, this will let the current application IO has a chance to claim the clusters from main_bm meta-file. 
Link: http://lkml.kernel.org/r/20190111090014.31645-1-ghe@suse.com Signed-off-by: Gang He Reviewed-by: Changwei Ge Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 159 +++++++++++++++++++++++++++++-------------------- fs/ocfs2/dlmglue.c | 5 ++ fs/ocfs2/ocfs2.h | 1 + fs/ocfs2/ocfs2_trace.h | 2 + fs/ocfs2/super.c | 2 + 5 files changed, 106 insertions(+), 63 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index d1cbb27808e2..6f0999015a44 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -7532,10 +7532,11 @@ static int ocfs2_trim_group(struct super_block *sb, return count; } -int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) +static +int ocfs2_trim_mainbm(struct super_block *sb, struct fstrim_range *range) { struct ocfs2_super *osb = OCFS2_SB(sb); - u64 start, len, trimmed, first_group, last_group, group; + u64 start, len, trimmed = 0, first_group, last_group = 0, group = 0; int ret, cnt; u32 first_bit, last_bit, minlen; struct buffer_head *main_bm_bh = NULL; @@ -7543,7 +7544,6 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) struct buffer_head *gd_bh = NULL; struct ocfs2_dinode *main_bm; struct ocfs2_group_desc *gd = NULL; - struct ocfs2_trim_fs_info info, *pinfo = NULL; start = range->start >> osb->s_clustersize_bits; len = range->len >> osb->s_clustersize_bits; @@ -7552,6 +7552,9 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize) return -EINVAL; + trace_ocfs2_trim_mainbm(start, len, minlen); + +next_group: main_bm_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, OCFS2_INVALID_SLOT); @@ -7570,64 +7573,34 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) } main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data; - if (start >= le32_to_cpu(main_bm->i_clusters)) { - ret = 
-EINVAL; - goto out_unlock; - } - - len = range->len >> osb->s_clustersize_bits; - if (start + len > le32_to_cpu(main_bm->i_clusters)) - len = le32_to_cpu(main_bm->i_clusters) - start; - - trace_ocfs2_trim_fs(start, len, minlen); - - ocfs2_trim_fs_lock_res_init(osb); - ret = ocfs2_trim_fs_lock(osb, NULL, 1); - if (ret < 0) { - if (ret != -EAGAIN) { - mlog_errno(ret); - ocfs2_trim_fs_lock_res_uninit(osb); + /* + * Do some check before trim the first group. + */ + if (!group) { + if (start >= le32_to_cpu(main_bm->i_clusters)) { + ret = -EINVAL; goto out_unlock; } - mlog(ML_NOTICE, "Wait for trim on device (%s) to " - "finish, which is running from another node.\n", - osb->dev_str); - ret = ocfs2_trim_fs_lock(osb, &info, 0); - if (ret < 0) { - mlog_errno(ret); - ocfs2_trim_fs_lock_res_uninit(osb); - goto out_unlock; - } + if (start + len > le32_to_cpu(main_bm->i_clusters)) + len = le32_to_cpu(main_bm->i_clusters) - start; - if (info.tf_valid && info.tf_success && - info.tf_start == start && info.tf_len == len && - info.tf_minlen == minlen) { - /* Avoid sending duplicated trim to a shared device */ - mlog(ML_NOTICE, "The same trim on device (%s) was " - "just done from node (%u), return.\n", - osb->dev_str, info.tf_nodenum); - range->len = info.tf_trimlen; - goto out_trimunlock; - } + /* + * Determine first and last group to examine based on + * start and len + */ + first_group = ocfs2_which_cluster_group(main_bm_inode, start); + if (first_group == osb->first_cluster_group_blkno) + first_bit = start; + else + first_bit = start - ocfs2_blocks_to_clusters(sb, + first_group); + last_group = ocfs2_which_cluster_group(main_bm_inode, + start + len - 1); + group = first_group; } - info.tf_nodenum = osb->node_num; - info.tf_start = start; - info.tf_len = len; - info.tf_minlen = minlen; - - /* Determine first and last group to examine based on start and len */ - first_group = ocfs2_which_cluster_group(main_bm_inode, start); - if (first_group == osb->first_cluster_group_blkno) - 
first_bit = start; - else - first_bit = start - ocfs2_blocks_to_clusters(sb, first_group); - last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1); - last_bit = osb->bitmap_cpg; - - trimmed = 0; - for (group = first_group; group <= last_group;) { + do { if (first_bit + len >= osb->bitmap_cpg) last_bit = osb->bitmap_cpg; else @@ -7659,21 +7632,81 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg); else group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg); - } - range->len = trimmed * sb->s_blocksize; + } while (0); - info.tf_trimlen = range->len; - info.tf_success = (ret ? 0 : 1); - pinfo = &info; -out_trimunlock: - ocfs2_trim_fs_unlock(osb, pinfo); - ocfs2_trim_fs_lock_res_uninit(osb); out_unlock: ocfs2_inode_unlock(main_bm_inode, 0); brelse(main_bm_bh); + main_bm_bh = NULL; out_mutex: inode_unlock(main_bm_inode); iput(main_bm_inode); + + /* + * If all the groups trim are not done or failed, but we should release + * main_bm related locks for avoiding the current IO starve, then go to + * trim the next group + */ + if (ret >= 0 && group <= last_group) + goto next_group; out: + range->len = trimmed * sb->s_blocksize; + return ret; +} + +int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(sb); + struct ocfs2_trim_fs_info info, *pinfo = NULL; + + ocfs2_trim_fs_lock_res_init(osb); + + trace_ocfs2_trim_fs(range->start, range->len, range->minlen); + + ret = ocfs2_trim_fs_lock(osb, NULL, 1); + if (ret < 0) { + if (ret != -EAGAIN) { + mlog_errno(ret); + ocfs2_trim_fs_lock_res_uninit(osb); + return ret; + } + + mlog(ML_NOTICE, "Wait for trim on device (%s) to " + "finish, which is running from another node.\n", + osb->dev_str); + ret = ocfs2_trim_fs_lock(osb, &info, 0); + if (ret < 0) { + mlog_errno(ret); + ocfs2_trim_fs_lock_res_uninit(osb); + return ret; + } + + if (info.tf_valid && info.tf_success && + 
info.tf_start == range->start && + info.tf_len == range->len && + info.tf_minlen == range->minlen) { + /* Avoid sending duplicated trim to a shared device */ + mlog(ML_NOTICE, "The same trim on device (%s) was " + "just done from node (%u), return.\n", + osb->dev_str, info.tf_nodenum); + range->len = info.tf_trimlen; + goto out; + } + } + + info.tf_nodenum = osb->node_num; + info.tf_start = range->start; + info.tf_len = range->len; + info.tf_minlen = range->minlen; + + ret = ocfs2_trim_mainbm(sb, range); + + info.tf_trimlen = range->len; + info.tf_success = (ret < 0 ? 0 : 1); + pinfo = &info; +out: + ocfs2_trim_fs_unlock(osb, pinfo); + ocfs2_trim_fs_lock_res_uninit(osb); return ret; } diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 7c835824247e..af405586c5b1 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -686,6 +686,9 @@ void ocfs2_trim_fs_lock_res_init(struct ocfs2_super *osb) { struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres; + /* Only one trimfs thread are allowed to work at the same time. 
*/ + mutex_lock(&osb->obs_trim_fs_mutex); + ocfs2_lock_res_init_once(lockres); ocfs2_build_lock_name(OCFS2_LOCK_TYPE_TRIM_FS, 0, 0, lockres->l_name); ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_TRIM_FS, @@ -698,6 +701,8 @@ void ocfs2_trim_fs_lock_res_uninit(struct ocfs2_super *osb) ocfs2_simple_drop_lockres(osb, lockres); ocfs2_lock_res_free(lockres); + + mutex_unlock(&osb->obs_trim_fs_mutex); } static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res, diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 4f86ac0027b5..1f029fbe8b8d 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -407,6 +407,7 @@ struct ocfs2_super struct ocfs2_lock_res osb_rename_lockres; struct ocfs2_lock_res osb_nfs_sync_lockres; struct ocfs2_lock_res osb_trim_fs_lockres; + struct mutex obs_trim_fs_mutex; struct ocfs2_dlm_debug *osb_dlm_debug; struct dentry *osb_debug_root; diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h index 2ee76a90ba8f..dc4bce1649c1 100644 --- a/fs/ocfs2/ocfs2_trace.h +++ b/fs/ocfs2/ocfs2_trace.h @@ -712,6 +712,8 @@ TRACE_EVENT(ocfs2_trim_extent, DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group); +DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_mainbm); + DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs); /* End of trace events for fs/ocfs2/alloc.c. */ diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 3415e0b09398..96ae7cedd487 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1847,6 +1847,8 @@ static int ocfs2_mount_volume(struct super_block *sb) if (ocfs2_is_hard_readonly(osb)) goto leave; + mutex_init(&osb->obs_trim_fs_mutex); + status = ocfs2_dlm_init(osb); if (status < 0) { mlog_errno(status); -- cgit v1.2.3 From f402cf03fc4c5576df379e1e252a6afc17658414 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. 
Silva" Date: Tue, 5 Mar 2019 15:41:48 -0800 Subject: ocfs2: Use zero-sized array and struct_size() in kzalloc() Update the code to use a zero-sized array instead of a pointer in structure ocfs2_slot_info and use struct_size() in kzalloc(). Notice that one of the more common cases of allocation size calculations is finding the size of a structure that has a zero-sized array at the end, along with memory for some number of elements for that array. For example: struct foo { int stuff; void *entry[]; }; instance = kzalloc(sizeof(struct foo) + sizeof(void *) * count, GFP_KERNEL); Instead of leaving these open-coded and prone to type mistakes, we can now use the new struct_size() helper: instance = kzalloc(struct_size(instance, entry, count), GFP_KERNEL); This code was detected with the help of Coccinelle. Link: http://lkml.kernel.org/r/20190108191903.GA22056@embeddedor Signed-off-by: Gustavo A. R. Silva Reviewed-by: Andrew Morton Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/slot_map.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index d7407994f308..ea0756d83250 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -55,7 +55,7 @@ struct ocfs2_slot_info { unsigned int si_blocks; struct buffer_head **si_bh; unsigned int si_num_slots; - struct ocfs2_slot *si_slots; + struct ocfs2_slot si_slots[]; }; @@ -420,9 +420,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb) struct inode *inode = NULL; struct ocfs2_slot_info *si; - si = kzalloc(sizeof(struct ocfs2_slot_info) + - (sizeof(struct ocfs2_slot) * osb->max_slots), - GFP_KERNEL); + si = kzalloc(struct_size(si, si_slots, osb->max_slots), GFP_KERNEL); if (!si) { status = -ENOMEM; mlog_errno(status); @@ -431,8 +429,6 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb) si->si_extended = ocfs2_uses_extended_slot_map(osb); si->si_num_slots = 
osb->max_slots; - si->si_slots = (struct ocfs2_slot *)((char *)si + - sizeof(struct ocfs2_slot_info)); inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE, OCFS2_INVALID_SLOT); -- cgit v1.2.3 From a905737fdd767c75688e1e6de65967923007ec1d Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Tue, 5 Mar 2019 15:41:52 -0800 Subject: fs/inode.c: inode_set_flags(): replace opencoded set_mask_bits() It seems that commits 5f16f3225b0624 and 00a1a053ebe5, both with same commitlog ("ext4: atomically set inode->i_flags in ext4_set_inode_flags()") introduced the set_mask_bits API, but somehow missed not using it in ext4 in the end. Also, set_mask_bits() is used in fs quite a bit and we can possibly come up with a generic llsc based implementation (w/o the cmpxchg loop) Link: http://lkml.kernel.org/r/1548275584-18096-3-git-send-email-vgupta@synopsys.com Signed-off-by: Vineet Gupta Reviewed-by: Anthony Yznaga Cc: Alexander Viro Cc: Theodore Ts'o Cc: Peter Zijlstra (Intel) Cc: Chris Wilson Cc: Ingo Molnar Cc: Jani Nikula Cc: Miklos Szeredi Cc: Oleg Nesterov Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/inode.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/inode.c b/fs/inode.c index 73432e64f874..e9d97add2b36 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -2093,14 +2093,8 @@ EXPORT_SYMBOL(inode_dio_wait); void inode_set_flags(struct inode *inode, unsigned int flags, unsigned int mask) { - unsigned int old_flags, new_flags; - WARN_ON_ONCE(flags & ~mask); - do { - old_flags = READ_ONCE(inode->i_flags); - new_flags = (old_flags & ~mask) | flags; - } while (unlikely(cmpxchg(&inode->i_flags, old_flags, - new_flags) != old_flags)); + set_mask_bits(&inode->i_flags, mask, flags); } EXPORT_SYMBOL(inode_set_flags); -- cgit v1.2.3 From 5704a06810682683355624923547b41540e2801a Mon Sep 17 00:00:00 2001 From: Shuriyc Chu Date: Tue, 5 Mar 2019 15:41:56 -0800 Subject: fs/file.c: initialize 
init_files.resize_wait (Taken from https://bugzilla.kernel.org/show_bug.cgi?id=200647) 'get_unused_fd_flags' in kthread cause kernel crash. It works fine on 4.1, but causes crash after get 64 fds. It also cause crash on ubuntu1404/1604/1804, centos7.5, and the crash messages are almost the same. The crash message on centos7.5 shows below: start fd 61 start fd 62 start fd 63 BUG: unable to handle kernel NULL pointer dereference at (null) IP: __wake_up_common+0x2e/0x90 PGD 0 Oops: 0000 [#1] SMP Modules linked in: test(OE) xt_CHECKSUM iptable_mangle ipt_MASQUERADE nf_nat_masquerade_ipv4 iptable_nat nf_nat_ipv4 nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack ipt_REJECT nf_reject_ipv4 tun bridge stp llc ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter devlink sunrpc kvm_intel kvm irqbypass crc32_pclmul ghash_clmulni_intel aesni_intel lrw gf128mul glue_helper ablk_helper cryptd sg ppdev pcspkr virtio_balloon parport_pc parport i2c_piix4 joydev ip_tables xfs libcrc32c sr_mod cdrom sd_mod crc_t10dif crct10dif_generic ata_generic pata_acpi virtio_scsi virtio_console virtio_net cirrus drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm crct10dif_pclmul crct10dif_common crc32c_intel drm ata_piix serio_raw libata virtio_pci virtio_ring i2c_core virtio floppy dm_mirror dm_region_hash dm_log dm_mod CPU: 2 PID: 1820 Comm: test_fd Kdump: loaded Tainted: G OE ------------ 3.10.0-862.3.3.el7.x86_64 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.10.2-0-g5f4c7b1-prebuilt.qemu-project.org 04/01/2014 task: ffff8e92b9431fa0 ti: ffff8e94247a0000 task.ti: ffff8e94247a0000 RIP: 0010:__wake_up_common+0x2e/0x90 RSP: 0018:ffff8e94247a2d18 EFLAGS: 00010086 RAX: 0000000000000000 RBX: ffffffff9d09daa0 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000003 RDI: ffffffff9d09daa0 RBP: ffff8e94247a2d50 R08: 0000000000000000 R09: ffff8e92b95dfda8 R10: 0000000000000000 R11: 0000000000000000 R12: ffffffff9d09daa8 R13: 
0000000000000003 R14: 0000000000000000 R15: 0000000000000003 FS: 0000000000000000(0000) GS:ffff8e9434e80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000017c686000 CR4: 00000000000207e0 Call Trace: __wake_up+0x39/0x50 expand_files+0x131/0x250 __alloc_fd+0x47/0x170 get_unused_fd_flags+0x30/0x40 test_fd+0x12a/0x1c0 [test] kthread+0xd1/0xe0 ret_from_fork_nospec_begin+0x21/0x21 Code: 66 90 55 48 89 e5 41 57 41 89 f7 41 56 41 89 ce 41 55 41 54 49 89 fc 49 83 c4 08 53 48 83 ec 10 48 8b 47 08 89 55 cc 4c 89 45 d0 <48> 8b 08 49 39 c4 48 8d 78 e8 4c 8d 69 e8 75 08 eb 3b 4c 89 ef RIP __wake_up_common+0x2e/0x90 RSP CR2: 0000000000000000 This issue exists since CentOS 7.5 3.10.0-862 and CentOS 7.4 (3.10.0-693.21.1 ) is ok. Root cause: the item 'resize_wait' is not initialized before being used. Reported-by: Richard Zhang Reviewed-by: Andrew Morton Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/file.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/file.c b/fs/file.c index 3209ee271c41..a10487aa0a84 100644 --- a/fs/file.c +++ b/fs/file.c @@ -457,6 +457,7 @@ struct files_struct init_files = { .full_fds_bits = init_files.full_fds_bits_init, }, .file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock), + .resize_wait = __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait), }; static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start) -- cgit v1.2.3 From ca215086b14b89a0e70fc211314944aa6ce50020 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 5 Mar 2019 15:42:23 -0800 Subject: mm: convert PG_balloon to PG_offline PG_balloon was introduced to implement page migration/compaction for pages inflated in virtio-balloon. Nowadays, it is only a marker that a page is part of virtio-balloon and therefore logically offline. 
We also want to make use of this flag in other balloon drivers - for inflated pages or when onlining a section but keeping some pages offline (e.g. used right now by XEN and Hyper-V via set_online_page_callback()). We are going to expose this flag to dump tools like makedumpfile. But instead of exposing PG_balloon, let's generalize the concept of marking pages as logically offline, so it can be reused for other purposes later on. Rename PG_balloon to PG_offline. This is an indicator that the page is logically offline, the content stale and that it should not be touched (e.g. a hypervisor would have to allocate backing storage in order for the guest to dump an unused page). We can then e.g. exclude such pages from dumps. We replace and reuse KPF_BALLOON (23), as this shouldn't really harm (and for now the semantics stay the same). In following patches, we will make use of this bit also in other balloon drivers. While at it, document PGTABLE. [akpm@linux-foundation.org: fix comment text, per David] Link: http://lkml.kernel.org/r/20181119101616.8901-3-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Konstantin Khlebnikov Acked-by: Michael S. Tsirkin Acked-by: Pankaj gupta Cc: Jonathan Corbet Cc: Alexey Dobriyan Cc: Mike Rapoport Cc: Christian Hansen Cc: Vlastimil Babka Cc: "Kirill A. Shutemov" Cc: Stephen Rothwell Cc: Matthew Wilcox Cc: Michal Hocko Cc: Pavel Tatashin Cc: Alexander Duyck Cc: Naoya Horiguchi Cc: Miles Chen Cc: David Rientjes Cc: Kazuhito Hagio Cc: Arnd Bergmann Cc: Baoquan He Cc: Borislav Petkov Cc: Boris Ostrovsky Cc: Dave Young Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Juergen Gross Cc: Julien Freche Cc: Kairui Song Cc: "K. Y. Srinivasan" Cc: Len Brown Cc: Lianbo Jiang Cc: Michal Hocko Cc: Nadav Amit Cc: Omar Sandoval Cc: Pavel Machek Cc: Rafael J. Wysocki Cc: "Rafael J. 
Wysocki" Cc: Stefano Stabellini Cc: Stephen Hemminger Cc: Vitaly Kuznetsov Cc: Xavier Deguillard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/pagemap.rst | 9 ++++++--- fs/proc/page.c | 4 ++-- include/linux/balloon_compaction.h | 8 ++++---- include/linux/page-flags.h | 11 +++++++---- include/uapi/linux/kernel-page-flags.h | 2 +- tools/vm/page-types.c | 2 +- 6 files changed, 21 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/Documentation/admin-guide/mm/pagemap.rst b/Documentation/admin-guide/mm/pagemap.rst index 3f7bade2c231..340a5aee9b80 100644 --- a/Documentation/admin-guide/mm/pagemap.rst +++ b/Documentation/admin-guide/mm/pagemap.rst @@ -75,9 +75,10 @@ number of times a page is mapped. 20. NOPAGE 21. KSM 22. THP - 23. BALLOON + 23. OFFLINE 24. ZERO_PAGE 25. IDLE + 26. PGTABLE * ``/proc/kpagecgroup``. This file contains a 64-bit inode number of the memory cgroup each page is charged to, indexed by PFN. Only available when @@ -118,8 +119,8 @@ Short descriptions to the page flags identical memory pages dynamically shared between one or more processes 22 - THP contiguous pages which construct transparent hugepages -23 - BALLOON - balloon compaction page +23 - OFFLINE + page is logically offline 24 - ZERO_PAGE zero page for pfn_zero or huge_zero page 25 - IDLE @@ -128,6 +129,8 @@ Short descriptions to the page flags Note that this flag may be stale in case the page was accessed via a PTE. To make sure the flag is up-to-date one has to read ``/sys/kernel/mm/page_idle/bitmap`` first. 
+26 - PGTABLE + page is in use as a page table IO related page flags --------------------- diff --git a/fs/proc/page.c b/fs/proc/page.c index 40b05e0d4274..544d1ee15aee 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -152,8 +152,8 @@ u64 stable_page_flags(struct page *page) else if (page_count(page) == 0 && is_free_buddy_page(page)) u |= 1 << KPF_BUDDY; - if (PageBalloon(page)) - u |= 1 << KPF_BALLOON; + if (PageOffline(page)) + u |= 1 << KPF_OFFLINE; if (PageTable(page)) u |= 1 << KPF_PGTABLE; diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h index cbe50da5a59d..f111c780ef1d 100644 --- a/include/linux/balloon_compaction.h +++ b/include/linux/balloon_compaction.h @@ -95,7 +95,7 @@ extern int balloon_page_migrate(struct address_space *mapping, static inline void balloon_page_insert(struct balloon_dev_info *balloon, struct page *page) { - __SetPageBalloon(page); + __SetPageOffline(page); __SetPageMovable(page, balloon->inode->i_mapping); set_page_private(page, (unsigned long)balloon); list_add(&page->lru, &balloon->pages); @@ -111,7 +111,7 @@ static inline void balloon_page_insert(struct balloon_dev_info *balloon, */ static inline void balloon_page_delete(struct page *page) { - __ClearPageBalloon(page); + __ClearPageOffline(page); __ClearPageMovable(page); set_page_private(page, 0); /* @@ -141,13 +141,13 @@ static inline gfp_t balloon_mapping_gfp_mask(void) static inline void balloon_page_insert(struct balloon_dev_info *balloon, struct page *page) { - __SetPageBalloon(page); + __SetPageOffline(page); list_add(&page->lru, &balloon->pages); } static inline void balloon_page_delete(struct page *page) { - __ClearPageBalloon(page); + __ClearPageOffline(page); list_del(&page->lru); } diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 39b4494e29f1..808b4183e30d 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -671,7 +671,7 @@ PAGEFLAG_FALSE(DoubleMap) /* Reserve 0x0000007f to catch 
underflows of page_mapcount */ #define PAGE_MAPCOUNT_RESERVE -128 #define PG_buddy 0x00000080 -#define PG_balloon 0x00000100 +#define PG_offline 0x00000100 #define PG_kmemcg 0x00000200 #define PG_table 0x00000400 @@ -706,10 +706,13 @@ static __always_inline void __ClearPage##uname(struct page *page) \ PAGE_TYPE_OPS(Buddy, buddy) /* - * PageBalloon() is true for pages that are on the balloon page list - * (see mm/balloon_compaction.c). + * PageOffline() indicates that the page is logically offline although the + * containing section is online. (e.g. inflated in a balloon driver or + * not onlined when onlining the section). + * The content of these pages is effectively stale. Such pages should not + * be touched (read/write/dump/save) except by their owner. */ -PAGE_TYPE_OPS(Balloon, balloon) +PAGE_TYPE_OPS(Offline, offline) /* * If kmemcg is enabled, the buddy allocator will set PageKmemcg() on diff --git a/include/uapi/linux/kernel-page-flags.h b/include/uapi/linux/kernel-page-flags.h index 21b9113c69da..6f2f2720f3ac 100644 --- a/include/uapi/linux/kernel-page-flags.h +++ b/include/uapi/linux/kernel-page-flags.h @@ -32,7 +32,7 @@ #define KPF_KSM 21 #define KPF_THP 22 -#define KPF_BALLOON 23 +#define KPF_OFFLINE 23 #define KPF_ZERO_PAGE 24 #define KPF_IDLE 25 #define KPF_PGTABLE 26 diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c index 1ff3a6c0367b..6f64b2b93234 100644 --- a/tools/vm/page-types.c +++ b/tools/vm/page-types.c @@ -133,7 +133,7 @@ static const char * const page_flag_names[] = { [KPF_NOPAGE] = "n:nopage", [KPF_KSM] = "x:ksm", [KPF_THP] = "t:thp", - [KPF_BALLOON] = "o:balloon", + [KPF_OFFLINE] = "o:offline", [KPF_PGTABLE] = "g:pgtable", [KPF_ZERO_PAGE] = "z:zero_page", [KPF_IDLE] = "i:idle_page", -- cgit v1.2.3 From 60cd4bcd62384cfa1e5890cebacccf08b3161156 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Tue, 5 Mar 2019 15:43:13 -0800 Subject: memcg: localize memcg_kmem_enabled() check Move the memcg_kmem_enabled() checks into memcg kmem 
charge/uncharge functions, so the users don't have to explicitly check that condition. This is purely a code cleanup patch without any functional change. Only the order of checks in memcg_charge_slab() can potentially be changed, but functionally it will be the same. This should not matter as memcg_charge_slab() is not in the hot path. Link: http://lkml.kernel.org/r/20190103161203.162375-1-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Vladimir Davydov Cc: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/pipe.c | 3 +-- include/linux/memcontrol.h | 37 +++++++++++++++++++++++++++++++++---- mm/memcontrol.c | 16 ++++++++-------- mm/page_alloc.c | 4 ++-- mm/slab.h | 4 ---- 5 files changed, 44 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/pipe.c b/fs/pipe.c index bdc5d3c0977d..51d5fd8840ab 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -140,8 +140,7 @@ static int anon_pipe_buf_steal(struct pipe_inode_info *pipe, struct page *page = buf->page; if (page_count(page) == 1) { - if (memcg_kmem_enabled()) - memcg_kmem_uncharge(page, 0); + memcg_kmem_uncharge(page, 0); __SetPageLocked(page); return 0; } diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 83ae11cbd12c..b0eb29ea0d9c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1273,12 +1273,12 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep); void memcg_kmem_put_cache(struct kmem_cache *cachep); -int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, - struct mem_cgroup *memcg); #ifdef CONFIG_MEMCG_KMEM -int memcg_kmem_charge(struct page *page, gfp_t gfp, int order); -void memcg_kmem_uncharge(struct page *page, int order); +int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order); +void __memcg_kmem_uncharge(struct page *page, int order); +int 
__memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, + struct mem_cgroup *memcg); extern struct static_key_false memcg_kmem_enabled_key; extern struct workqueue_struct *memcg_kmem_cache_wq; @@ -1300,6 +1300,26 @@ static inline bool memcg_kmem_enabled(void) return static_branch_unlikely(&memcg_kmem_enabled_key); } +static inline int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) +{ + if (memcg_kmem_enabled()) + return __memcg_kmem_charge(page, gfp, order); + return 0; +} + +static inline void memcg_kmem_uncharge(struct page *page, int order) +{ + if (memcg_kmem_enabled()) + __memcg_kmem_uncharge(page, order); +} + +static inline int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, + int order, struct mem_cgroup *memcg) +{ + if (memcg_kmem_enabled()) + return __memcg_kmem_charge_memcg(page, gfp, order, memcg); + return 0; +} /* * helper for accessing a memcg's index. It will be used as an index in the * child cache array in kmem_cache, and also to derive its name. This function @@ -1325,6 +1345,15 @@ static inline void memcg_kmem_uncharge(struct page *page, int order) { } +static inline int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) +{ + return 0; +} + +static inline void __memcg_kmem_uncharge(struct page *page, int order) +{ +} + #define for_each_memcg_cache_index(_idx) \ for (; NULL; ) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index af7f18b32389..72414bb7e226 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2573,7 +2573,7 @@ void memcg_kmem_put_cache(struct kmem_cache *cachep) } /** - * memcg_kmem_charge_memcg: charge a kmem page + * __memcg_kmem_charge_memcg: charge a kmem page * @page: page to charge * @gfp: reclaim mode * @order: allocation order @@ -2581,7 +2581,7 @@ void memcg_kmem_put_cache(struct kmem_cache *cachep) * * Returns 0 on success, an error code on failure. 
*/ -int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, +int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, struct mem_cgroup *memcg) { unsigned int nr_pages = 1 << order; @@ -2604,24 +2604,24 @@ int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, } /** - * memcg_kmem_charge: charge a kmem page to the current memory cgroup + * __memcg_kmem_charge: charge a kmem page to the current memory cgroup * @page: page to charge * @gfp: reclaim mode * @order: allocation order * * Returns 0 on success, an error code on failure. */ -int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) +int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) { struct mem_cgroup *memcg; int ret = 0; - if (mem_cgroup_disabled() || memcg_kmem_bypass()) + if (memcg_kmem_bypass()) return 0; memcg = get_mem_cgroup_from_current(); if (!mem_cgroup_is_root(memcg)) { - ret = memcg_kmem_charge_memcg(page, gfp, order, memcg); + ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg); if (!ret) __SetPageKmemcg(page); } @@ -2629,11 +2629,11 @@ int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) return ret; } /** - * memcg_kmem_uncharge: uncharge a kmem page + * __memcg_kmem_uncharge: uncharge a kmem page * @page: page to uncharge * @order: allocation order */ -void memcg_kmem_uncharge(struct page *page, int order) +void __memcg_kmem_uncharge(struct page *page, int order) { struct mem_cgroup *memcg = page->mem_cgroup; unsigned int nr_pages = 1 << order; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1f9f1409df9b..034b8b6043a3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1056,7 +1056,7 @@ static __always_inline bool free_pages_prepare(struct page *page, if (PageMappingFlags(page)) page->mapping = NULL; if (memcg_kmem_enabled() && PageKmemcg(page)) - memcg_kmem_uncharge(page, order); + __memcg_kmem_uncharge(page, order); if (check_free) bad += free_pages_check(page); if (bad) @@ -4568,7 +4568,7 @@ 
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, out: if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page && - unlikely(memcg_kmem_charge(page, gfp_mask, order) != 0)) { + unlikely(__memcg_kmem_charge(page, gfp_mask, order) != 0)) { __free_pages(page, order); page = NULL; } diff --git a/mm/slab.h b/mm/slab.h index 384105318779..e5e6658eeacc 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -276,8 +276,6 @@ static __always_inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order, struct kmem_cache *s) { - if (!memcg_kmem_enabled()) - return 0; if (is_root_cache(s)) return 0; return memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg); @@ -286,8 +284,6 @@ static __always_inline int memcg_charge_slab(struct page *page, static __always_inline void memcg_uncharge_slab(struct page *page, int order, struct kmem_cache *s) { - if (!memcg_kmem_enabled()) - return; memcg_kmem_uncharge(page, order); } -- cgit v1.2.3 From 147e1a97c4a0bdd43f55a582a9416bb9092563a9 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 5 Mar 2019 15:45:45 -0800 Subject: fs: kernfs: add poll file operation Patch series "psi: pressure stall monitors", v3. Android is adopting psi to detect and remedy memory pressure that results in stuttering and decreased responsiveness on mobile devices. Psi gives us the stall information, but because we're dealing with latencies in the millisecond range, periodically reading the pressure files to detect stalls in a timely fashion is not feasible. Psi also doesn't aggregate its averages at a high enough frequency right now. This patch series extends the psi interface such that users can configure sensitive latency thresholds and use poll() and friends to be notified when these are breached. 
As high-frequency aggregation is costly, it implements an aggregation method that is optimized for fast, short-interval averaging, and makes the aggregation frequency adaptive, such that high-frequency updates only happen while monitored stall events are actively occurring. With these patches applied, Android can monitor for, and ward off, mounting memory shortages before they cause problems for the user. For example, using memory stall monitors in the userspace low memory killer daemon (lmkd), we can detect mounting pressure and kill less important processes before the device becomes visibly sluggish. In our memory stress testing, psi memory monitors produce roughly 10x fewer false positives compared to vmpressure signals. Having the ability to specify multiple triggers for the same psi metric allows other parts of the Android framework to monitor the memory state of the device and act accordingly. The new interface is straightforward. The user opens one of the pressure files for writing and writes a trigger description into the file descriptor that defines the stall state - some or full, and the maximum stall time over a given window of time. E.g.: /* Signal when stall time exceeds 100ms of a 1s window */ char trigger[] = "full 100000 1000000"; fd = open("/proc/pressure/memory"); write(fd, trigger, sizeof(trigger)); while (poll() >= 0) { ... } close(fd); When the monitored stall state is entered, psi adapts its aggregation frequency according to what the configured time window requires in order to emit event signals in a timely fashion. Once the stalling subsides, aggregation reverts back to normal. The trigger is associated with the open file descriptor. To stop monitoring, the user only needs to close the file descriptor and the trigger is discarded. Patches 1-4 prepare the psi code for polling support. Patch 5 implements the adaptive polling logic, the pressure growth detection optimized for short intervals, and hooks up write() and poll() on the pressure files.
The patches were developed in collaboration with Johannes Weiner. This patch (of 5): Kernfs has a standardized poll/notification mechanism for waking all pollers on all fds when a filesystem node changes. To allow polling for custom events, add a .poll callback that can override the default. This is in preparation for pollable cgroup pressure files which have per-fd trigger configurations. Link: http://lkml.kernel.org/r/20190124211518.244221-2-surenb@google.com Signed-off-by: Johannes Weiner Signed-off-by: Suren Baghdasaryan Cc: Dennis Zhou Cc: Ingo Molnar Cc: Jens Axboe Cc: Li Zefan Cc: Peter Zijlstra Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/kernfs/file.c | 31 ++++++++++++++++++++----------- include/linux/kernfs.h | 6 ++++++ 2 files changed, 26 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index f8d5021a652e..ae948aaa4c53 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -832,26 +832,35 @@ void kernfs_drain_open_files(struct kernfs_node *kn) * to see if it supports poll (Neither 'poll' nor 'select' return * an appropriate error code). When in doubt, set a suitable timeout value. 
*/ +__poll_t kernfs_generic_poll(struct kernfs_open_file *of, poll_table *wait) +{ + struct kernfs_node *kn = kernfs_dentry_node(of->file->f_path.dentry); + struct kernfs_open_node *on = kn->attr.open; + + poll_wait(of->file, &on->poll, wait); + + if (of->event != atomic_read(&on->event)) + return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI; + + return DEFAULT_POLLMASK; +} + static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait) { struct kernfs_open_file *of = kernfs_of(filp); struct kernfs_node *kn = kernfs_dentry_node(filp->f_path.dentry); - struct kernfs_open_node *on = kn->attr.open; + __poll_t ret; if (!kernfs_get_active(kn)) - goto trigger; + return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI; - poll_wait(filp, &on->poll, wait); + if (kn->attr.ops->poll) + ret = kn->attr.ops->poll(of, wait); + else + ret = kernfs_generic_poll(of, wait); kernfs_put_active(kn); - - if (of->event != atomic_read(&on->event)) - goto trigger; - - return DEFAULT_POLLMASK; - - trigger: - return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI; + return ret; } static void kernfs_notify_workfn(struct work_struct *work) diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 5b36b1287a5a..0cac1207bb00 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -25,6 +25,7 @@ struct seq_file; struct vm_area_struct; struct super_block; struct file_system_type; +struct poll_table_struct; struct kernfs_open_node; struct kernfs_iattrs; @@ -261,6 +262,9 @@ struct kernfs_ops { ssize_t (*write)(struct kernfs_open_file *of, char *buf, size_t bytes, loff_t off); + __poll_t (*poll)(struct kernfs_open_file *of, + struct poll_table_struct *pt); + int (*mmap)(struct kernfs_open_file *of, struct vm_area_struct *vma); #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -350,6 +354,8 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name, const void *new_ns); int kernfs_setattr(struct kernfs_node *kn, 
const struct iattr *iattr); +__poll_t kernfs_generic_poll(struct kernfs_open_file *of, + struct poll_table_struct *pt); void kernfs_notify(struct kernfs_node *kn); const void *kernfs_super_ns(struct super_block *sb); -- cgit v1.2.3 From 0cbe3e26abe0cfe7effb67f620a77d46cce628b2 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 5 Mar 2019 15:46:26 -0800 Subject: mm: update ptep_modify_prot_start/commit to take vm_area_struct as arg Patch series "NestMMU pte upgrade workaround for mprotect", v5. We can upgrade pte access (R -> RW transition) via mprotect. We need to make sure we follow the recommended pte update sequence as outlined in commit bd5050e38aec ("powerpc/mm/radix: Change pte relax sequence to handle nest MMU hang") for such updates. This patch series does that. This patch (of 5): Some architectures may want to call flush_tlb_range from these helpers. Link: http://lkml.kernel.org/r/20190116085035.29729-2-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Nicholas Piggin Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/include/asm/pgtable.h | 4 ++-- arch/s390/mm/pgtable.c | 6 ++++-- arch/x86/include/asm/paravirt.h | 11 ++++++----- arch/x86/include/asm/paravirt_types.h | 5 +++-- arch/x86/xen/mmu.h | 4 ++-- arch/x86/xen/mmu_pv.c | 8 ++++---- fs/proc/task_mmu.c | 4 ++-- include/asm-generic/pgtable.h | 16 ++++++++-------- mm/memory.c | 4 ++-- mm/mprotect.c | 4 ++-- 10 files changed, 35 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 063732414dfb..5d730199e37b 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1069,8 +1069,8 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, } #define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION -pte_t ptep_modify_prot_start(struct mm_struct *, unsigned long, pte_t *); -void ptep_modify_prot_commit(struct mm_struct *, unsigned long, pte_t *, pte_t); +pte_t ptep_modify_prot_start(struct vm_area_struct *, unsigned long, pte_t *); +void ptep_modify_prot_commit(struct vm_area_struct *, unsigned long, pte_t *, pte_t); #define __HAVE_ARCH_PTEP_CLEAR_FLUSH static inline pte_t ptep_clear_flush(struct vm_area_struct *vma, diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 689b66f29fc6..71aa01170768 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -301,12 +301,13 @@ pte_t ptep_xchg_lazy(struct mm_struct *mm, unsigned long addr, } EXPORT_SYMBOL(ptep_xchg_lazy); -pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, +pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { pgste_t pgste; pte_t old; int nodat; + struct mm_struct *mm = vma->vm_mm; preempt_disable(); pgste = ptep_xchg_start(mm, addr, ptep); @@ -319,10 +320,11 @@ pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, return old; } -void ptep_modify_prot_commit(struct 
mm_struct *mm, unsigned long addr, +void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte) { pgste_t pgste; + struct mm_struct *mm = vma->vm_mm; if (!MACHINE_HAS_NX) pte_val(pte) &= ~_PAGE_NOEXEC; diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index a97f28d914d5..c5a7f18cce7e 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -422,25 +422,26 @@ static inline pgdval_t pgd_val(pgd_t pgd) } #define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION -static inline pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, +static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { pteval_t ret; - ret = PVOP_CALL3(pteval_t, mmu.ptep_modify_prot_start, mm, addr, ptep); + ret = PVOP_CALL3(pteval_t, mmu.ptep_modify_prot_start, vma, addr, ptep); return (pte_t) { .pte = ret }; } -static inline void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, +static inline void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte) { + if (sizeof(pteval_t) > sizeof(long)) /* 5 arg words */ - pv_ops.mmu.ptep_modify_prot_commit(mm, addr, ptep, pte); + pv_ops.mmu.ptep_modify_prot_commit(vma, addr, ptep, pte); else PVOP_VCALL4(mmu.ptep_modify_prot_commit, - mm, addr, ptep, pte.pte); + vma, addr, ptep, pte.pte); } static inline void set_pte(pte_t *ptep, pte_t pte) diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 488c59686a73..2474e434a6f7 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -55,6 +55,7 @@ struct task_struct; struct cpumask; struct flush_tlb_info; struct mmu_gather; +struct vm_area_struct; /* * Wrapper type for pointers to code which uses the non-standard @@ -254,9 +255,9 @@ struct pv_mmu_ops { pte_t *ptep, pte_t pteval); void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval); - 
pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr, + pte_t (*ptep_modify_prot_start)(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); - void (*ptep_modify_prot_commit)(struct mm_struct *mm, unsigned long addr, + void (*ptep_modify_prot_commit)(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte); struct paravirt_callee_save pte_val; diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index a7e47cf7ec6c..6e4c6bd62203 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -17,8 +17,8 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); -pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, +pte_t xen_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); +void xen_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte); unsigned long xen_read_cr2_direct(void); diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 0f4fe206dcc2..856a85814f00 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -306,20 +306,20 @@ static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, __xen_set_pte(ptep, pteval); } -pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, +pte_t xen_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { /* Just return the pte as-is. 
We preserve the bits on commit */ - trace_xen_mmu_ptep_modify_prot_start(mm, addr, ptep, *ptep); + trace_xen_mmu_ptep_modify_prot_start(vma->vm_mm, addr, ptep, *ptep); return *ptep; } -void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, +void xen_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte) { struct mmu_update u; - trace_xen_mmu_ptep_modify_prot_commit(mm, addr, ptep, pte); + trace_xen_mmu_ptep_modify_prot_commit(vma->vm_mm, addr, ptep, pte); xen_mc_batch(); u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 85b0ef890b28..9c2ef731dd5f 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -948,10 +948,10 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, pte_t ptent = *pte; if (pte_present(ptent)) { - ptent = ptep_modify_prot_start(vma->vm_mm, addr, pte); + ptent = ptep_modify_prot_start(vma, addr, pte); ptent = pte_wrprotect(ptent); ptent = pte_clear_soft_dirty(ptent); - ptep_modify_prot_commit(vma->vm_mm, addr, pte, ptent); + ptep_modify_prot_commit(vma, addr, pte, ptent); } else if (is_swap_pte(ptent)) { ptent = pte_swp_clear_soft_dirty(ptent); set_pte_at(vma->vm_mm, addr, pte, ptent); diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 05e61e6c843f..8b0e933efe26 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -606,7 +606,7 @@ static inline int pmd_none_or_clear_bad(pmd_t *pmd) return 0; } -static inline pte_t __ptep_modify_prot_start(struct mm_struct *mm, +static inline pte_t __ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { @@ -615,10 +615,10 @@ static inline pte_t __ptep_modify_prot_start(struct mm_struct *mm, * non-present, preventing the hardware from asynchronously * updating it. 
*/ - return ptep_get_and_clear(mm, addr, ptep); + return ptep_get_and_clear(vma->vm_mm, addr, ptep); } -static inline void __ptep_modify_prot_commit(struct mm_struct *mm, +static inline void __ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte) { @@ -626,7 +626,7 @@ static inline void __ptep_modify_prot_commit(struct mm_struct *mm, * The pte is non-present, so there's no hardware state to * preserve. */ - set_pte_at(mm, addr, ptep, pte); + set_pte_at(vma->vm_mm, addr, ptep, pte); } #ifndef __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION @@ -644,22 +644,22 @@ static inline void __ptep_modify_prot_commit(struct mm_struct *mm, * queue the update to be done at some later time. The update must be * actually committed before the pte lock is released, however. */ -static inline pte_t ptep_modify_prot_start(struct mm_struct *mm, +static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { - return __ptep_modify_prot_start(mm, addr, ptep); + return __ptep_modify_prot_start(vma, addr, ptep); } /* * Commit an update to a pte, leaving any hardware-controlled bits in * the PTE unmodified. */ -static inline void ptep_modify_prot_commit(struct mm_struct *mm, +static inline void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte) { - __ptep_modify_prot_commit(mm, addr, ptep, pte); + __ptep_modify_prot_commit(vma, addr, ptep, pte); } #endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */ #endif /* CONFIG_MMU */ diff --git a/mm/memory.c b/mm/memory.c index 6aff43171a7b..5ade52502ea0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3619,12 +3619,12 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) * Make it present again, Depending on how arch implementes non * accessible ptes, some can allow access by kernel mode. 
*/ - pte = ptep_modify_prot_start(vma->vm_mm, vmf->address, vmf->pte); + pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte); pte = pte_modify(pte, vma->vm_page_prot); pte = pte_mkyoung(pte); if (was_writable) pte = pte_mkwrite(pte); - ptep_modify_prot_commit(vma->vm_mm, vmf->address, vmf->pte, pte); + ptep_modify_prot_commit(vma, vmf->address, vmf->pte, pte); update_mmu_cache(vma, vmf->address, vmf->pte); page = vm_normal_page(vma, vmf->address, pte); diff --git a/mm/mprotect.c b/mm/mprotect.c index 36cb358db170..c89ce07923c8 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -110,7 +110,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, continue; } - ptent = ptep_modify_prot_start(mm, addr, pte); + ptent = ptep_modify_prot_start(vma, addr, pte); ptent = pte_modify(ptent, newprot); if (preserve_write) ptent = pte_mk_savedwrite(ptent); @@ -121,7 +121,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, !(vma->vm_flags & VM_SOFTDIRTY))) { ptent = pte_mkwrite(ptent); } - ptep_modify_prot_commit(mm, addr, pte, ptent); + ptep_modify_prot_commit(vma, addr, pte, ptent); pages++; } else if (IS_ENABLED(CONFIG_MIGRATION)) { swp_entry_t entry = pte_to_swp_entry(oldpte); -- cgit v1.2.3 From 04a8645304500be88b3345b65fef7efe58016166 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 5 Mar 2019 15:46:29 -0800 Subject: mm: update ptep_modify_prot_commit to take old pte value as arg Architectures like ppc64 require to do a conditional tlb flush based on the old and new value of pte. Enable that by passing old pte value as the arg. Link: http://lkml.kernel.org/r/20190116085035.29729-3-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Benjamin Herrenschmidt Cc: Heiko Carstens Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Paul Mackerras Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/include/asm/pgtable.h | 3 ++- arch/s390/mm/pgtable.c | 2 +- arch/x86/include/asm/paravirt.h | 2 +- fs/proc/task_mmu.c | 8 +++++--- include/asm-generic/pgtable.h | 2 +- mm/memory.c | 8 ++++---- mm/mprotect.c | 6 +++--- 7 files changed, 17 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 5d730199e37b..76dc344edb8c 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1070,7 +1070,8 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, #define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION pte_t ptep_modify_prot_start(struct vm_area_struct *, unsigned long, pte_t *); -void ptep_modify_prot_commit(struct vm_area_struct *, unsigned long, pte_t *, pte_t); +void ptep_modify_prot_commit(struct vm_area_struct *, unsigned long, + pte_t *, pte_t, pte_t); #define __HAVE_ARCH_PTEP_CLEAR_FLUSH static inline pte_t ptep_clear_flush(struct vm_area_struct *vma, diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 71aa01170768..8485d6dc2754 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -321,7 +321,7 @@ pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, } void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, - pte_t *ptep, pte_t pte) + pte_t *ptep, pte_t old_pte, pte_t pte) { pgste_t pgste; struct mm_struct *mm = vma->vm_mm; diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index c5a7f18cce7e..c25c38a05c1c 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -433,7 +433,7 @@ static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned } static inline void ptep_modify_prot_commit(struct vm_area_struct 
*vma, unsigned long addr, - pte_t *ptep, pte_t pte) + pte_t *ptep, pte_t old_pte, pte_t pte) { if (sizeof(pteval_t) > sizeof(long)) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 9c2ef731dd5f..beccb0b1d57c 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -948,10 +948,12 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, pte_t ptent = *pte; if (pte_present(ptent)) { - ptent = ptep_modify_prot_start(vma, addr, pte); - ptent = pte_wrprotect(ptent); + pte_t old_pte; + + old_pte = ptep_modify_prot_start(vma, addr, pte); + ptent = pte_wrprotect(old_pte); ptent = pte_clear_soft_dirty(ptent); - ptep_modify_prot_commit(vma, addr, pte, ptent); + ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent); } else if (is_swap_pte(ptent)) { ptent = pte_swp_clear_soft_dirty(ptent); set_pte_at(vma->vm_mm, addr, pte, ptent); diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 8b0e933efe26..fa782fba51ee 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -657,7 +657,7 @@ static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma, */ static inline void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, - pte_t *ptep, pte_t pte) + pte_t *ptep, pte_t old_pte, pte_t pte) { __ptep_modify_prot_commit(vma, addr, ptep, pte); } diff --git a/mm/memory.c b/mm/memory.c index 5ade52502ea0..557c6fffedd1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3599,7 +3599,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) int last_cpupid; int target_nid; bool migrated = false; - pte_t pte; + pte_t pte, old_pte; bool was_writable = pte_savedwrite(vmf->orig_pte); int flags = 0; @@ -3619,12 +3619,12 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) * Make it present again, Depending on how arch implementes non * accessible ptes, some can allow access by kernel mode. 
*/ - pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte); - pte = pte_modify(pte, vma->vm_page_prot); + old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte); + pte = pte_modify(old_pte, vma->vm_page_prot); pte = pte_mkyoung(pte); if (was_writable) pte = pte_mkwrite(pte); - ptep_modify_prot_commit(vma, vmf->address, vmf->pte, pte); + ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte); update_mmu_cache(vma, vmf->address, vmf->pte); page = vm_normal_page(vma, vmf->address, pte); diff --git a/mm/mprotect.c b/mm/mprotect.c index c89ce07923c8..028c724dcb1a 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -110,8 +110,8 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, continue; } - ptent = ptep_modify_prot_start(vma, addr, pte); - ptent = pte_modify(ptent, newprot); + oldpte = ptep_modify_prot_start(vma, addr, pte); + ptent = pte_modify(oldpte, newprot); if (preserve_write) ptent = pte_mk_savedwrite(ptent); @@ -121,7 +121,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, !(vma->vm_flags & VM_SOFTDIRTY))) { ptent = pte_mkwrite(ptent); } - ptep_modify_prot_commit(vma, addr, pte, ptent); + ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent); pages++; } else if (IS_ENABLED(CONFIG_MIGRATION)) { swp_entry_t entry = pte_to_swp_entry(oldpte); -- cgit v1.2.3 From ab3948f58ff841e51feb845720624665ef5b7ef3 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 5 Mar 2019 15:47:54 -0800 Subject: mm/memfd: add an F_SEAL_FUTURE_WRITE seal to memfd Android uses ashmem for sharing memory regions. We are looking forward to migrating all usecases of ashmem to memfd so that we can possibly remove the ashmem driver in the future from staging while also benefiting from using memfd and contributing to it. Note staging drivers are also not ABI and generally can be removed at anytime. 
One of the main usecases Android has is the ability to create a region and mmap it as writeable, then add protection against making any "future" writes while keeping the existing already mmap'ed writeable-region active. This allows us to implement a usecase where receivers of the shared memory buffer can get a read-only view, while the sender continues to write to the buffer. See CursorWindow documentation in Android for more details: https://developer.android.com/reference/android/database/CursorWindow This usecase cannot be implemented with the existing F_SEAL_WRITE seal. To support the usecase, this patch adds a new F_SEAL_FUTURE_WRITE seal which prevents any future mmap and write syscalls from succeeding while keeping the existing mmap active. A better way to implement the F_SEAL_FUTURE_WRITE seal was discussed [1] last week where we don't need to modify core VFS structures to get the same behavior of the seal. This solves several side-effects pointed out by Andy. Self-tests are provided in a later patch to verify the expected semantics. [1] https://lore.kernel.org/lkml/20181111173650.GA256781@google.com/ Thanks a lot to Andy for suggestions to improve the code. Link: http://lkml.kernel.org/r/20190112203816.85534-2-joel@joelfernandes.org Signed-off-by: Joel Fernandes (Google) Acked-by: John Stultz Cc: Andy Lutomirski Cc: Minchan Kim Cc: Jann Horn Cc: Al Viro Cc: Andy Lutomirski Cc: Hugh Dickins Cc: J.
Bruce Fields Cc: Jeff Layton Cc: Marc-André Lureau Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 2 +- include/uapi/linux/fcntl.h | 1 + mm/memfd.c | 3 ++- mm/shmem.c | 25 ++++++++++++++++++++++--- 4 files changed, 26 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index a7fa037b876b..b0eef008de67 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -530,7 +530,7 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) inode_lock(inode); /* protected by i_mutex */ - if (info->seals & F_SEAL_WRITE) { + if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { inode_unlock(inode); return -EPERM; } diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index 6448cdd9a350..a2f8658f1c55 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -41,6 +41,7 @@ #define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ #define F_SEAL_GROW 0x0004 /* prevent file from growing */ #define F_SEAL_WRITE 0x0008 /* prevent writes */ +#define F_SEAL_FUTURE_WRITE 0x0010 /* prevent future writes while mapped */ /* (1U << 31) is reserved for signed error codes */ /* diff --git a/mm/memfd.c b/mm/memfd.c index 97264c79d2cd..650e65a46b9c 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -131,7 +131,8 @@ static unsigned int *memfd_file_seals_ptr(struct file *file) #define F_ALL_SEALS (F_SEAL_SEAL | \ F_SEAL_SHRINK | \ F_SEAL_GROW | \ - F_SEAL_WRITE) + F_SEAL_WRITE | \ + F_SEAL_FUTURE_WRITE) static int memfd_add_seals(struct file *file, unsigned int seals) { diff --git a/mm/shmem.c b/mm/shmem.c index 283a1833dafc..b3db3779a30a 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2190,6 +2190,24 @@ out_nomem: static int shmem_mmap(struct file *file, struct vm_area_struct *vma) { + struct shmem_inode_info *info = SHMEM_I(file_inode(file)); + + if (info->seals & F_SEAL_FUTURE_WRITE) { + /* +
* New PROT_WRITE and MAP_SHARED mmaps are not allowed when + * "future write" seal active. + */ + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) + return -EPERM; + + /* + * Since the F_SEAL_FUTURE_WRITE seals allow for a MAP_SHARED + * read-only mapping, take care to not allow mprotect to revert + * protections. + */ + vma->vm_flags &= ~(VM_MAYWRITE); + } + file_accessed(file); vma->vm_ops = &shmem_vm_ops; if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && @@ -2440,8 +2458,9 @@ shmem_write_begin(struct file *file, struct address_space *mapping, pgoff_t index = pos >> PAGE_SHIFT; /* i_mutex is held by caller */ - if (unlikely(info->seals & (F_SEAL_WRITE | F_SEAL_GROW))) { - if (info->seals & F_SEAL_WRITE) + if (unlikely(info->seals & (F_SEAL_GROW | + F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) { + if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) return -EPERM; if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size) return -EPERM; @@ -2704,7 +2723,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq); /* protected by i_mutex */ - if (info->seals & F_SEAL_WRITE) { + if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { error = -EPERM; goto out; } -- cgit v1.2.3 From 756ca74c7f656b6ed3cb60344845878226b658ae Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Tue, 5 Mar 2019 15:50:22 -0800 Subject: fs/proc/self.c: code cleanup for proc_setup_self() Remove unnecessary ERR_PTR()/PTR_ERR() cast in proc_setup_self(). 
Link: http://lkml.kernel.org/r/20190124030150.8472-1-cgxu519@gmx.com Signed-off-by: Chengguang Xu Reviewed-by: Andrew Morton Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/self.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/proc/self.c b/fs/proc/self.c index 127265e5c55f..57c0a1047250 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -38,6 +38,7 @@ int proc_setup_self(struct super_block *s) struct inode *root_inode = d_inode(s->s_root); struct pid_namespace *ns = proc_pid_ns(root_inode); struct dentry *self; + int ret = -ENOMEM; inode_lock(root_inode); self = d_alloc_name(s->s_root, "self"); @@ -51,20 +52,19 @@ int proc_setup_self(struct super_block *s) inode->i_gid = GLOBAL_ROOT_GID; inode->i_op = &proc_self_inode_operations; d_add(self, inode); + ret = 0; } else { dput(self); - self = ERR_PTR(-ENOMEM); } - } else { - self = ERR_PTR(-ENOMEM); } inode_unlock(root_inode); - if (IS_ERR(self)) { + + if (ret) pr_err("proc_fill_super: can't allocate /proc/self\n"); - return PTR_ERR(self); - } - ns->proc_self = self; - return 0; + else + ns->proc_self = self; + + return ret; } void __init proc_self_init(void) -- cgit v1.2.3 From 45f68ab50234e825cdc7aee76a40d227d92eea14 Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Tue, 5 Mar 2019 15:50:25 -0800 Subject: fs/proc/thread_self.c: code cleanup for proc_setup_thread_self() Remove unnecessary ERR_PTR()/PTR_ERR() cast in proc_setup_thread_self(). 
Link: http://lkml.kernel.org/r/20190124030150.8472-2-cgxu519@gmx.com Signed-off-by: Chengguang Xu Reviewed-by: Andrew Morton Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/thread_self.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c index b905010ca9eb..f61ae53533f5 100644 --- a/fs/proc/thread_self.c +++ b/fs/proc/thread_self.c @@ -38,6 +38,7 @@ int proc_setup_thread_self(struct super_block *s) struct inode *root_inode = d_inode(s->s_root); struct pid_namespace *ns = proc_pid_ns(root_inode); struct dentry *thread_self; + int ret = -ENOMEM; inode_lock(root_inode); thread_self = d_alloc_name(s->s_root, "thread-self"); @@ -51,20 +52,19 @@ int proc_setup_thread_self(struct super_block *s) inode->i_gid = GLOBAL_ROOT_GID; inode->i_op = &proc_thread_self_inode_operations; d_add(thread_self, inode); + ret = 0; } else { dput(thread_self); - thread_self = ERR_PTR(-ENOMEM); } - } else { - thread_self = ERR_PTR(-ENOMEM); } inode_unlock(root_inode); - if (IS_ERR(thread_self)) { + + if (ret) pr_err("proc_fill_super: can't allocate /proc/thread_self\n"); - return PTR_ERR(thread_self); - } - ns->proc_thread_self = thread_self; - return 0; + else + ns->proc_thread_self = thread_self; + + return ret; } void __init proc_thread_self_init(void) -- cgit v1.2.3 From 867aaccf1f2c35eff4706ea69299f731f2a1953e Mon Sep 17 00:00:00 2001 From: Zhikang Zhang Date: Tue, 5 Mar 2019 15:50:29 -0800 Subject: proc: remove unused argument in proc_pid_lookup() [adobriyan@gmail.com: delete "extern" from prototype] Link: http://lkml.kernel.org/r/20190114195635.GA9372@avx2 Signed-off-by: Zhikang Zhang Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 2 +- fs/proc/internal.h | 2 +- fs/proc/root.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c 
b/fs/proc/base.c index f5ed9512d193..fcb0180d3988 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3161,7 +3161,7 @@ static struct dentry *proc_pid_instantiate(struct dentry * dentry, return d_splice_alias(inode, dentry); } -struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) +struct dentry *proc_pid_lookup(struct dentry *dentry, unsigned int flags) { struct task_struct *task; unsigned tgid; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 95b14196f284..4fc5a9b68f76 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -162,7 +162,7 @@ extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struc extern void pid_update_inode(struct task_struct *, struct inode *); extern int pid_delete_dentry(const struct dentry *); extern int proc_pid_readdir(struct file *, struct dir_context *); -extern struct dentry *proc_pid_lookup(struct inode *, struct dentry *, unsigned int); +struct dentry *proc_pid_lookup(struct dentry *, unsigned int); extern loff_t mem_lseek(struct file *, loff_t, int); /* Lookups */ diff --git a/fs/proc/root.c b/fs/proc/root.c index f4b1a9d2eca6..621e6ec322ca 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -154,7 +154,7 @@ static int proc_root_getattr(const struct path *path, struct kstat *stat, static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags) { - if (!proc_pid_lookup(dir, dentry, flags)) + if (!proc_pid_lookup(dentry, flags)) return NULL; return proc_lookup(dir, dentry, flags); -- cgit v1.2.3 From 5713f35c0575a1137b705e13d10f8ee58f2ec7e8 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 5 Mar 2019 15:50:32 -0800 Subject: proc: read kernel cpu stat pointer once Help gcc generate better code: $ ./scripts/bloat-o-meter ../vmlinux-000 ../vmlinux-001 add/remove: 2/2 grow/shrink: 0/1 up/down: 92/-142 (-50) Function old new delta get_iowait_time.isra - 46 +46 get_idle_time.isra - 46 +46 show_stat 1489 1477 -12 
get_iowait_time 65 - -65 get_idle_time 65 - -65 Link: http://lkml.kernel.org/r/20190114195907.GA9680@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/stat.c | 60 +++++++++++++++++++++++++++++++--------------------------- 1 file changed, 32 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 535eda7857cf..49aa0a2b0d9e 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -23,21 +23,21 @@ #ifdef arch_idle_time -static u64 get_idle_time(int cpu) +static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu) { u64 idle; - idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE]; + idle = kcs->cpustat[CPUTIME_IDLE]; if (cpu_online(cpu) && !nr_iowait_cpu(cpu)) idle += arch_idle_time(cpu); return idle; } -static u64 get_iowait_time(int cpu) +static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu) { u64 iowait; - iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT]; + iowait = kcs->cpustat[CPUTIME_IOWAIT]; if (cpu_online(cpu) && nr_iowait_cpu(cpu)) iowait += arch_idle_time(cpu); return iowait; @@ -45,7 +45,7 @@ static u64 get_iowait_time(int cpu) #else -static u64 get_idle_time(int cpu) +static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu) { u64 idle, idle_usecs = -1ULL; @@ -54,14 +54,14 @@ static u64 get_idle_time(int cpu) if (idle_usecs == -1ULL) /* !NO_HZ or cpu offline so we can rely on cpustat.idle */ - idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE]; + idle = kcs->cpustat[CPUTIME_IDLE]; else idle = idle_usecs * NSEC_PER_USEC; return idle; } -static u64 get_iowait_time(int cpu) +static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu) { u64 iowait, iowait_usecs = -1ULL; @@ -70,7 +70,7 @@ static u64 get_iowait_time(int cpu) if (iowait_usecs == -1ULL) /* !NO_HZ or cpu offline so we can rely on cpustat.iowait */ - iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT]; + iowait = kcs->cpustat[CPUTIME_IOWAIT]; else iowait = iowait_usecs * NSEC_PER_USEC; @@ -95,16 
+95,18 @@ static int show_stat(struct seq_file *p, void *v) getboottime64(&boottime); for_each_possible_cpu(i) { - user += kcpustat_cpu(i).cpustat[CPUTIME_USER]; - nice += kcpustat_cpu(i).cpustat[CPUTIME_NICE]; - system += kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]; - idle += get_idle_time(i); - iowait += get_iowait_time(i); - irq += kcpustat_cpu(i).cpustat[CPUTIME_IRQ]; - softirq += kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]; - steal += kcpustat_cpu(i).cpustat[CPUTIME_STEAL]; - guest += kcpustat_cpu(i).cpustat[CPUTIME_GUEST]; - guest_nice += kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE]; + struct kernel_cpustat *kcs = &kcpustat_cpu(i); + + user += kcs->cpustat[CPUTIME_USER]; + nice += kcs->cpustat[CPUTIME_NICE]; + system += kcs->cpustat[CPUTIME_SYSTEM]; + idle += get_idle_time(kcs, i); + iowait += get_iowait_time(kcs, i); + irq += kcs->cpustat[CPUTIME_IRQ]; + softirq += kcs->cpustat[CPUTIME_SOFTIRQ]; + steal += kcs->cpustat[CPUTIME_STEAL]; + guest += kcs->cpustat[CPUTIME_GUEST]; + guest_nice += kcs->cpustat[CPUTIME_GUEST_NICE]; sum += kstat_cpu_irqs_sum(i); sum += arch_irq_stat_cpu(i); @@ -130,17 +132,19 @@ static int show_stat(struct seq_file *p, void *v) seq_putc(p, '\n'); for_each_online_cpu(i) { + struct kernel_cpustat *kcs = &kcpustat_cpu(i); + /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ - user = kcpustat_cpu(i).cpustat[CPUTIME_USER]; - nice = kcpustat_cpu(i).cpustat[CPUTIME_NICE]; - system = kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]; - idle = get_idle_time(i); - iowait = get_iowait_time(i); - irq = kcpustat_cpu(i).cpustat[CPUTIME_IRQ]; - softirq = kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]; - steal = kcpustat_cpu(i).cpustat[CPUTIME_STEAL]; - guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST]; - guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE]; + user = kcs->cpustat[CPUTIME_USER]; + nice = kcs->cpustat[CPUTIME_NICE]; + system = kcs->cpustat[CPUTIME_SYSTEM]; + idle = get_idle_time(kcs, i); + iowait = get_iowait_time(kcs, i); + irq = 
kcs->cpustat[CPUTIME_IRQ]; + softirq = kcs->cpustat[CPUTIME_SOFTIRQ]; + steal = kcs->cpustat[CPUTIME_STEAL]; + guest = kcs->cpustat[CPUTIME_GUEST]; + guest_nice = kcs->cpustat[CPUTIME_GUEST_NICE]; seq_printf(p, "cpu%d", i); seq_put_decimal_ull(p, " ", nsec_to_clock_t(user)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice)); -- cgit v1.2.3 From 08b55775133b77acc9975ad772b41813cbfea674 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 5 Mar 2019 15:50:35 -0800 Subject: proc: use seq_puts() everywhere seq_printf() without format specifiers == faster seq_puts() Link: http://lkml.kernel.org/r/20190114200545.GC9680@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 16 ++++++++-------- fs/proc/base.c | 2 +- fs/proc/task_nommu.c | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/proc/array.c b/fs/proc/array.c index 9d428d5a0ac8..2edbb657f859 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -343,28 +343,28 @@ static inline void task_seccomp(struct seq_file *m, struct task_struct *p) #ifdef CONFIG_SECCOMP seq_put_decimal_ull(m, "\nSeccomp:\t", p->seccomp.mode); #endif - seq_printf(m, "\nSpeculation_Store_Bypass:\t"); + seq_puts(m, "\nSpeculation_Store_Bypass:\t"); switch (arch_prctl_spec_ctrl_get(p, PR_SPEC_STORE_BYPASS)) { case -EINVAL: - seq_printf(m, "unknown"); + seq_puts(m, "unknown"); break; case PR_SPEC_NOT_AFFECTED: - seq_printf(m, "not vulnerable"); + seq_puts(m, "not vulnerable"); break; case PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE: - seq_printf(m, "thread force mitigated"); + seq_puts(m, "thread force mitigated"); break; case PR_SPEC_PRCTL | PR_SPEC_DISABLE: - seq_printf(m, "thread mitigated"); + seq_puts(m, "thread mitigated"); break; case PR_SPEC_PRCTL | PR_SPEC_ENABLE: - seq_printf(m, "thread vulnerable"); + seq_puts(m, "thread vulnerable"); break; case PR_SPEC_DISABLE: - seq_printf(m, "globally mitigated"); + seq_puts(m, "globally 
mitigated"); break; default: - seq_printf(m, "vulnerable"); + seq_puts(m, "vulnerable"); break; } seq_putc(m, '\n'); diff --git a/fs/proc/base.c b/fs/proc/base.c index fcb0180d3988..511b279ec69c 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -456,7 +456,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { if (unlikely(!sched_info_on())) - seq_printf(m, "0 0 0\n"); + seq_puts(m, "0 0 0\n"); else seq_printf(m, "%llu %llu %lu\n", (unsigned long long)task->se.sum_exec_runtime, diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 0b63d68dedb2..3b7e310297d2 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -178,7 +178,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) seq_file_path(m, file, ""); } else if (mm && is_stack(vma)) { seq_pad(m, ' '); - seq_printf(m, "[stack]"); + seq_puts(m, "[stack]"); } seq_putc(m, '\n'); -- cgit v1.2.3