diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2021-04-29 11:06:13 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2021-04-29 11:06:13 -0700 |
commit | 3644286f6cbcea86f6fa4d308e7ac06bf2a3715a (patch) | |
tree | d052231d1029d3e4f783c9b74b295eb07d194130 /fs | |
parent | 767fcbc80f63d7f08ff6c0858fe33583e6fdd327 (diff) | |
parent | 59cda49ecf6c9a32fae4942420701b6e087204f6 (diff) | |
download | linux-3644286f6cbcea86f6fa4d308e7ac06bf2a3715a.tar.bz2 |
Merge tag 'fsnotify_for_v5.13-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs
Pull fsnotify updates from Jan Kara:
- support for limited fanotify functionality for unpriviledged users
- faster merging of fanotify events
- a few smaller fsnotify improvements
* tag 'fsnotify_for_v5.13-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs:
shmem: allow reporting fanotify events with file handles on tmpfs
fs: introduce a wrapper uuid_to_fsid()
fanotify_user: use upper_32_bits() to verify mask
fanotify: support limited functionality for unprivileged users
fanotify: configurable limits via sysfs
fanotify: limit number of event merge attempts
fsnotify: use hash table for faster events merge
fanotify: mix event info and pid into merge key hash
fanotify: reduce event objectid to 29-bit hash
fsnotify: allow fsnotify_{peek,remove}_first_event with empty queue
Diffstat (limited to 'fs')
-rw-r--r-- | fs/ext2/super.c | 5 | ||||
-rw-r--r-- | fs/ext4/super.c | 5 | ||||
-rw-r--r-- | fs/notify/fanotify/fanotify.c | 166 | ||||
-rw-r--r-- | fs/notify/fanotify/fanotify.h | 46 | ||||
-rw-r--r-- | fs/notify/fanotify/fanotify_user.c | 219 | ||||
-rw-r--r-- | fs/notify/fdinfo.c | 3 | ||||
-rw-r--r-- | fs/notify/group.c | 1 | ||||
-rw-r--r-- | fs/notify/inotify/inotify_fsnotify.c | 9 | ||||
-rw-r--r-- | fs/notify/inotify/inotify_user.c | 7 | ||||
-rw-r--r-- | fs/notify/mark.c | 4 | ||||
-rw-r--r-- | fs/notify/notification.c | 64 | ||||
-rw-r--r-- | fs/zonefs/super.c | 5 |
12 files changed, 395 insertions, 139 deletions
diff --git a/fs/ext2/super.c b/fs/ext2/super.c index d2fd9707e953..21e09fbaa46f 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1399,7 +1399,6 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf) struct super_block *sb = dentry->d_sb; struct ext2_sb_info *sbi = EXT2_SB(sb); struct ext2_super_block *es = sbi->s_es; - u64 fsid; spin_lock(&sbi->s_lock); @@ -1453,9 +1452,7 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf) buf->f_ffree = ext2_count_free_inodes(sb); es->s_free_inodes_count = cpu_to_le32(buf->f_ffree); buf->f_namelen = EXT2_NAME_LEN; - fsid = le64_to_cpup((void *)es->s_uuid) ^ - le64_to_cpup((void *)es->s_uuid + sizeof(u64)); - buf->f_fsid = u64_to_fsid(fsid); + buf->f_fsid = uuid_to_fsid(es->s_uuid); spin_unlock(&sbi->s_lock); return 0; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index b9693680463a..3868377dec2d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -6153,7 +6153,6 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; ext4_fsblk_t overhead = 0, resv_blocks; - u64 fsid; s64 bfree; resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters)); @@ -6174,9 +6173,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_files = le32_to_cpu(es->s_inodes_count); buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); buf->f_namelen = EXT4_NAME_LEN; - fsid = le64_to_cpup((void *)es->s_uuid) ^ - le64_to_cpup((void *)es->s_uuid + sizeof(u64)); - buf->f_fsid = u64_to_fsid(fsid); + buf->f_fsid = uuid_to_fsid(es->s_uuid); #ifdef CONFIG_QUOTA if (ext4_test_inode_flag(dentry->d_inode, EXT4_INODE_PROJINHERIT) && diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 1192c9953620..057abd2cf887 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -14,6 +14,7 @@ #include <linux/audit.h> #include <linux/sched/mm.h> #include <linux/statfs.h> +#include <linux/stringhash.h> #include "fanotify.h" @@ -22,12 +23,24 @@ static bool fanotify_path_equal(struct path *p1, struct path *p2) return p1->mnt == p2->mnt && p1->dentry == p2->dentry; } +static unsigned int fanotify_hash_path(const struct path *path) +{ + return hash_ptr(path->dentry, FANOTIFY_EVENT_HASH_BITS) ^ + hash_ptr(path->mnt, FANOTIFY_EVENT_HASH_BITS); +} + static inline bool fanotify_fsid_equal(__kernel_fsid_t *fsid1, __kernel_fsid_t *fsid2) { return fsid1->val[0] == fsid2->val[0] && fsid1->val[1] == fsid2->val[1]; } +static unsigned int fanotify_hash_fsid(__kernel_fsid_t *fsid) +{ + return hash_32(fsid->val[0], FANOTIFY_EVENT_HASH_BITS) ^ + hash_32(fsid->val[1], FANOTIFY_EVENT_HASH_BITS); +} + static bool fanotify_fh_equal(struct fanotify_fh *fh1, struct fanotify_fh *fh2) { @@ -38,6 +51,16 @@ static bool fanotify_fh_equal(struct fanotify_fh *fh1, !memcmp(fanotify_fh_buf(fh1), fanotify_fh_buf(fh2), fh1->len); } +static unsigned int fanotify_hash_fh(struct fanotify_fh *fh) +{ + long salt = (long)fh->type | (long)fh->len << 8; + + /* + * full_name_hash() works long by long, so it handles fh buf optimally. + */ + return full_name_hash((void *)salt, fanotify_fh_buf(fh), fh->len); +} + static bool fanotify_fid_event_equal(struct fanotify_fid_event *ffe1, struct fanotify_fid_event *ffe2) { @@ -88,16 +111,12 @@ static bool fanotify_name_event_equal(struct fanotify_name_event *fne1, return fanotify_info_equal(info1, info2); } -static bool fanotify_should_merge(struct fsnotify_event *old_fsn, - struct fsnotify_event *new_fsn) +static bool fanotify_should_merge(struct fanotify_event *old, + struct fanotify_event *new) { - struct fanotify_event *old, *new; - - pr_debug("%s: old=%p new=%p\n", __func__, old_fsn, new_fsn); - old = FANOTIFY_E(old_fsn); - new = FANOTIFY_E(new_fsn); + pr_debug("%s: old=%p new=%p\n", __func__, old, new); - if (old_fsn->objectid != new_fsn->objectid || + if (old->hash != new->hash || old->type != new->type || old->pid != new->pid) return false; @@ -129,14 +148,20 @@ static bool fanotify_should_merge(struct fsnotify_event *old_fsn, return false; } +/* Limit event merges to limit CPU overhead per event */ +#define FANOTIFY_MAX_MERGE_EVENTS 128 + /* and the list better be locked by something too! */ -static int fanotify_merge(struct list_head *list, struct fsnotify_event *event) +static int fanotify_merge(struct fsnotify_group *group, + struct fsnotify_event *event) { - struct fsnotify_event *test_event; - struct fanotify_event *new; + struct fanotify_event *old, *new = FANOTIFY_E(event); + unsigned int bucket = fanotify_event_hash_bucket(group, new); + struct hlist_head *hlist = &group->fanotify_data.merge_hash[bucket]; + int i = 0; - pr_debug("%s: list=%p event=%p\n", __func__, list, event); - new = FANOTIFY_E(event); + pr_debug("%s: group=%p event=%p bucket=%u\n", __func__, + group, event, bucket); /* * Don't merge a permission event with any other event so that we know @@ -146,9 +171,11 @@ static int fanotify_merge(struct list_head *list, struct fsnotify_event *event) if (fanotify_is_perm_event(new->mask)) return 0; - list_for_each_entry_reverse(test_event, list, list) { - if (fanotify_should_merge(test_event, event)) { - FANOTIFY_E(test_event)->mask |= new->mask; + hlist_for_each_entry(old, hlist, merge_list) { + if (++i > FANOTIFY_MAX_MERGE_EVENTS) + break; + if (fanotify_should_merge(old, new)) { + old->mask |= new->mask; return 1; } } @@ -184,8 +211,11 @@ static int fanotify_get_response(struct fsnotify_group *group, return ret; } /* Event not yet reported? Just remove it. */ - if (event->state == FAN_EVENT_INIT) + if (event->state == FAN_EVENT_INIT) { fsnotify_remove_queued_event(group, &event->fae.fse); + /* Permission events are not supposed to be hashed */ + WARN_ON_ONCE(!hlist_unhashed(&event->fae.merge_list)); + } /* * Event may be also answered in case signal delivery raced * with wakeup. In that case we have nothing to do besides @@ -329,7 +359,8 @@ static int fanotify_encode_fh_len(struct inode *inode) * Return 0 on failure to encode. */ static int fanotify_encode_fh(struct fanotify_fh *fh, struct inode *inode, - unsigned int fh_len, gfp_t gfp) + unsigned int fh_len, unsigned int *hash, + gfp_t gfp) { int dwords, type = 0; char *ext_buf = NULL; @@ -372,6 +403,9 @@ static int fanotify_encode_fh(struct fanotify_fh *fh, struct inode *inode, fh->type = type; fh->len = fh_len; + /* Mix fh into event merge key */ + *hash ^= fanotify_hash_fh(fh); + return FANOTIFY_FH_HDR_LEN + fh_len; out_err: @@ -425,6 +459,7 @@ static struct inode *fanotify_dfid_inode(u32 event_mask, const void *data, } static struct fanotify_event *fanotify_alloc_path_event(const struct path *path, + unsigned int *hash, gfp_t gfp) { struct fanotify_path_event *pevent; @@ -435,6 +470,7 @@ static struct fanotify_event *fanotify_alloc_path_event(const struct path *path, pevent->fae.type = FANOTIFY_EVENT_TYPE_PATH; pevent->path = *path; + *hash ^= fanotify_hash_path(path); path_get(path); return &pevent->fae; @@ -460,6 +496,7 @@ static struct fanotify_event *fanotify_alloc_perm_event(const struct path *path, static struct fanotify_event *fanotify_alloc_fid_event(struct inode *id, __kernel_fsid_t *fsid, + unsigned int *hash, gfp_t gfp) { struct fanotify_fid_event *ffe; @@ -470,16 +507,18 @@ static struct fanotify_event *fanotify_alloc_fid_event(struct inode *id, ffe->fae.type = FANOTIFY_EVENT_TYPE_FID; ffe->fsid = *fsid; + *hash ^= fanotify_hash_fsid(fsid); fanotify_encode_fh(&ffe->object_fh, id, fanotify_encode_fh_len(id), - gfp); + hash, gfp); return &ffe->fae; } static struct fanotify_event *fanotify_alloc_name_event(struct inode *id, __kernel_fsid_t *fsid, - const struct qstr *file_name, + const struct qstr *name, struct inode *child, + unsigned int *hash, gfp_t gfp) { struct fanotify_name_event *fne; @@ -492,24 +531,30 @@ static struct fanotify_event *fanotify_alloc_name_event(struct inode *id, size = sizeof(*fne) + FANOTIFY_FH_HDR_LEN + dir_fh_len; if (child_fh_len) size += FANOTIFY_FH_HDR_LEN + child_fh_len; - if (file_name) - size += file_name->len + 1; + if (name) + size += name->len + 1; fne = kmalloc(size, gfp); if (!fne) return NULL; fne->fae.type = FANOTIFY_EVENT_TYPE_FID_NAME; fne->fsid = *fsid; + *hash ^= fanotify_hash_fsid(fsid); info = &fne->info; fanotify_info_init(info); dfh = fanotify_info_dir_fh(info); - info->dir_fh_totlen = fanotify_encode_fh(dfh, id, dir_fh_len, 0); + info->dir_fh_totlen = fanotify_encode_fh(dfh, id, dir_fh_len, hash, 0); if (child_fh_len) { ffh = fanotify_info_file_fh(info); - info->file_fh_totlen = fanotify_encode_fh(ffh, child, child_fh_len, 0); + info->file_fh_totlen = fanotify_encode_fh(ffh, child, + child_fh_len, hash, 0); + } + if (name) { + long salt = name->len; + + fanotify_info_copy_name(info, name); + *hash ^= full_name_hash((void *)salt, name->name, name->len); } - if (file_name) - fanotify_info_copy_name(info, file_name); pr_debug("%s: ino=%lu size=%u dir_fh_len=%u child_fh_len=%u name_len=%u name='%.*s'\n", __func__, id->i_ino, size, dir_fh_len, child_fh_len, @@ -533,6 +578,9 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, struct mem_cgroup *old_memcg; struct inode *child = NULL; bool name_event = false; + unsigned int hash = 0; + bool ondir = mask & FAN_ONDIR; + struct pid *pid; if ((fid_mode & FAN_REPORT_DIR_FID) && dirid) { /* @@ -540,8 +588,7 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, * report the child fid for events reported on a non-dir child * in addition to reporting the parent fid and maybe child name. */ - if ((fid_mode & FAN_REPORT_FID) && - id != dirid && !(mask & FAN_ONDIR)) + if ((fid_mode & FAN_REPORT_FID) && id != dirid && !ondir) child = id; id = dirid; @@ -562,8 +609,7 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, if (!(fid_mode & FAN_REPORT_NAME)) { name_event = !!child; file_name = NULL; - } else if ((mask & ALL_FSNOTIFY_DIRENT_EVENTS) || - !(mask & FAN_ONDIR)) { + } else if ((mask & ALL_FSNOTIFY_DIRENT_EVENTS) || !ondir) { name_event = true; } } @@ -586,26 +632,25 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, event = fanotify_alloc_perm_event(path, gfp); } else if (name_event && (file_name || child)) { event = fanotify_alloc_name_event(id, fsid, file_name, child, - gfp); + &hash, gfp); } else if (fid_mode) { - event = fanotify_alloc_fid_event(id, fsid, gfp); + event = fanotify_alloc_fid_event(id, fsid, &hash, gfp); } else { - event = fanotify_alloc_path_event(path, gfp); + event = fanotify_alloc_path_event(path, &hash, gfp); } if (!event) goto out; - /* - * Use the victim inode instead of the watching inode as the id for - * event queue, so event reported on parent is merged with event - * reported on child when both directory and child watches exist. - */ - fanotify_init_event(event, (unsigned long)id, mask); if (FAN_GROUP_FLAG(group, FAN_REPORT_TID)) - event->pid = get_pid(task_pid(current)); + pid = get_pid(task_pid(current)); else - event->pid = get_pid(task_tgid(current)); + pid = get_pid(task_tgid(current)); + + /* Mix event info, FAN_ONDIR flag and pid into event merge key */ + hash ^= hash_long((unsigned long)pid | ondir, FANOTIFY_EVENT_HASH_BITS); + fanotify_init_event(event, hash, mask); + event->pid = pid; out: set_active_memcg(old_memcg); @@ -645,6 +690,24 @@ static __kernel_fsid_t fanotify_get_fsid(struct fsnotify_iter_info *iter_info) return fsid; } +/* + * Add an event to hash table for faster merge. + */ +static void fanotify_insert_event(struct fsnotify_group *group, + struct fsnotify_event *fsn_event) +{ + struct fanotify_event *event = FANOTIFY_E(fsn_event); + unsigned int bucket = fanotify_event_hash_bucket(group, event); + struct hlist_head *hlist = &group->fanotify_data.merge_hash[bucket]; + + assert_spin_locked(&group->notification_lock); + + pr_debug("%s: group=%p event=%p bucket=%u\n", __func__, + group, event, bucket); + + hlist_add_head(&event->merge_list, hlist); +} + static int fanotify_handle_event(struct fsnotify_group *group, u32 mask, const void *data, int data_type, struct inode *dir, @@ -715,7 +778,9 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask, } fsn_event = &event->fse; - ret = fsnotify_add_event(group, fsn_event, fanotify_merge); + ret = fsnotify_add_event(group, fsn_event, fanotify_merge, + fanotify_is_hashed_event(mask) ? + fanotify_insert_event : NULL); if (ret) { /* Permission events shouldn't be merged */ BUG_ON(ret == 1 && mask & FANOTIFY_PERM_EVENTS); @@ -736,11 +801,10 @@ finish: static void fanotify_free_group_priv(struct fsnotify_group *group) { - struct user_struct *user; - - user = group->fanotify_data.user; - atomic_dec(&user->fanotify_listeners); - free_uid(user); + kfree(group->fanotify_data.merge_hash); + if (group->fanotify_data.ucounts) + dec_ucount(group->fanotify_data.ucounts, + UCOUNT_FANOTIFY_GROUPS); } static void fanotify_free_path_event(struct fanotify_event *event) @@ -796,6 +860,13 @@ static void fanotify_free_event(struct fsnotify_event *fsn_event) } } +static void fanotify_freeing_mark(struct fsnotify_mark *mark, + struct fsnotify_group *group) +{ + if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS)) + dec_ucount(group->fanotify_data.ucounts, UCOUNT_FANOTIFY_MARKS); +} + static void fanotify_free_mark(struct fsnotify_mark *fsn_mark) { kmem_cache_free(fanotify_mark_cache, fsn_mark); @@ -805,5 +876,6 @@ const struct fsnotify_ops fanotify_fsnotify_ops = { .handle_event = fanotify_handle_event, .free_group_priv = fanotify_free_group_priv, .free_event = fanotify_free_event, + .freeing_mark = fanotify_freeing_mark, .free_mark = fanotify_free_mark, }; diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 896c819a1786..4a5e555dc3d2 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -3,6 +3,7 @@ #include <linux/path.h> #include <linux/slab.h> #include <linux/exportfs.h> +#include <linux/hashtable.h> extern struct kmem_cache *fanotify_mark_cache; extern struct kmem_cache *fanotify_fid_event_cachep; @@ -115,6 +116,11 @@ static inline void fanotify_info_init(struct fanotify_info *info) info->name_len = 0; } +static inline unsigned int fanotify_info_len(struct fanotify_info *info) +{ + return info->dir_fh_totlen + info->file_fh_totlen + info->name_len; +} + static inline void fanotify_info_copy_name(struct fanotify_info *info, const struct qstr *name) { @@ -135,19 +141,31 @@ enum fanotify_event_type { FANOTIFY_EVENT_TYPE_PATH, FANOTIFY_EVENT_TYPE_PATH_PERM, FANOTIFY_EVENT_TYPE_OVERFLOW, /* struct fanotify_event */ + __FANOTIFY_EVENT_TYPE_NUM }; +#define FANOTIFY_EVENT_TYPE_BITS \ + (ilog2(__FANOTIFY_EVENT_TYPE_NUM - 1) + 1) +#define FANOTIFY_EVENT_HASH_BITS \ + (32 - FANOTIFY_EVENT_TYPE_BITS) + struct fanotify_event { struct fsnotify_event fse; + struct hlist_node merge_list; /* List for hashed merge */ u32 mask; - enum fanotify_event_type type; + struct { + unsigned int type : FANOTIFY_EVENT_TYPE_BITS; + unsigned int hash : FANOTIFY_EVENT_HASH_BITS; + }; struct pid *pid; }; static inline void fanotify_init_event(struct fanotify_event *event, - unsigned long id, u32 mask) + unsigned int hash, u32 mask) { - fsnotify_init_event(&event->fse, id); + fsnotify_init_event(&event->fse); + INIT_HLIST_NODE(&event->merge_list); + event->hash = hash; event->mask = mask; event->pid = NULL; } @@ -284,3 +302,25 @@ static inline struct path *fanotify_event_path(struct fanotify_event *event) else return NULL; } + +/* + * Use 128 size hash table to speed up events merge. + */ +#define FANOTIFY_HTABLE_BITS (7) +#define FANOTIFY_HTABLE_SIZE (1 << FANOTIFY_HTABLE_BITS) +#define FANOTIFY_HTABLE_MASK (FANOTIFY_HTABLE_SIZE - 1) + +/* + * Permission events and overflow event do not get merged - don't hash them. + */ +static inline bool fanotify_is_hashed_event(u32 mask) +{ + return !fanotify_is_perm_event(mask) && !(mask & FS_Q_OVERFLOW); +} + +static inline unsigned int fanotify_event_hash_bucket( + struct fsnotify_group *group, + struct fanotify_event *event) +{ + return event->hash & FANOTIFY_HTABLE_MASK; +} diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 9e0c1afac8bd..71fefb30e015 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -27,8 +27,61 @@ #include "fanotify.h" #define FANOTIFY_DEFAULT_MAX_EVENTS 16384 -#define FANOTIFY_DEFAULT_MAX_MARKS 8192 -#define FANOTIFY_DEFAULT_MAX_LISTENERS 128 +#define FANOTIFY_OLD_DEFAULT_MAX_MARKS 8192 +#define FANOTIFY_DEFAULT_MAX_GROUPS 128 + +/* + * Legacy fanotify marks limits (8192) is per group and we introduced a tunable + * limit of marks per user, similar to inotify. Effectively, the legacy limit + * of fanotify marks per user is <max marks per group> * <max groups per user>. + * This default limit (1M) also happens to match the increased limit of inotify + * max_user_watches since v5.10. + */ +#define FANOTIFY_DEFAULT_MAX_USER_MARKS \ + (FANOTIFY_OLD_DEFAULT_MAX_MARKS * FANOTIFY_DEFAULT_MAX_GROUPS) + +/* + * Most of the memory cost of adding an inode mark is pinning the marked inode. + * The size of the filesystem inode struct is not uniform across filesystems, + * so double the size of a VFS inode is used as a conservative approximation. + */ +#define INODE_MARK_COST (2 * sizeof(struct inode)) + +/* configurable via /proc/sys/fs/fanotify/ */ +static int fanotify_max_queued_events __read_mostly; + +#ifdef CONFIG_SYSCTL + +#include <linux/sysctl.h> + +struct ctl_table fanotify_table[] = { + { + .procname = "max_user_groups", + .data = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "max_user_marks", + .data = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "max_queued_events", + .data = &fanotify_max_queued_events, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO + }, + { } +}; +#endif /* CONFIG_SYSCTL */ /* * All flags that may be specified in parameter event_f_flags of fanotify_init. @@ -90,6 +143,23 @@ static int fanotify_event_info_len(unsigned int fid_mode, } /* + * Remove an hashed event from merge hash table. + */ +static void fanotify_unhash_event(struct fsnotify_group *group, + struct fanotify_event *event) +{ + assert_spin_locked(&group->notification_lock); + + pr_debug("%s: group=%p event=%p bucket=%u\n", __func__, + group, event, fanotify_event_hash_bucket(group, event)); + + if (WARN_ON_ONCE(hlist_unhashed(&event->merge_list))) + return; + + hlist_del_init(&event->merge_list); +} + +/* * Get an fanotify notification event if one exists and is small * enough to fit in "count". Return an error pointer if the count * is not large enough. When permission event is dequeued, its state is @@ -100,26 +170,34 @@ static struct fanotify_event *get_one_event(struct fsnotify_group *group, { size_t event_size = FAN_EVENT_METADATA_LEN; struct fanotify_event *event = NULL; + struct fsnotify_event *fsn_event; unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS); pr_debug("%s: group=%p count=%zd\n", __func__, group, count); spin_lock(&group->notification_lock); - if (fsnotify_notify_queue_is_empty(group)) + fsn_event = fsnotify_peek_first_event(group); + if (!fsn_event) goto out; - if (fid_mode) { - event_size += fanotify_event_info_len(fid_mode, - FANOTIFY_E(fsnotify_peek_first_event(group))); - } + event = FANOTIFY_E(fsn_event); + if (fid_mode) + event_size += fanotify_event_info_len(fid_mode, event); if (event_size > count) { event = ERR_PTR(-EINVAL); goto out; } - event = FANOTIFY_E(fsnotify_remove_first_event(group)); + + /* + * Held the notification_lock the whole time, so this is the + * same event we peeked above. + */ + fsnotify_remove_first_event(group); if (fanotify_is_perm_event(event->mask)) FANOTIFY_PERM(event)->state = FAN_EVENT_REPORTED; + if (fanotify_is_hashed_event(event->mask)) + fanotify_unhash_event(group, event); out: spin_unlock(&group->notification_lock); return event; @@ -341,6 +419,14 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, metadata.reserved = 0; metadata.mask = event->mask & FANOTIFY_OUTGOING_EVENTS; metadata.pid = pid_vnr(event->pid); + /* + * For an unprivileged listener, event->pid can be used to identify the + * events generated by the listener process itself, without disclosing + * the pids of other processes. + */ + if (!capable(CAP_SYS_ADMIN) && + task_tgid(current) != event->pid) + metadata.pid = 0; if (path && path->mnt && path->dentry) { fd = create_fd(group, path, &f); @@ -573,6 +659,7 @@ static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t static int fanotify_release(struct inode *ignored, struct file *file) { struct fsnotify_group *group = file->private_data; + struct fsnotify_event *fsn_event; /* * Stop new events from arriving in the notification queue. since @@ -601,13 +688,12 @@ static int fanotify_release(struct inode *ignored, struct file *file) * dequeue them and set the response. They will be freed once the * response is consumed and fanotify_get_response() returns. */ - while (!fsnotify_notify_queue_is_empty(group)) { - struct fanotify_event *event; + while ((fsn_event = fsnotify_remove_first_event(group))) { + struct fanotify_event *event = FANOTIFY_E(fsn_event); - event = FANOTIFY_E(fsnotify_remove_first_event(group)); if (!(event->mask & FANOTIFY_PERM_EVENTS)) { spin_unlock(&group->notification_lock); - fsnotify_destroy_event(group, &event->fse); + fsnotify_destroy_event(group, fsn_event); } else { finish_permission_event(group, FANOTIFY_PERM(event), FAN_ALLOW); @@ -822,24 +908,38 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, unsigned int type, __kernel_fsid_t *fsid) { + struct ucounts *ucounts = group->fanotify_data.ucounts; struct fsnotify_mark *mark; int ret; - if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) + /* + * Enforce per user marks limits per user in all containing user ns. + * A group with FAN_UNLIMITED_MARKS does not contribute to mark count + * in the limited groups account. + */ + if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS) && + !inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_FANOTIFY_MARKS)) return ERR_PTR(-ENOSPC); mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); - if (!mark) - return ERR_PTR(-ENOMEM); + if (!mark) { + ret = -ENOMEM; + goto out_dec_ucounts; + } fsnotify_init_mark(mark, group); ret = fsnotify_add_mark_locked(mark, connp, type, 0, fsid); if (ret) { fsnotify_put_mark(mark); - return ERR_PTR(ret); + goto out_dec_ucounts; } return mark; + +out_dec_ucounts: + if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS)) + dec_ucount(ucounts, UCOUNT_FANOTIFY_MARKS); + return ERR_PTR(ret); } @@ -919,20 +1019,41 @@ static struct fsnotify_event *fanotify_alloc_overflow_event(void) return &oevent->fse; } +static struct hlist_head *fanotify_alloc_merge_hash(void) +{ + struct hlist_head *hash; + + hash = kmalloc(sizeof(struct hlist_head) << FANOTIFY_HTABLE_BITS, + GFP_KERNEL_ACCOUNT); + if (!hash) + return NULL; + + __hash_init(hash, FANOTIFY_HTABLE_SIZE); + + return hash; +} + /* fanotify syscalls */ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) { struct fsnotify_group *group; int f_flags, fd; - struct user_struct *user; unsigned int fid_mode = flags & FANOTIFY_FID_BITS; unsigned int class = flags & FANOTIFY_CLASS_BITS; pr_debug("%s: flags=%x event_f_flags=%x\n", __func__, flags, event_f_flags); - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; + if (!capable(CAP_SYS_ADMIN)) { + /* + * An unprivileged user can setup an fanotify group with + * limited functionality - an unprivileged group is limited to + * notification events with file handles and it cannot use + * unlimited queue/marks. + */ + if ((flags & FANOTIFY_ADMIN_INIT_FLAGS) || !fid_mode) + return -EPERM; + } #ifdef CONFIG_AUDITSYSCALL if (flags & ~(FANOTIFY_INIT_FLAGS | FAN_ENABLE_AUDIT)) @@ -963,12 +1084,6 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) if ((fid_mode & FAN_REPORT_NAME) && !(fid_mode & FAN_REPORT_DIR_FID)) return -EINVAL; - user = get_current_user(); - if (atomic_read(&user->fanotify_listeners) > FANOTIFY_DEFAULT_MAX_LISTENERS) { - free_uid(user); - return -EMFILE; - } - f_flags = O_RDWR | FMODE_NONOTIFY; if (flags & FAN_CLOEXEC) f_flags |= O_CLOEXEC; @@ -978,15 +1093,27 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) /* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */ group = fsnotify_alloc_user_group(&fanotify_fsnotify_ops); if (IS_ERR(group)) { - free_uid(user); return PTR_ERR(group); } - group->fanotify_data.user = user; + /* Enforce groups limits per user in all containing user ns */ + group->fanotify_data.ucounts = inc_ucount(current_user_ns(), + current_euid(), + UCOUNT_FANOTIFY_GROUPS); + if (!group->fanotify_data.ucounts) { + fd = -EMFILE; + goto out_destroy_group; + } + group->fanotify_data.flags = flags; - atomic_inc(&user->fanotify_listeners); group->memcg = get_mem_cgroup_from_mm(current->mm); + group->fanotify_data.merge_hash = fanotify_alloc_merge_hash(); + if (!group->fanotify_data.merge_hash) { + fd = -ENOMEM; + goto out_destroy_group; + } + group->overflow_event = fanotify_alloc_overflow_event(); if (unlikely(!group->overflow_event)) { fd = -ENOMEM; @@ -1019,16 +1146,13 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) goto out_destroy_group; group->max_events = UINT_MAX; } else { - group->max_events = FANOTIFY_DEFAULT_MAX_EVENTS; + group->max_events = fanotify_max_queued_events; } if (flags & FAN_UNLIMITED_MARKS) { fd = -EPERM; if (!capable(CAP_SYS_ADMIN)) goto out_destroy_group; - group->fanotify_data.max_marks = UINT_MAX; - } else { - group->fanotify_data.max_marks = FANOTIFY_DEFAULT_MAX_MARKS; } if (flags & FAN_ENABLE_AUDIT) { @@ -1126,7 +1250,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, __func__, fanotify_fd, flags, dfd, pathname, mask); /* we only use the lower 32 bits as of right now. */ - if (mask & ((__u64)0xffffffff << 32)) + if (upper_32_bits(mask)) return -EINVAL; if (flags & ~FANOTIFY_MARK_FLAGS) @@ -1181,6 +1305,15 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, group = f.file->private_data; /* + * An unprivileged user is not allowed to watch a mount point nor + * a filesystem. + */ + ret = -EPERM; + if (!capable(CAP_SYS_ADMIN) && + mark_type != FAN_MARK_INODE) + goto fput_and_out; + + /* * group->priority == FS_PRIO_0 == FAN_CLASS_NOTIF. These are not * allowed to set permissions events. */ @@ -1312,6 +1445,21 @@ SYSCALL32_DEFINE6(fanotify_mark, */ static int __init fanotify_user_setup(void) { + struct sysinfo si; + int max_marks; + + si_meminfo(&si); + /* + * Allow up to 1% of addressable memory to be accounted for per user + * marks limited to the range [8192, 1048576]. mount and sb marks are + * a lot cheaper than inode marks, but there is no reason for a user + * to have many of those, so calculate by the cost of inode marks. + */ + max_marks = (((si.totalram - si.totalhigh) / 100) << PAGE_SHIFT) / + INODE_MARK_COST; + max_marks = clamp(max_marks, FANOTIFY_OLD_DEFAULT_MAX_MARKS, + FANOTIFY_DEFAULT_MAX_USER_MARKS); + BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 10); BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9); @@ -1326,6 +1474,11 @@ static int __init fanotify_user_setup(void) KMEM_CACHE(fanotify_perm_event, SLAB_PANIC); } + fanotify_max_queued_events = FANOTIFY_DEFAULT_MAX_EVENTS; + init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS] = + FANOTIFY_DEFAULT_MAX_GROUPS; + init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS] = max_marks; + return 0; } device_initcall(fanotify_user_setup); diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index f0d6b54be412..a712b2aaa9ac 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -144,7 +144,8 @@ void fanotify_show_fdinfo(struct seq_file *m, struct file *f) struct fsnotify_group *group = f->private_data; seq_printf(m, "fanotify flags:%x event-flags:%x\n", - group->fanotify_data.flags, group->fanotify_data.f_flags); + group->fanotify_data.flags, + group->fanotify_data.f_flags); show_fdinfo(m, f, fanotify_fdinfo); } diff --git a/fs/notify/group.c b/fs/notify/group.c index ffd723ffe46d..fb89c351295d 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -122,7 +122,6 @@ static struct fsnotify_group *__fsnotify_alloc_group( /* set to 0 when there a no external references to this group */ refcount_set(&group->refcnt, 1); - atomic_set(&group->num_marks, 0); atomic_set(&group->user_waits, 0); spin_lock_init(&group->notification_lock); diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 1901d799909b..d1a64daa0171 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -46,9 +46,10 @@ static bool event_compare(struct fsnotify_event *old_fsn, return false; } -static int inotify_merge(struct list_head *list, - struct fsnotify_event *event) +static int inotify_merge(struct fsnotify_group *group, + struct fsnotify_event *event) { + struct list_head *list = &group->notification_list; struct fsnotify_event *last_event; last_event = list_entry(list->prev, struct fsnotify_event, list); @@ -107,7 +108,7 @@ int inotify_handle_inode_event(struct fsnotify_mark *inode_mark, u32 mask, mask &= ~IN_ISDIR; fsn_event = &event->fse; - fsnotify_init_event(fsn_event, 0); + fsnotify_init_event(fsn_event); event->mask = mask; event->wd = i_mark->wd; event->sync_cookie = cookie; @@ -115,7 +116,7 @@ int inotify_handle_inode_event(struct fsnotify_mark *inode_mark, u32 mask, if (len) strcpy(event->name, name->name); - ret = fsnotify_add_event(group, fsn_event, inotify_merge); + ret = fsnotify_add_event(group, fsn_event, inotify_merge, NULL); if (ret) { /* Our event wasn't used in the end. Free it. */ fsnotify_destroy_event(group, fsn_event); diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index c71be4fb7dc5..98f61b31745a 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -146,10 +146,9 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, size_t event_size = sizeof(struct inotify_event); struct fsnotify_event *event; - if (fsnotify_notify_queue_is_empty(group)) - return NULL; - event = fsnotify_peek_first_event(group); + if (!event) + return NULL; pr_debug("%s: group=%p event=%p\n", __func__, group, event); @@ -642,7 +641,7 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events) return ERR_PTR(-ENOMEM); } group->overflow_event = &oevent->fse; - fsnotify_init_event(group->overflow_event, 0); + fsnotify_init_event(group->overflow_event); oevent->mask = FS_Q_OVERFLOW; oevent->wd = -1; oevent->sync_cookie = 0; diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 8387937b9d01..d32ab349db74 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -391,8 +391,6 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark) list_del_init(&mark->g_list); spin_unlock(&mark->lock); - atomic_dec(&group->num_marks); - /* Drop mark reference acquired in fsnotify_add_mark_locked() */ fsnotify_put_mark(mark); } @@ -656,7 +654,6 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED; list_add(&mark->g_list, &group->marks_list); - atomic_inc(&group->num_marks); fsnotify_get_mark(mark); /* for g_list */ spin_unlock(&mark->lock); @@ -674,7 +671,6 @@ err: FSNOTIFY_MARK_FLAG_ATTACHED); list_del_init(&mark->g_list); spin_unlock(&mark->lock); - atomic_dec(&group->num_marks); fsnotify_put_mark(mark); return ret; diff --git a/fs/notify/notification.c b/fs/notify/notification.c index 75d79d6d3ef0..32f45543b9c6 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -47,13 +47,6 @@ u32 fsnotify_get_cookie(void) } EXPORT_SYMBOL_GPL(fsnotify_get_cookie); -/* return true if the notify queue is empty, false otherwise */ -bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group) -{ - assert_spin_locked(&group->notification_lock); - return list_empty(&group->notification_list) ? true : false; -} - void fsnotify_destroy_event(struct fsnotify_group *group, struct fsnotify_event *event) { @@ -75,16 +68,22 @@ void fsnotify_destroy_event(struct fsnotify_group *group, } /* - * Add an event to the group notification queue. The group can later pull this - * event off the queue to deal with. The function returns 0 if the event was - * added to the queue, 1 if the event was merged with some other queued event, + * Try to add an event to the notification queue. + * The group can later pull this event off the queue to deal with. + * The group can use the @merge hook to merge the event with a queued event. + * The group can use the @insert hook to insert the event into hash table. + * The function returns: + * 0 if the event was added to a queue + * 1 if the event was merged with some other queued event * 2 if the event was not queued - either the queue of events has overflown - * or the group is shutting down. + * or the group is shutting down. */ int fsnotify_add_event(struct fsnotify_group *group, struct fsnotify_event *event, - int (*merge)(struct list_head *, - struct fsnotify_event *)) + int (*merge)(struct fsnotify_group *, + struct fsnotify_event *), + void (*insert)(struct fsnotify_group *, + struct fsnotify_event *)) { int ret = 0; struct list_head *list = &group->notification_list; @@ -111,7 +110,7 @@ int fsnotify_add_event(struct fsnotify_group *group, } if (!list_empty(list) && merge) { - ret = merge(list, event); + ret = merge(group, event); if (ret) { spin_unlock(&group->notification_lock); return ret; @@ -121,6 +120,8 @@ int fsnotify_add_event(struct fsnotify_group *group, queue: group->q_len++; list_add_tail(&event->list, list); + if (insert) + insert(group, event); spin_unlock(&group->notification_lock); wake_up(&group->notification_waitq); @@ -141,33 +142,36 @@ void fsnotify_remove_queued_event(struct fsnotify_group *group, } /* - * Remove and return the first event from the notification list. It is the - * responsibility of the caller to destroy the obtained event + * Return the first event on the notification list without removing it. + * Returns NULL if the list is empty. */ -struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group) +struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group) { - struct fsnotify_event *event; - assert_spin_locked(&group->notification_lock); - pr_debug("%s: group=%p\n", __func__, group); + if (fsnotify_notify_queue_is_empty(group)) + return NULL; - event = list_first_entry(&group->notification_list, - struct fsnotify_event, list); - fsnotify_remove_queued_event(group, event); - return event; + return list_first_entry(&group->notification_list, + struct fsnotify_event, list); } /* - * This will not remove the event, that must be done with - * fsnotify_remove_first_event() + * Remove and return the first event from the notification list. It is the + * responsibility of the caller to destroy the obtained event */ -struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group) +struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group) { - assert_spin_locked(&group->notification_lock); + struct fsnotify_event *event = fsnotify_peek_first_event(group); - return list_first_entry(&group->notification_list, - struct fsnotify_event, list); + if (!event) + return NULL; + + pr_debug("%s: group=%p event=%p\n", __func__, group, event); + + fsnotify_remove_queued_event(group, event); + + return event; } /* diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 049e36c69ed7..cd145d318b17 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -1177,7 +1177,6 @@ static int zonefs_statfs(struct dentry *dentry, struct kstatfs *buf) struct super_block *sb = dentry->d_sb; struct zonefs_sb_info *sbi = ZONEFS_SB(sb); enum zonefs_ztype t; - u64 fsid; buf->f_type = ZONEFS_MAGIC; buf->f_bsize = sb->s_blocksize; @@ -1200,9 +1199,7 @@ static int zonefs_statfs(struct dentry *dentry, struct kstatfs *buf) spin_unlock(&sbi->s_lock); - fsid = le64_to_cpup((void *)sbi->s_uuid.b) ^ - le64_to_cpup((void *)sbi->s_uuid.b + sizeof(u64)); - buf->f_fsid = u64_to_fsid(fsid); + buf->f_fsid = uuid_to_fsid(sbi->s_uuid.b); return 0; } |