summaryrefslogtreecommitdiffstats
path: root/fs/notify/fanotify/fanotify_user.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/notify/fanotify/fanotify_user.c')
-rw-r--r--fs/notify/fanotify/fanotify_user.c219
1 files changed, 186 insertions, 33 deletions
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 9e0c1afac8bd..71fefb30e015 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -27,8 +27,61 @@
#include "fanotify.h"
#define FANOTIFY_DEFAULT_MAX_EVENTS 16384
-#define FANOTIFY_DEFAULT_MAX_MARKS 8192
-#define FANOTIFY_DEFAULT_MAX_LISTENERS 128
+#define FANOTIFY_OLD_DEFAULT_MAX_MARKS 8192
+#define FANOTIFY_DEFAULT_MAX_GROUPS 128
+
+/*
+ * Legacy fanotify marks limits (8192) is per group and we introduced a tunable
+ * limit of marks per user, similar to inotify. Effectively, the legacy limit
+ * of fanotify marks per user is <max marks per group> * <max groups per user>.
+ * This default limit (1M) also happens to match the increased limit of inotify
+ * max_user_watches since v5.10.
+ */
+#define FANOTIFY_DEFAULT_MAX_USER_MARKS \
+ (FANOTIFY_OLD_DEFAULT_MAX_MARKS * FANOTIFY_DEFAULT_MAX_GROUPS)
+
+/*
+ * Most of the memory cost of adding an inode mark is pinning the marked inode.
+ * The size of the filesystem inode struct is not uniform across filesystems,
+ * so double the size of a VFS inode is used as a conservative approximation.
+ */
+#define INODE_MARK_COST (2 * sizeof(struct inode))
+
+/* configurable via /proc/sys/fs/fanotify/ */
+static int fanotify_max_queued_events __read_mostly;
+
+#ifdef CONFIG_SYSCTL
+
+#include <linux/sysctl.h>
+
+struct ctl_table fanotify_table[] = {
+ {
+ .procname = "max_user_groups",
+ .data = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS],
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ },
+ {
+ .procname = "max_user_marks",
+ .data = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS],
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ },
+ {
+ .procname = "max_queued_events",
+ .data = &fanotify_max_queued_events,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO
+ },
+ { }
+};
+#endif /* CONFIG_SYSCTL */
/*
* All flags that may be specified in parameter event_f_flags of fanotify_init.
@@ -90,6 +143,23 @@ static int fanotify_event_info_len(unsigned int fid_mode,
}
/*
+ * Remove an hashed event from merge hash table.
+ */
+static void fanotify_unhash_event(struct fsnotify_group *group,
+ struct fanotify_event *event)
+{
+ assert_spin_locked(&group->notification_lock);
+
+ pr_debug("%s: group=%p event=%p bucket=%u\n", __func__,
+ group, event, fanotify_event_hash_bucket(group, event));
+
+ if (WARN_ON_ONCE(hlist_unhashed(&event->merge_list)))
+ return;
+
+ hlist_del_init(&event->merge_list);
+}
+
+/*
* Get an fanotify notification event if one exists and is small
* enough to fit in "count". Return an error pointer if the count
* is not large enough. When permission event is dequeued, its state is
@@ -100,26 +170,34 @@ static struct fanotify_event *get_one_event(struct fsnotify_group *group,
{
size_t event_size = FAN_EVENT_METADATA_LEN;
struct fanotify_event *event = NULL;
+ struct fsnotify_event *fsn_event;
unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
pr_debug("%s: group=%p count=%zd\n", __func__, group, count);
spin_lock(&group->notification_lock);
- if (fsnotify_notify_queue_is_empty(group))
+ fsn_event = fsnotify_peek_first_event(group);
+ if (!fsn_event)
goto out;
- if (fid_mode) {
- event_size += fanotify_event_info_len(fid_mode,
- FANOTIFY_E(fsnotify_peek_first_event(group)));
- }
+ event = FANOTIFY_E(fsn_event);
+ if (fid_mode)
+ event_size += fanotify_event_info_len(fid_mode, event);
if (event_size > count) {
event = ERR_PTR(-EINVAL);
goto out;
}
- event = FANOTIFY_E(fsnotify_remove_first_event(group));
+
+ /*
+ * Held the notification_lock the whole time, so this is the
+ * same event we peeked above.
+ */
+ fsnotify_remove_first_event(group);
if (fanotify_is_perm_event(event->mask))
FANOTIFY_PERM(event)->state = FAN_EVENT_REPORTED;
+ if (fanotify_is_hashed_event(event->mask))
+ fanotify_unhash_event(group, event);
out:
spin_unlock(&group->notification_lock);
return event;
@@ -341,6 +419,14 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
metadata.reserved = 0;
metadata.mask = event->mask & FANOTIFY_OUTGOING_EVENTS;
metadata.pid = pid_vnr(event->pid);
+ /*
+ * For an unprivileged listener, event->pid can be used to identify the
+ * events generated by the listener process itself, without disclosing
+ * the pids of other processes.
+ */
+ if (!capable(CAP_SYS_ADMIN) &&
+ task_tgid(current) != event->pid)
+ metadata.pid = 0;
if (path && path->mnt && path->dentry) {
fd = create_fd(group, path, &f);
@@ -573,6 +659,7 @@ static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t
static int fanotify_release(struct inode *ignored, struct file *file)
{
struct fsnotify_group *group = file->private_data;
+ struct fsnotify_event *fsn_event;
/*
* Stop new events from arriving in the notification queue. since
@@ -601,13 +688,12 @@ static int fanotify_release(struct inode *ignored, struct file *file)
* dequeue them and set the response. They will be freed once the
* response is consumed and fanotify_get_response() returns.
*/
- while (!fsnotify_notify_queue_is_empty(group)) {
- struct fanotify_event *event;
+ while ((fsn_event = fsnotify_remove_first_event(group))) {
+ struct fanotify_event *event = FANOTIFY_E(fsn_event);
- event = FANOTIFY_E(fsnotify_remove_first_event(group));
if (!(event->mask & FANOTIFY_PERM_EVENTS)) {
spin_unlock(&group->notification_lock);
- fsnotify_destroy_event(group, &event->fse);
+ fsnotify_destroy_event(group, fsn_event);
} else {
finish_permission_event(group, FANOTIFY_PERM(event),
FAN_ALLOW);
@@ -822,24 +908,38 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
unsigned int type,
__kernel_fsid_t *fsid)
{
+ struct ucounts *ucounts = group->fanotify_data.ucounts;
struct fsnotify_mark *mark;
int ret;
- if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
+ /*
+ * Enforce per user marks limits per user in all containing user ns.
+ * A group with FAN_UNLIMITED_MARKS does not contribute to mark count
+ * in the limited groups account.
+ */
+ if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS) &&
+ !inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_FANOTIFY_MARKS))
return ERR_PTR(-ENOSPC);
mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
- if (!mark)
- return ERR_PTR(-ENOMEM);
+ if (!mark) {
+ ret = -ENOMEM;
+ goto out_dec_ucounts;
+ }
fsnotify_init_mark(mark, group);
ret = fsnotify_add_mark_locked(mark, connp, type, 0, fsid);
if (ret) {
fsnotify_put_mark(mark);
- return ERR_PTR(ret);
+ goto out_dec_ucounts;
}
return mark;
+
+out_dec_ucounts:
+ if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS))
+ dec_ucount(ucounts, UCOUNT_FANOTIFY_MARKS);
+ return ERR_PTR(ret);
}
@@ -919,20 +1019,41 @@ static struct fsnotify_event *fanotify_alloc_overflow_event(void)
return &oevent->fse;
}
+static struct hlist_head *fanotify_alloc_merge_hash(void)
+{
+ struct hlist_head *hash;
+
+ hash = kmalloc(sizeof(struct hlist_head) << FANOTIFY_HTABLE_BITS,
+ GFP_KERNEL_ACCOUNT);
+ if (!hash)
+ return NULL;
+
+ __hash_init(hash, FANOTIFY_HTABLE_SIZE);
+
+ return hash;
+}
+
/* fanotify syscalls */
SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
{
struct fsnotify_group *group;
int f_flags, fd;
- struct user_struct *user;
unsigned int fid_mode = flags & FANOTIFY_FID_BITS;
unsigned int class = flags & FANOTIFY_CLASS_BITS;
pr_debug("%s: flags=%x event_f_flags=%x\n",
__func__, flags, event_f_flags);
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
+ if (!capable(CAP_SYS_ADMIN)) {
+ /*
+ * An unprivileged user can setup an fanotify group with
+ * limited functionality - an unprivileged group is limited to
+ * notification events with file handles and it cannot use
+ * unlimited queue/marks.
+ */
+ if ((flags & FANOTIFY_ADMIN_INIT_FLAGS) || !fid_mode)
+ return -EPERM;
+ }
#ifdef CONFIG_AUDITSYSCALL
if (flags & ~(FANOTIFY_INIT_FLAGS | FAN_ENABLE_AUDIT))
@@ -963,12 +1084,6 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
if ((fid_mode & FAN_REPORT_NAME) && !(fid_mode & FAN_REPORT_DIR_FID))
return -EINVAL;
- user = get_current_user();
- if (atomic_read(&user->fanotify_listeners) > FANOTIFY_DEFAULT_MAX_LISTENERS) {
- free_uid(user);
- return -EMFILE;
- }
-
f_flags = O_RDWR | FMODE_NONOTIFY;
if (flags & FAN_CLOEXEC)
f_flags |= O_CLOEXEC;
@@ -978,15 +1093,27 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
/* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */
group = fsnotify_alloc_user_group(&fanotify_fsnotify_ops);
if (IS_ERR(group)) {
- free_uid(user);
return PTR_ERR(group);
}
- group->fanotify_data.user = user;
+ /* Enforce groups limits per user in all containing user ns */
+ group->fanotify_data.ucounts = inc_ucount(current_user_ns(),
+ current_euid(),
+ UCOUNT_FANOTIFY_GROUPS);
+ if (!group->fanotify_data.ucounts) {
+ fd = -EMFILE;
+ goto out_destroy_group;
+ }
+
group->fanotify_data.flags = flags;
- atomic_inc(&user->fanotify_listeners);
group->memcg = get_mem_cgroup_from_mm(current->mm);
+ group->fanotify_data.merge_hash = fanotify_alloc_merge_hash();
+ if (!group->fanotify_data.merge_hash) {
+ fd = -ENOMEM;
+ goto out_destroy_group;
+ }
+
group->overflow_event = fanotify_alloc_overflow_event();
if (unlikely(!group->overflow_event)) {
fd = -ENOMEM;
@@ -1019,16 +1146,13 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
goto out_destroy_group;
group->max_events = UINT_MAX;
} else {
- group->max_events = FANOTIFY_DEFAULT_MAX_EVENTS;
+ group->max_events = fanotify_max_queued_events;
}
if (flags & FAN_UNLIMITED_MARKS) {
fd = -EPERM;
if (!capable(CAP_SYS_ADMIN))
goto out_destroy_group;
- group->fanotify_data.max_marks = UINT_MAX;
- } else {
- group->fanotify_data.max_marks = FANOTIFY_DEFAULT_MAX_MARKS;
}
if (flags & FAN_ENABLE_AUDIT) {
@@ -1126,7 +1250,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
__func__, fanotify_fd, flags, dfd, pathname, mask);
/* we only use the lower 32 bits as of right now. */
- if (mask & ((__u64)0xffffffff << 32))
+ if (upper_32_bits(mask))
return -EINVAL;
if (flags & ~FANOTIFY_MARK_FLAGS)
@@ -1181,6 +1305,15 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
group = f.file->private_data;
/*
+ * An unprivileged user is not allowed to watch a mount point nor
+ * a filesystem.
+ */
+ ret = -EPERM;
+ if (!capable(CAP_SYS_ADMIN) &&
+ mark_type != FAN_MARK_INODE)
+ goto fput_and_out;
+
+ /*
* group->priority == FS_PRIO_0 == FAN_CLASS_NOTIF. These are not
* allowed to set permissions events.
*/
@@ -1312,6 +1445,21 @@ SYSCALL32_DEFINE6(fanotify_mark,
*/
static int __init fanotify_user_setup(void)
{
+ struct sysinfo si;
+ int max_marks;
+
+ si_meminfo(&si);
+ /*
+ * Allow up to 1% of addressable memory to be accounted for per user
+ * marks limited to the range [8192, 1048576]. mount and sb marks are
+ * a lot cheaper than inode marks, but there is no reason for a user
+ * to have many of those, so calculate by the cost of inode marks.
+ */
+ max_marks = (((si.totalram - si.totalhigh) / 100) << PAGE_SHIFT) /
+ INODE_MARK_COST;
+ max_marks = clamp(max_marks, FANOTIFY_OLD_DEFAULT_MAX_MARKS,
+ FANOTIFY_DEFAULT_MAX_USER_MARKS);
+
BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 10);
BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9);
@@ -1326,6 +1474,11 @@ static int __init fanotify_user_setup(void)
KMEM_CACHE(fanotify_perm_event, SLAB_PANIC);
}
+ fanotify_max_queued_events = FANOTIFY_DEFAULT_MAX_EVENTS;
+ init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS] =
+ FANOTIFY_DEFAULT_MAX_GROUPS;
+ init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS] = max_marks;
+
return 0;
}
device_initcall(fanotify_user_setup);