Age | Commit message (Collapse) | Author | Files | Lines |
|
This reverts commit 95cde3c59966f6371b6bcd9e4e2da2ba64ee9775.
The commit had good intentions, but it breaks kvm-tool and qemu-kvm.
With it in place, "lkvm run" just fails with
Error: KVM_CREATE_VM ioctl
Warning: Failed init: kvm__init
which isn't a wonderful error message, but bisection pinpointed the
problematic commit.
The problem is almost certainly due to the special kvm debugfs entries
created dynamically by kvm under /sys/kernel/debug/kvm/. See
kvm_create_vm_debugfs()
Bisected-and-reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Wanpeng Li <kernellwp@gmail.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Thomas Richter <tmricht@linux.ibm.com>
Cc: Kees Cook <keescook@chromium.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
|
|
Currently function debugfs_create_dir() creates a new
directory in the debugfs (usually mounted /sys/kernel/debug)
with permission rwxr-xr-x. This is hard coded.
Change this to use the parent directory permission.
Output before the patch:
root@s8360047 ~]# tree -dp -L 1 /sys/kernel/debug/
/sys/kernel/debug/
├── [drwxr-xr-x] bdi
├── [drwxr-xr-x] block
├── [drwxr-xr-x] dasd
├── [drwxr-xr-x] device_component
├── [drwxr-xr-x] extfrag
├── [drwxr-xr-x] hid
├── [drwxr-xr-x] kprobes
├── [drwxr-xr-x] kvm
├── [drwxr-xr-x] memblock
├── [drwxr-xr-x] pm_qos
├── [drwxr-xr-x] qdio
├── [drwxr-xr-x] s390
├── [drwxr-xr-x] s390dbf
└── [drwx------] tracing
14 directories
[root@s8360047 linux]#
Output after the patch:
[root@s8360047 ~]# tree -dp -L 1 /sys/kernel/debug/
sys/kernel/debug/
├── [drwx------] bdi
├── [drwx------] block
├── [drwx------] dasd
├── [drwx------] device_component
├── [drwx------] extfrag
├── [drwx------] hid
├── [drwx------] kprobes
├── [drwx------] kvm
├── [drwx------] memblock
├── [drwx------] pm_qos
├── [drwx------] qdio
├── [drwx------] s390
├── [drwx------] s390dbf
└── [drwx------] tracing
14 directories
[root@s8360047 linux]#
Here is the full diff output done with:
[root@s8360047 ~]# diff -u treefull.before treefull.after |
sed 's-^- # -' > treefull.diff
# --- treefull.before 2018-04-27 13:22:04.532824564 +0200
# +++ treefull.after 2018-04-27 13:24:12.106182062 +0200
# @@ -1,55 +1,55 @@
# /sys/kernel/debug/
# -├── [drwxr-xr-x] bdi
# -│ ├── [drwxr-xr-x] 1:0
# -│ ├── [drwxr-xr-x] 1:1
# -│ ├── [drwxr-xr-x] 1:10
# -│ ├── [drwxr-xr-x] 1:11
# -│ ├── [drwxr-xr-x] 1:12
# -│ ├── [drwxr-xr-x] 1:13
# -│ ├── [drwxr-xr-x] 1:14
# -│ ├── [drwxr-xr-x] 1:15
# -│ ├── [drwxr-xr-x] 1:2
# -│ ├── [drwxr-xr-x] 1:3
# -│ ├── [drwxr-xr-x] 1:4
# -│ ├── [drwxr-xr-x] 1:5
# -│ ├── [drwxr-xr-x] 1:6
# -│ ├── [drwxr-xr-x] 1:7
# -│ ├── [drwxr-xr-x] 1:8
# -│ ├── [drwxr-xr-x] 1:9
# -│ └── [drwxr-xr-x] 94:0
# -├── [drwxr-xr-x] block
# -├── [drwxr-xr-x] dasd
# -│ ├── [drwxr-xr-x] 0.0.e18a
# -│ ├── [drwxr-xr-x] dasda
# -│ └── [drwxr-xr-x] global
# -├── [drwxr-xr-x] device_component
# -├── [drwxr-xr-x] extfrag
# -├── [drwxr-xr-x] hid
# -├── [drwxr-xr-x] kprobes
# -├── [drwxr-xr-x] kvm
# -├── [drwxr-xr-x] memblock
# -├── [drwxr-xr-x] pm_qos
# -├── [drwxr-xr-x] qdio
# -│ └── [drwxr-xr-x] 0.0.f5f2
# -├── [drwxr-xr-x] s390
# -│ └── [drwxr-xr-x] stsi
# -├── [drwxr-xr-x] s390dbf
# -│ ├── [drwxr-xr-x] 0.0.e18a
# -│ ├── [drwxr-xr-x] cio_crw
# -│ ├── [drwxr-xr-x] cio_msg
# -│ ├── [drwxr-xr-x] cio_trace
# -│ ├── [drwxr-xr-x] dasd
# -│ ├── [drwxr-xr-x] kvm-trace
# -│ ├── [drwxr-xr-x] lgr
# -│ ├── [drwxr-xr-x] qdio_0.0.f5f2
# -│ ├── [drwxr-xr-x] qdio_error
# -│ ├── [drwxr-xr-x] qdio_setup
# -│ ├── [drwxr-xr-x] qeth_card_0.0.f5f0
# -│ ├── [drwxr-xr-x] qeth_control
# -│ ├── [drwxr-xr-x] qeth_msg
# -│ ├── [drwxr-xr-x] qeth_setup
# -│ ├── [drwxr-xr-x] vmcp
# -│ └── [drwxr-xr-x] vmur
# +├── [drwx------] bdi
# +│ ├── [drwx------] 1:0
# +│ ├── [drwx------] 1:1
# +│ ├── [drwx------] 1:10
# +│ ├── [drwx------] 1:11
# +│ ├── [drwx------] 1:12
# +│ ├── [drwx------] 1:13
# +│ ├── [drwx------] 1:14
# +│ ├── [drwx------] 1:15
# +│ ├── [drwx------] 1:2
# +│ ├── [drwx------] 1:3
# +│ ├── [drwx------] 1:4
# +│ ├── [drwx------] 1:5
# +│ ├── [drwx------] 1:6
# +│ ├── [drwx------] 1:7
# +│ ├── [drwx------] 1:8
# +│ ├── [drwx------] 1:9
# +│ └── [drwx------] 94:0
# +├── [drwx------] block
# +├── [drwx------] dasd
# +│ ├── [drwx------] 0.0.e18a
# +│ ├── [drwx------] dasda
# +│ └── [drwx------] global
# +├── [drwx------] device_component
# +├── [drwx------] extfrag
# +├── [drwx------] hid
# +├── [drwx------] kprobes
# +├── [drwx------] kvm
# +├── [drwx------] memblock
# +├── [drwx------] pm_qos
# +├── [drwx------] qdio
# +│ └── [drwx------] 0.0.f5f2
# +├── [drwx------] s390
# +│ └── [drwx------] stsi
# +├── [drwx------] s390dbf
# +│ ├── [drwx------] 0.0.e18a
# +│ ├── [drwx------] cio_crw
# +│ ├── [drwx------] cio_msg
# +│ ├── [drwx------] cio_trace
# +│ ├── [drwx------] dasd
# +│ ├── [drwx------] kvm-trace
# +│ ├── [drwx------] lgr
# +│ ├── [drwx------] qdio_0.0.f5f2
# +│ ├── [drwx------] qdio_error
# +│ ├── [drwx------] qdio_setup
# +│ ├── [drwx------] qeth_card_0.0.f5f0
# +│ ├── [drwx------] qeth_control
# +│ ├── [drwx------] qeth_msg
# +│ ├── [drwx------] qeth_setup
# +│ ├── [drwx------] vmcp
# +│ └── [drwx------] vmur
# └── [drwx------] tracing
# ├── [drwxr-xr-x] events
# │ ├── [drwxr-xr-x] alarmtimer
Fixes: edac65eaf8d5c ("debugfs: take mode-dependent parts of debugfs_get_inode() into callers")
Signed-off-by: Thomas Richter <tmricht@linux.ibm.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
|
|
Re-use kstrtobool_from_user() instead of open coded variant.
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
|
|
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
|
|
This is the mindless scripted replacement of kernel use of POLL*
variables as described by Al, done by this script:
for V in IN OUT PRI ERR RDNORM RDBAND WRNORM WRBAND HUP RDHUP NVAL MSG; do
L=`git grep -l -w POLL$V | grep -v '^t' | grep -v /um/ | grep -v '^sa' | grep -v '/poll.h$'|grep -v '^D'`
for f in $L; do sed -i "-es/^\([^\"]*\)\(\<POLL$V\>\)/\\1E\\2/" $f; done
done
with de-mangling cleanups yet to come.
NOTE! On almost all architectures, the EPOLL* constants have the same
values as the POLL* constants do. But they keyword here is "almost".
For various bad reasons they aren't the same, and epoll() doesn't
actually work quite correctly in some cases due to this on Sparc et al.
The next patch from Al will sort out the final differences, and we
should be all done.
Scripted-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
|
|
The only place that has any business including asm/poll.h
is linux/poll.h. Fortunately, asm/poll.h had only been
included in 3 places beyond that one, and all of them
are trivial to switch to using linux/poll.h.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
|
|
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
|
|
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
|
|
Now that the SPDX tag is in all debugfs files, that identifies the
license in a specific and legally-defined manner. So the extra GPL text
wording can be removed as it is no longer needed at all.
This is done on a quest to remove the 700+ different ways that files in
the kernel describe the GPL license text. And there's unneeded stuff
like the address (sometimes incorrect) for the FSF which is never
needed.
No copyright headers or other non-license-description text was removed.
Cc: Nicolai Stange <nicstange@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
|
|
It's good to have SPDX identifiers in all files to make it easier to
audit the kernel tree for correct licenses.
Update the debugfs files files with the correct SPDX license identifier
based on the license text in the file itself. The SPDX identifier is a
legally binding shorthand, which can be used instead of the full boiler
plate text.
This work is based on a script and data from Thomas Gleixner, Philippe
Ombredanne, and Kate Stewart.
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Kate Stewart <kstewart@linuxfoundation.org>
Cc: Philippe Ombredanne <pombredanne@nexb.com>
Cc: Nicolai Stange <nicstange@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
|
|
Currently, __debugfs_create_file allocates one struct debugfs_fsdata
instance for every file created. However, there are potentially many
debugfs file around, most of which are never touched by userspace.
Thus, defer the allocations to the first usage, i.e. to the first
debugfs_file_get().
A dentry's ->d_fsdata starts out to point to the "real", user provided
fops. After a debugfs_fsdata instance has been allocated (and the real
fops pointer has been moved over into its ->real_fops member),
->d_fsdata is changed to point to it from then on. The two cases are
distinguished by setting BIT(0) for the real fops case.
struct debugfs_fsdata's foremost purpose is to track active users and to
make debugfs_remove() block until they are done. Since no debugfs_fsdata
instance means no active users, make debugfs_remove() return immediately
in this case.
Take care of possible races between debugfs_file_get() and
debugfs_remove(): either debugfs_remove() must see a debugfs_fsdata
instance and thus wait for possible active users or debugfs_file_get() must
see a dead dentry and return immediately.
Make a dentry's ->d_release(), i.e. debugfs_release_dentry(), check whether
->d_fsdata is actually a debugfs_fsdata instance before kfree()ing it.
Similarly, make debugfs_real_fops() check whether ->d_fsdata is actually
a debugfs_fsdata instance before returning it, otherwise emit a warning.
The set of possible error codes returned from debugfs_file_get() has grown
from -EIO to -EIO and -ENOMEM. Make open_proxy_open() and full_proxy_open()
pass the -ENOMEM onwards to their callers.
Signed-off-by: Nicolai Stange <nicstange@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
|
|
The current implementation of debugfs_real_fops() relies on a
debugfs_fsdata instance to be installed at ->d_fsdata.
With future patches introducing lazy allocation of these, this requirement
will be guaranteed to be fullfilled only inbetween a
debugfs_file_get()/debugfs_file_put() pair.
The full proxies' fops implemented by debugfs happen to be the only
offenders. Fix them up by moving their debugfs_real_fops() calls past those
to debugfs_file_get().
full_proxy_release() is special as it doesn't invoke debugfs_file_get() at
all. Leave it alone for now.
Signed-off-by: Nicolai Stange <nicstange@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
|
|
Purge the SRCU based file removal race protection in favour of the new,
refcount based debugfs_file_get()/debugfs_file_put() API.
Fixes: 49d200deaa68 ("debugfs: prevent access to removed files' private data")
Signed-off-by: Nicolai Stange <nicstange@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
|
|
Convert all calls to the now obsolete debugfs_use_file_start() and
debugfs_use_file_finish() from the debugfs core itself to the new
debugfs_file_get() and debugfs_file_put() API.
Fixes: 49d200deaa68 ("debugfs: prevent access to removed files' private data")
Signed-off-by: Nicolai Stange <nicstange@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
|
|
Currently, debugfs_real_fops() is annotated with a
__must_hold(&debugfs_srcu) sparse annotation.
With the conversion of the SRCU based protection of users against
concurrent file removals to a per-file refcount based scheme, this becomes
wrong.
Drop this annotation.
Signed-off-by: Nicolai Stange <nicstange@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
|
|
Since commit 49d200deaa68 ("debugfs: prevent access to removed files'
private data"), accesses to a file's private data are protected from
concurrent removal by covering all file_operations with a SRCU read section
and sychronizing with those before returning from debugfs_remove() by means
of synchronize_srcu().
As pointed out by Johannes Berg, there are debugfs files with forever
blocking file_operations. Their corresponding SRCU read side sections would
block any debugfs_remove() forever as well, even unrelated ones. This
results in a livelock. Because a remover can't cancel any indefinite
blocking within foreign files, this is a problem.
Resolve this by introducing support for more granular protection on a
per-file basis.
This is implemented by introducing an 'active_users' refcount_t to the
per-file struct debugfs_fsdata state. At file creation time, it is set to
one and a debugfs_remove() will drop that initial reference. The new
debugfs_file_get() and debugfs_file_put(), intended to be used in place of
former debugfs_use_file_start() and debugfs_use_file_finish(), increment
and decrement it respectively. Once the count drops to zero,
debugfs_file_put() will signal a completion which is possibly being waited
for from debugfs_remove().
Thus, as long as there is a debugfs_file_get() not yet matched by a
corresponding debugfs_file_put() around, debugfs_remove() will block.
Actual users of debugfs_use_file_start() and -finish() will get converted
to the new debugfs_file_get() and debugfs_file_put() by followup patches.
Fixes: 49d200deaa68 ("debugfs: prevent access to removed files' private data")
Reported-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: Nicolai Stange <nicstange@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
|
|
Currently, the user provided fops, "real_fops", are stored directly into
->d_fsdata.
In order to be able to store more per-file state and thus prepare for more
granular file removal protection, wrap the real_fops into a dynamically
allocated container struct, debugfs_fsdata.
A struct debugfs_fsdata gets allocated at file creation and freed from the
newly intoduced ->d_release().
Finally, move the implementation of debugfs_real_fops() out of the public
debugfs header such that struct debugfs_fsdata's declaration can be kept
private.
Signed-off-by: Nicolai Stange <nicstange@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
|
|
git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs
Pull ->s_options removal from Al Viro:
"Preparations for fsmount/fsopen stuff (coming next cycle). Everything
gets moved to explicit ->show_options(), killing ->s_options off +
some cosmetic bits around fs/namespace.c and friends. Basically, the
stuff needed to work with fsmount series with minimum of conflicts
with other work.
It's not strictly required for this merge window, but it would reduce
the PITA during the coming cycle, so it would be nice to have those
bits and pieces out of the way"
* 'work.mount' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs:
isofs: Fix isofs_show_options()
VFS: Kill off s_options and helpers
orangefs: Implement show_options
9p: Implement show_options
isofs: Implement show_options
afs: Implement show_options
affs: Implement show_options
befs: Implement show_options
spufs: Implement show_options
bpf: Implement show_options
ramfs: Implement show_options
pstore: Implement show_options
omfs: Implement show_options
hugetlbfs: Implement show_options
VFS: Don't use save/replace_mount_options if not using generic_show_options
VFS: Provide empty name qstr
VFS: Make get_filesystem() return the affected filesystem
VFS: Clean up whitespace in fs/namespace.c and fs/super.c
Provide a function to create a NUL-terminated string from unterminated data
|
|
git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs
Pull misc filesystem updates from Al Viro:
"Assorted normal VFS / filesystems stuff..."
* 'work.misc' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs:
dentry name snapshots
Make statfs properly return read-only state after emergency remount
fs/dcache: init in_lookup_hashtable
minix: Deinline get_block, save 2691 bytes
fs: Reorder inode_owner_or_capable() to avoid needless
fs: warn in case userspace lied about modprobe return
|
|
take_dentry_name_snapshot() takes a safe snapshot of dentry name;
if the name is a short one, it gets copied into caller-supplied
structure, otherwise an extra reference to external name is grabbed
(those are never modified). In either case the pointer to stable
string is stored into the same structure.
dentry must be held by the caller of take_dentry_name_snapshot(),
but may be freely dropped afterwards - the snapshot will stay
until destroyed by release_dentry_name_snapshot().
Intended use:
struct name_snapshot s;
take_dentry_name_snapshot(&s, dentry);
...
access s.name
...
release_dentry_name_snapshot(&s);
Replaces fsnotify_oldname_...(), gets used in fsnotify to obtain the name
to pass down with event.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
|
|
btrfs, debugfs, reiserfs and tracefs call save_mount_options() and reiserfs
calls replace_mount_options(), but they then implement their own
->show_options() methods and don't touch s_options, rendering the saved
options unnecessary. I'm trying to eliminate s_options to make it easier
to implement a context-based mount where the mount options can be passed
individually over a file descriptor.
Remove the calls to save/replace_mount_options() call in these cases.
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Chris Mason <clm@fb.com>
cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
cc: Steven Rostedt <rostedt@goodmis.org>
cc: linux-btrfs@vger.kernel.org
cc: reiserfs-devel@vger.kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
|
|
The kernel-api book is now part of the core-api. Update its
location.
Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com>
|
|
The filesystem documentation was moved from DocBook to
Documentation/filesystems/. Update it at the sources.
Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com>
|
|
simple_fill_super() is passed an array of tree_descr structures which
describe the files to create in the filesystem's root directory. Since
these arrays are never modified intentionally, they should be 'const' so
that they are placed in .rodata and benefit from memory protection.
This patch updates the function signature and all users, and also
constifies tree_descr.name.
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
|
|
git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace
Pull namespace updates from Eric Biederman:
"There is a lot here. A lot of these changes result in subtle user
visible differences in kernel behavior. I don't expect anything will
care but I will revert/fix things immediately if any regressions show
up.
From Seth Forshee there is a continuation of the work to make the vfs
ready for unpriviled mounts. We had thought the previous changes
prevented the creation of files outside of s_user_ns of a filesystem,
but it turns we missed the O_CREAT path. Ooops.
Pavel Tikhomirov and Oleg Nesterov worked together to fix a long
standing bug in the implemenation of PR_SET_CHILD_SUBREAPER where only
children that are forked after the prctl are considered and not
children forked before the prctl. The only known user of this prctl
systemd forks all children after the prctl. So no userspace
regressions will occur. Holding earlier forked children to the same
rules as later forked children creates a semantic that is sane enough
to allow checkpoing of processes that use this feature.
There is a long delayed change by Nikolay Borisov to limit inotify
instances inside a user namespace.
Michael Kerrisk extends the API for files used to maniuplate
namespaces with two new trivial ioctls to allow discovery of the
hierachy and properties of namespaces.
Konstantin Khlebnikov with the help of Al Viro adds code that when a
network namespace exits purges it's sysctl entries from the dcache. As
in some circumstances this could use a lot of memory.
Vivek Goyal fixed a bug with stacked filesystems where the permissions
on the wrong inode were being checked.
I continue previous work on ptracing across exec. Allowing a file to
be setuid across exec while being ptraced if the tracer has enough
credentials in the user namespace, and if the process has CAP_SETUID
in it's own namespace. Proc files for setuid or otherwise undumpable
executables are now owned by the root in the user namespace of their
mm. Allowing debugging of setuid applications in containers to work
better.
A bug I introduced with permission checking and automount is now
fixed. The big change is to mark the mounts that the kernel initiates
as a result of an automount. This allows the permission checks in sget
to be safely suppressed for this kind of mount. As the permission
check happened when the original filesystem was mounted.
Finally a special case in the mount namespace is removed preventing
unbounded chains in the mount hash table, and making the semantics
simpler which benefits CRIU.
The vfs fix along with related work in ima and evm I believe makes us
ready to finish developing and merge fully unprivileged mounts of the
fuse filesystem. The cleanups of the mount namespace makes discussing
how to fix the worst case complexity of umount. The stacked filesystem
fixes pave the way for adding multiple mappings for the filesystem
uids so that efficient and safer containers can be implemented"
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace:
proc/sysctl: Don't grab i_lock under sysctl_lock.
vfs: Use upper filesystem inode in bprm_fill_uid()
proc/sysctl: prune stale dentries during unregistering
mnt: Tuck mounts under others instead of creating shadow/side mounts.
prctl: propagate has_child_subreaper flag to every descendant
introduce the walk_process_tree() helper
nsfs: Add an ioctl() to return owner UID of a userns
fs: Better permission checking for submounts
exit: fix the setns() && PR_SET_CHILD_SUBREAPER interaction
vfs: open() with O_CREAT should not create inodes with unknown ids
nsfs: Add an ioctl() to return the namespace type
proc: Better ownership of files for non-dumpable tasks in user namespaces
exec: Remove LSM_UNSAFE_PTRACE_CAP
exec: Test the ptracer's saved cred to see if the tracee can gain caps
exec: Don't reset euid and egid when the tracee has CAP_SETUID
inotify: Convert to using per-namespace limits
|
|
We don't always have easy access to the dentry of a file or directory we
created in debugfs. Add a helper which allows us to get a dentry we
previously created.
The motivation for this change is a problem with blktrace and the blk-mq
debugfs entries introduced in 07e4fead45e6 ("blk-mq: create debugfs
directory tree"). Namely, in some cases, the directory that blktrace
needs to create may already exist, but in other cases, it may not. We
_could_ rely on a bunch of implied knowledge to decide whether to create
the directory or not, but it's much cleaner on our end to just look it
up.
Signed-off-by: Omar Sandoval <osandov@fb.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Jens Axboe <axboe@fb.com>
|
|
To support unprivileged users mounting filesystems two permission
checks have to be performed: a test to see if the user allowed to
create a mount in the mount namespace, and a test to see if
the user is allowed to access the specified filesystem.
The automount case is special in that mounting the original filesystem
grants permission to mount the sub-filesystems, to any user who
happens to stumble across the their mountpoint and satisfies the
ordinary filesystem permission checks.
Attempting to handle the automount case by using override_creds
almost works. It preserves the idea that permission to mount
the original filesystem is permission to mount the sub-filesystem.
Unfortunately using override_creds messes up the filesystems
ordinary permission checks.
Solve this by being explicit that a mount is a submount by introducing
vfs_submount, and using it where appropriate.
vfs_submount uses a new mount internal mount flags MS_SUBMOUNT, to let
sget and friends know that a mount is a submount so they can take appropriate
action.
sget and sget_userns are modified to not perform any permission checks
on submounts.
follow_automount is modified to stop using override_creds as that
has proven problemantic.
do_mount is modified to always remove the new MS_SUBMOUNT flag so
that we know userspace will never by able to specify it.
autofs4 is modified to stop using current_real_cred that was put in
there to handle the previous version of submount permission checking.
cifs is modified to pass the mountpoint all of the way down to vfs_submount.
debugfs is modified to pass the mountpoint all of the way down to
trace_automount by adding a new parameter. To make this change easier
a new typedef debugfs_automount_t is introduced to capture the type of
the debugfs automount function.
Cc: stable@vger.kernel.org
Fixes: 069d5ac9ae0d ("autofs: Fix automounts by using current_real_cred()->uid")
Fixes: aeaa4a79ff6a ("fs: Call d_automount with the filesystems creds")
Reviewed-by: Trond Myklebust <trond.myklebust@primarydata.com>
Reviewed-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
|
|
git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs
Pull more vfs updates from Al Viro:
">rename2() work from Miklos + current_time() from Deepa"
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs:
fs: Replace current_fs_time() with current_time()
fs: Replace CURRENT_TIME_SEC with current_time() for inode timestamps
fs: Replace CURRENT_TIME with current_time() for inode timestamps
fs: proc: Delete inode time initializations in proc_alloc_inode()
vfs: Add current_time() api
vfs: add note about i_op->rename changes to porting
fs: rename "rename2" i_op to "rename"
vfs: remove unused i_op->rename
fs: make remaining filesystems use .rename2
libfs: support RENAME_NOREPLACE in simple_rename()
fs: support RENAME_NOREPLACE for local filesystems
ncpfs: fix unused variable warning
|
|
|
|
current_fs_time() uses struct super_block* as an argument.
As per Linus's suggestion, this is changed to take struct
inode* as a parameter instead. This is because the function
is primarily meant for vfs inode timestamps.
Also the function was renamed as per Arnd's suggestion.
Change all calls to current_fs_time() to use the new
current_time() function instead. current_fs_time() will be
deleted.
Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
|
|
The result was being ignored and 0 was always returned.
Return the actual result instead.
Signed-off-by: Eric Engestrom <eric.engestrom@imgtec.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
|
|
This is trivial to do:
- add flags argument to simple_rename()
- check if flags doesn't have any other than RENAME_NOREPLACE
- assign simple_rename() to .rename2 instead of .rename
Filesystems converted:
hugetlbfs, ramfs, bpf.
Debugfs uses simple_rename() to implement debugfs_rename(), which is for
debugfs instances to rename files internally, not for userspace filesystem
access. For this case pass zero flags to simple_rename().
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Alexei Starovoitov <ast@kernel.org>
|
|
This patch introduces an accessor which can be used
by the users of debugfs (drivers, fs, ...) to get the
original file_operations struct. It also removes the
REAL_FOPS_DEREF macro in file.c and converts the code
to use the public version.
Previously, REAL_FOPS_DEREF was only available within
the file.c of debugfs. But having a public getter
available for debugfs users is important as some
drivers (carl9170 and b43) use the pointer of the
original file_operations in conjunction with container_of()
within their debugfs implementations.
Reviewed-by: Nicolai Stange <nicstange@gmail.com>
Signed-off-by: Christian Lamparter <chunkeey@gmail.com>
Cc: stable <stable@vger.kernel.org> # 4.7+
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
|
|
debugfs_create_file_unsafe() is declared twice in exactly the same
manner each: once in fs/debugfs/internal.h and once in
include/linux/debugfs.h
All files that include the former also include the latter and thus,
the declaration in fs/debugfs/internal.h is superfluous.
Remove it.
Signed-off-by: Nicolai Stange <nicstange@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
|
|
git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/vfs into work.misc
|
|
Debugfs' open_proxy_open(), the ->open() installed at all inodes created
through debugfs_create_file_unsafe(),
- grabs a reference to the original file_operations instance passed to
debugfs_create_file_unsafe() via fops_get(),
- installs it at the file's ->f_op by means of replace_fops()
- and calls fops_put() on it.
Since the semantics of replace_fops() are such that the reference's
ownership is transferred, the subsequent fops_put() will result in a double
release when the file is eventually closed.
Currently, this is not an issue since fops_put() basically does a
module_put() on the file_operations' ->owner only and there don't exist any
modules calling debugfs_create_file_unsafe() yet. This is expected to
change in the future though, c.f. commit c64688081490 ("debugfs: add
support for self-protecting attribute file fops").
Remove the call to fops_put() from open_proxy_open().
Fixes: 9fd4dcece43a ("debugfs: prevent access to possibly dead
file_operations at file open")
Signed-off-by: Nicolai Stange <nicstange@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
|
|
Debugfs' full_proxy_open(), the ->open() installed at all inodes created
through debugfs_create_file(),
- grabs a reference to the original struct file_operations instance passed
to debugfs_create_file(),
- dynamically allocates a proxy struct file_operations instance wrapping
the original
- and installs this at the file's ->f_op.
Afterwards, it calls the original ->open() and passes its return value back
to the VFS layer.
Now, if that return value indicates failure, the VFS layer won't ever call
->release() and thus, neither the reference to the original file_operations
nor the memory for the proxy file_operations will get released, i.e. both
are leaked.
Upon failure of the original fops' ->open(), undo the proxy installation.
That is:
- Set the struct file ->f_op to what it had been when full_proxy_open()
was entered.
- Drop the reference to the original file_operations.
- Free the memory holding the proxy file_operations.
Fixes: 49d200deaa68 ("debugfs: prevent access to removed files' private
data")
Signed-off-by: Nicolai Stange <nicstange@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
|
|
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
|
|
We want those fixes in here as well.
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
|
|
Starting with 4.1 the tracing subsystem has its own filesystem
which is automounted in the tracing subdirectory of debugfs.
Prior to this debugfs could be bind mounted in a cloned mount
namespace, but if tracefs has been mounted under debugfs this
now fails because there is a locked child mount. This creates
a regression for container software which bind mounts debugfs
to satisfy the assumption of some userspace software.
In other pseudo filesystems such as proc and sysfs we're already
creating mountpoints like this in such a way that no dirents can
be created in the directories, allowing them to be exceptions to
some MNT_LOCKED tests. In fact we're already do this for the
tracefs mountpoint in sysfs.
Do the same in debugfs_create_automount(), since the intention
here is clearly to create a mountpoint. This fixes the regression,
as locked child mounts on permanently empty directories do not
cause a bind mount to fail.
Cc: stable@vger.kernel.org # v4.1+
Signed-off-by: Seth Forshee <seth.forshee@canonical.com>
Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
|
|
The struct file_operations u32_array_fops associated with files created
through debugfs_create_u32_array() has been lifetime aware already:
everything needed for subsequent operation is copied to a ->f_private
buffer at file opening time in u32_array_open(). Now, ->open() is always
protected against file removal issues by the debugfs core.
There is no need for the debugfs core to wrap the u32_array_fops
with a file lifetime managing proxy.
Make debugfs_create_u32_array() create its files in non-proxying operation
mode by means of debugfs_create_file_unsafe().
Signed-off-by: Nicolai Stange <nicstange@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
|
|
Currently, the struct file_operations fops_blob associated with files
created through the debugfs_create_blob() helpers are not file
lifetime aware.
Thus, a lifetime managing proxy is created around fops_blob each time such
a file is opened which is an unnecessary waste of resources.
Implement file lifetime management for the fops_bool file_operations.
Namely, make read_file_blob() safe gainst file removals by means of
debugfs_use_file_start() and debugfs_use_file_finish().
Make debugfs_create_blob() create its files in non-proxying operation mode
by means of debugfs_create_file_unsafe().
Signed-off-by: Nicolai Stange <nicstange@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
|
|
Currently, the struct file_operations fops_bool associated with files
created through the debugfs_create_bool() helpers are not file
lifetime aware.
Thus, a lifetime managing proxy is created around fops_bool each time such
a file is opened which is an unnecessary waste of resources.
Implement file lifetime management for the fops_bool file_operations.
Namely, make debugfs_read_file_bool() and debugfs_write_file_bool() safe
against file removals by means of debugfs_use_file_start() and
debugfs_use_file_finish().
Make debugfs_create_bool() create its files in non-proxying operation mode
through debugfs_create_mode_unsafe().
Finally, purge debugfs_create_mode() as debugfs_create_bool() had been its
last user.
Signed-off-by: Nicolai Stange <nicstange@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
|
|
Currently, the struct file_operations associated with the integer attribute
style files created through the debugfs_create_*() helpers are not file
lifetime aware as they are defined by means of DEFINE_SIMPLE_ATTRIBUTE().
Thus, a lifetime managing proxy is created around the original fops each
time such a file is opened which is an unnecessary waste of resources.
Migrate all usages of DEFINE_SIMPLE_ATTRIBUTE() within debugfs itself
to DEFINE_DEBUGFS_ATTRIBUTE() in order to implement file lifetime managing
within the struct file_operations thus defined.
Introduce the debugfs_create_mode_unsafe() helper, analogous to
debugfs_create_mode(), but distinct in that it creates the files in
non-proxying operation mode through debugfs_create_file_unsafe().
Feed all struct file_operations migrated to DEFINE_DEBUGFS_ATTRIBUTE()
into debugfs_create_mode_unsafe() instead of former debugfs_create_mode().
Signed-off-by: Nicolai Stange <nicstange@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
|
|
In order to protect them against file removal issues, debugfs_create_file()
creates a lifetime managing proxy around each struct file_operations
handed in.
In cases where this struct file_operations is able to manage file lifetime
by itself already, the proxy created by debugfs is a waste of resources.
The most common class of struct file_operations given to debugfs are those
defined by means of the DEFINE_SIMPLE_ATTRIBUTE() macro.
Introduce a DEFINE_DEBUGFS_ATTRIBUTE() macro to allow any
struct file_operations of this class to be easily made file lifetime aware
and thus, to be operated unproxied.
Specifically, introduce debugfs_attr_read() and debugfs_attr_write()
which wrap simple_attr_read() and simple_attr_write() under the protection
of a debugfs_use_file_start()/debugfs_use_file_finish() pair.
Make DEFINE_DEBUGFS_ATTRIBUTE() set the defined struct file_operations'
->read() and ->write() members to these wrappers.
Export debugfs_create_file_unsafe() in order to allow debugfs users to
create their files in non-proxying operation mode.
Signed-off-by: Nicolai Stange <nicstange@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
|
|
Upon return of debugfs_remove()/debugfs_remove_recursive(), it might
still be attempted to access associated private file data through
previously opened struct file objects. If that data has been freed by
the caller of debugfs_remove*() in the meanwhile, the reading/writing
process would either encounter a fault or, if the memory address in
question has been reassigned again, unrelated data structures could get
overwritten.
However, since debugfs files are seldomly removed, usually from module
exit handlers only, the impact is very low.
Currently, there are ~1000 call sites of debugfs_create_file() spread
throughout the whole tree and touching all of those struct file_operations
in order to make them file removal aware by means of checking the result of
debugfs_use_file_start() from within their methods is unfeasible.
Instead, wrap the struct file_operations by a lifetime managing proxy at
file open:
- In debugfs_create_file(), the original fops handed in has got stashed
away in ->d_fsdata already.
- In debugfs_create_file(), install a proxy file_operations factory,
debugfs_full_proxy_file_operations, at ->i_fop.
This proxy factory has got an ->open() method only. It carries out some
lifetime checks and if successful, dynamically allocates and sets up a new
struct file_operations proxy at ->f_op. Afterwards, it forwards to the
->open() of the original struct file_operations in ->d_fsdata, if any.
The dynamically set up proxy at ->f_op has got a lifetime managing wrapper
set for each of the methods defined in the original struct file_operations
in ->d_fsdata.
Its ->release()er frees the proxy again and forwards to the original
->release(), if any.
In order not to mislead the VFS layer, it is strictly necessary to leave
those fields blank in the proxy that have been NULL in the original
struct file_operations also, i.e. aren't supported. This is why there is a
need for dynamically allocated proxies. The choice made not to allocate a
proxy instance for every dentry at file creation, but for every
struct file object instantiated thereof is justified by the expected usage
pattern of debugfs, namely that in general very few files get opened more
than once at a time.
The wrapper methods set in the struct file_operations implement lifetime
managing by means of the SRCU protection facilities already in place for
debugfs:
They set up a SRCU read side critical section and check whether the dentry
is still alive by means of debugfs_use_file_start(). If so, they forward
the call to the original struct file_operation stored in ->d_fsdata, still
under the protection of the SRCU read side critical section.
This SRCU read side critical section prevents any pending debugfs_remove()
and friends to return to their callers. Since a file's private data must
only be freed after the return of debugfs_remove(), the ongoing proxied
call is guarded against any file removal race.
If, on the other hand, the initial call to debugfs_use_file_start() detects
that the dentry is dead, the wrapper simply returns -EIO and does not
forward the call. Note that the ->poll() wrapper is special in that its
signature does not allow for the return of arbitrary -EXXX values and thus,
POLLHUP is returned here.
In order not to pollute debugfs with wrapper definitions that aren't ever
needed, I chose not to define a wrapper for every struct file_operations
method possible. Instead, a wrapper is defined only for the subset of
methods which are actually set by any debugfs users.
Currently, these are:
->llseek()
->read()
->write()
->unlocked_ioctl()
->poll()
The ->release() wrapper is special in that it does not protect the original
->release() in any way from dead files in order not to leak resources.
Thus, any ->release() handed to debugfs must implement file lifetime
management manually, if needed.
For only 33 out of a total of 434 releasers handed in to debugfs, it could
not be verified immediately whether they access data structures that might
have been freed upon a debugfs_remove() return in the meanwhile.
Export debugfs_use_file_start() and debugfs_use_file_finish() in order to
allow any ->release() to manually implement file lifetime management.
For a set of common cases of struct file_operations implemented by the
debugfs_core itself, future patches will incorporate file lifetime
management directly within those in order to allow for their unproxied
operation. Rename the original, non-proxying "debugfs_create_file()" to
"debugfs_create_file_unsafe()" and keep it for future internal use by
debugfs itself. Factor out code common to both into the new
__debugfs_create_file().
Signed-off-by: Nicolai Stange <nicstange@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
|
|
Nothing prevents a dentry found by path lookup before a return of
__debugfs_remove() to actually get opened after that return. Now, after
the return of __debugfs_remove(), there are no guarantees whatsoever
regarding the memory the corresponding inode's file_operations object
had been kept in.
Since __debugfs_remove() is seldomly invoked, usually from module exit
handlers only, the race is hard to trigger and the impact is very low.
A discussion of the problem outlined above as well as a suggested
solution can be found in the (sub-)thread rooted at
http://lkml.kernel.org/g/20130401203445.GA20862@ZenIV.linux.org.uk
("Yet another pipe related oops.")
Basically, Greg KH suggests to introduce an intermediate fops and
Al Viro points out that a pointer to the original ones may be stored in
->d_fsdata.
Follow this line of reasoning:
- Add SRCU as a reverse dependency of DEBUG_FS.
- Introduce a srcu_struct object for the debugfs subsystem.
- In debugfs_create_file(), store a pointer to the original
file_operations object in ->d_fsdata.
- Make debugfs_remove() and debugfs_remove_recursive() wait for a
SRCU grace period after the dentry has been delete()'d and before they
return to their callers.
- Introduce an intermediate file_operations object named
"debugfs_open_proxy_file_operations". It's ->open() functions checks,
under the protection of a SRCU read lock, whether the dentry is still
alive, i.e. has not been d_delete()'d and if so, tries to acquire a
reference on the owning module.
On success, it sets the file object's ->f_op to the original
file_operations and forwards the ongoing open() call to the original
->open().
- For clarity, rename the former debugfs_file_operations to
debugfs_noop_file_operations -- they are in no way canonical.
The choice of SRCU over "normal" RCU is justified by the fact, that the
former may also be used to protect ->i_private data from going away
during the execution of a file's readers and writers which may (and do)
sleep.
Finally, introduce the fs/debugfs/internal.h header containing some
declarations internal to the debugfs implementation.
Signed-off-by: Nicolai Stange <nicstange@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
|
|
CURRENT_TIME macro is not appropriate for filesystems as it
doesn't use the right granularity for filesystem timestamps.
Use current_fs_time() instead.
Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
|
|
Directory inodes should start off with i_nlink == 2 (one extra ref
for "." entry). debugfs_create_automount() increases neither the
i_nlink reference for current inode nor for parent inode.
On attempt to remove the automount dentry, kernel complains:
[ 86.288070] WARNING: CPU: 1 PID: 3616 at fs/inode.c:273 drop_nlink+0x3e/0x50()
[ 86.288461] Modules linked in: debugfs_example2(O-)
[ 86.288745] CPU: 1 PID: 3616 Comm: rmmod Tainted: G O 4.4.0-rc3-next-20151207+ #135
[ 86.289197] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.8.2-20150617_082717-anatol 04/01/2014
[ 86.289696] ffffffff81be05c9 ffff8800b9e6fda0 ffffffff81352e2c 0000000000000000
[ 86.290110] ffff8800b9e6fdd8 ffffffff81065142 ffff8801399175e8 ffff8800bb78b240
[ 86.290507] ffff8801399175e8 ffff8800b73d7898 ffff8800b73d7840 ffff8800b9e6fde8
[ 86.290933] Call Trace:
[ 86.291080] [<ffffffff81352e2c>] dump_stack+0x4e/0x82
[ 86.291340] [<ffffffff81065142>] warn_slowpath_common+0x82/0xc0
[ 86.291640] [<ffffffff8106523a>] warn_slowpath_null+0x1a/0x20
[ 86.291932] [<ffffffff811ae62e>] drop_nlink+0x3e/0x50
[ 86.292208] [<ffffffff811ba35b>] simple_unlink+0x4b/0x60
[ 86.292481] [<ffffffff811ba3a7>] simple_rmdir+0x37/0x50
[ 86.292748] [<ffffffff812d9808>] __debugfs_remove.part.16+0xa8/0xd0
[ 86.293082] [<ffffffff812d9a0b>] debugfs_remove_recursive+0xdb/0x1c0
[ 86.293406] [<ffffffffa00004dd>] cleanup_module+0x2d/0x3b [debugfs_example2]
[ 86.293762] [<ffffffff810d959b>] SyS_delete_module+0x16b/0x220
[ 86.294077] [<ffffffff818ef857>] entry_SYSCALL_64_fastpath+0x12/0x6a
[ 86.294405] ---[ end trace c9fc53353fe14a36 ]---
[ 86.294639] ------------[ cut here ]------------
To reproduce the issue it is enough to invoke these lines:
autom = debugfs_create_automount("automount", NULL, vfsmount_cb, data);
BUG_ON(IS_ERR_OR_NULL(autom));
debugfs_remove(autom);
The issue is fixed by increasing inode i_nlink references for current
and parent inodes.
Signed-off-by: Roman Pen <r.peniaev@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
|
|
parallel to mutex_{lock,unlock,trylock,is_locked,lock_nested},
inode_foo(inode) being mutex_foo(&inode->i_mutex).
Please, use those for access to ->i_mutex; over the coming cycle
->i_mutex will become rwsem, with ->lookup() done with it held
only shared.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
|