From 9f2394c9be47a754bae9e4b6d382bdd4d77d0a11 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 10 Apr 2016 16:52:24 -0700 Subject: Revert "ext4: allow readdir()'s of large empty directories to be interrupted" This reverts commit 1028b55bafb7611dda1d8fed2aeca16a436b7dff. It's broken: it makes ext4 return an error at an invalid point, causing the readdir wrappers to write the the position of the last successful directory entry into the position field, which means that the next readdir will now return that last successful entry _again_. You can only return fatal errors (that terminate the readdir directory walk) from within the filesystem readdir functions, the "normal" errors (that happen when the readdir buffer fills up, for example) happen in the iterorator where we know the position of the actual failing entry. I do have a very different patch that does the "signal_pending()" handling inside the iterator function where it is allowable, but while that one passes all the sanity checks, I screwed up something like four times while emailing it out, so I'm not going to commit it today. So my track record is not good enough, and the stars will have to align better before that one gets committed. And it would be good to get some review too, of course, since celestial alignments are always an iffy debugging model. IOW, let's just revert the commit that caused the problem for now. Reported-by: Greg Thelen Cc: Theodore Ts'o Signed-off-by: Linus Torvalds --- fs/ext4/dir.c | 5 ----- fs/ext4/namei.c | 5 ----- 2 files changed, 10 deletions(-) (limited to 'fs') diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 4173bfe21114..561d7308b393 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -150,11 +150,6 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) while (ctx->pos < inode->i_size) { struct ext4_map_blocks map; - if (fatal_signal_pending(current)) { - err = -ERESTARTSYS; - goto errout; - } - cond_resched(); map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb); map.m_len = 1; err = ext4_map_blocks(NULL, inode, &map, 0); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index db98f89f737f..48e4b8907826 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1107,11 +1107,6 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, } while (1) { - if (signal_pending(current)) { - err = -ERESTARTSYS; - goto errout; - } - cond_resched(); block = dx_get_block(frame->at); ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo, start_hash, start_minor_hash); -- cgit v1.2.3 From d7d75352890447b55c13a81df316a6894ff32ecf Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 11 Apr 2016 15:10:11 -0700 Subject: fscrypto: use dget_parent() in fscrypt_d_revalidate() This patch updates fscrypto along with the below ext4 crypto change. Fixes: 3d43bcfef5f0 ("ext4 crypto: use dget_parent() in ext4_d_revalidate()") Cc: Theodore Ts'o Signed-off-by: Jaegeuk Kim --- fs/crypto/crypto.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 7f5804537d30..58ae0ba91ca2 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -345,13 +345,17 @@ EXPORT_SYMBOL(fscrypt_zeroout_range); */ static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) { - struct inode *dir = d_inode(dentry->d_parent); - struct fscrypt_info *ci = dir->i_crypt_info; + struct dentry *dir; + struct fscrypt_info *ci; int dir_has_key, cached_with_key; - if (!dir->i_sb->s_cop->is_encrypted(dir)) + dir = dget_parent(dentry); + if (!d_inode(dir)->i_sb->s_cop->is_encrypted(d_inode(dir))) { + dput(dir); return 0; + } + ci = d_inode(dir)->i_crypt_info; if (ci && ci->ci_keyring_key && (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | (1 << KEY_FLAG_REVOKED) | @@ -363,6 +367,7 @@ static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) cached_with_key = dentry->d_flags & DCACHE_ENCRYPTED_WITH_KEY; spin_unlock(&dentry->d_lock); dir_has_key = (ci != NULL); + dput(dir); /* * If the dentry was cached without the key, and it is a -- cgit v1.2.3 From 33b1395124c63ed4a42e33c1dd14859f9b3f29c2 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 11 Apr 2016 15:15:38 -0700 Subject: f2fs: use dget_parent and file_dentry in f2fs_file_open This patch synced with the below two ext4 crypto fixes together. In 4.6-rc1, f2fs newly introduced accessing f_path.dentry which crashes overlayfs. To fix, now we need to use file_dentry() to access that field. Fixes: c0a37d487884 ("ext4: use file_dentry()") Fixes: 9dd78d8c9a7b ("ext4: use dget_parent() in ext4_file_open()") Cc: Miklos Szeredi Cc: Theodore Ts'o Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 443e07705c2a..90d1157a09f9 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -441,7 +441,7 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) static int f2fs_file_open(struct inode *inode, struct file *filp) { int ret = generic_file_open(inode, filp); - struct inode *dir = filp->f_path.dentry->d_parent->d_inode; + struct dentry *dir; if (!ret && f2fs_encrypted_inode(inode)) { ret = fscrypt_get_encryption_info(inode); @@ -450,9 +450,13 @@ static int f2fs_file_open(struct inode *inode, struct file *filp) if (!fscrypt_has_encryption_key(inode)) return -ENOKEY; } - if (f2fs_encrypted_inode(dir) && - !fscrypt_has_permitted_context(dir, inode)) + dir = dget_parent(file_dentry(filp)); + if (f2fs_encrypted_inode(d_inode(dir)) && + !fscrypt_has_permitted_context(d_inode(dir), inode)) { + dput(dir); return -EPERM; + } + dput(dir); return ret; } -- cgit v1.2.3 From b32e4482aadfd1322357f46d4ed8a990603664d9 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 11 Apr 2016 15:51:57 -0700 Subject: fscrypto: don't let data integrity writebacks fail with ENOMEM This patch fixes the issue introduced by the ext4 crypto fix in a same manner. For F2FS, however, we flush the pending IOs and wait for a while to acquire free memory. Fixes: c9af28fdd4492 ("ext4 crypto: don't let data integrity writebacks fail with ENOMEM") Cc: Theodore Ts'o Signed-off-by: Jaegeuk Kim --- fs/crypto/crypto.c | 36 ++++++++++++++++++++---------------- fs/f2fs/data.c | 16 +++++++++++++--- include/linux/fscrypto.h | 9 +++++---- 3 files changed, 38 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 58ae0ba91ca2..da70520f3ab4 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -81,13 +81,14 @@ EXPORT_SYMBOL(fscrypt_release_ctx); /** * fscrypt_get_ctx() - Gets an encryption context * @inode: The inode for which we are doing the crypto + * @gfp_flags: The gfp flag for memory allocation * * Allocates and initializes an encryption context. * * Return: An allocated and initialized encryption context on success; error * value or NULL otherwise. */ -struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode) +struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode, gfp_t gfp_flags) { struct fscrypt_ctx *ctx = NULL; struct fscrypt_info *ci = inode->i_crypt_info; @@ -113,7 +114,7 @@ struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode) list_del(&ctx->free_list); spin_unlock_irqrestore(&fscrypt_ctx_lock, flags); if (!ctx) { - ctx = kmem_cache_zalloc(fscrypt_ctx_cachep, GFP_NOFS); + ctx = kmem_cache_zalloc(fscrypt_ctx_cachep, gfp_flags); if (!ctx) return ERR_PTR(-ENOMEM); ctx->flags |= FS_CTX_REQUIRES_FREE_ENCRYPT_FL; @@ -147,7 +148,8 @@ typedef enum { static int do_page_crypto(struct inode *inode, fscrypt_direction_t rw, pgoff_t index, - struct page *src_page, struct page *dest_page) + struct page *src_page, struct page *dest_page, + gfp_t gfp_flags) { u8 xts_tweak[FS_XTS_TWEAK_SIZE]; struct skcipher_request *req = NULL; @@ -157,7 +159,7 @@ static int do_page_crypto(struct inode *inode, struct crypto_skcipher *tfm = ci->ci_ctfm; int res = 0; - req = skcipher_request_alloc(tfm, GFP_NOFS); + req = skcipher_request_alloc(tfm, gfp_flags); if (!req) { printk_ratelimited(KERN_ERR "%s: crypto_request_alloc() failed\n", @@ -199,10 +201,9 @@ static int do_page_crypto(struct inode *inode, return 0; } -static struct page *alloc_bounce_page(struct fscrypt_ctx *ctx) +static struct page *alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags) { - ctx->w.bounce_page = mempool_alloc(fscrypt_bounce_page_pool, - GFP_NOWAIT); + ctx->w.bounce_page = mempool_alloc(fscrypt_bounce_page_pool, gfp_flags); if (ctx->w.bounce_page == NULL) return ERR_PTR(-ENOMEM); ctx->flags |= FS_WRITE_PATH_FL; @@ -213,6 +214,7 @@ static struct page *alloc_bounce_page(struct fscrypt_ctx *ctx) * fscypt_encrypt_page() - Encrypts a page * @inode: The inode for which the encryption should take place * @plaintext_page: The page to encrypt. Must be locked. + * @gfp_flags: The gfp flag for memory allocation * * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx * encryption context. @@ -225,7 +227,7 @@ static struct page *alloc_bounce_page(struct fscrypt_ctx *ctx) * error value or NULL. */ struct page *fscrypt_encrypt_page(struct inode *inode, - struct page *plaintext_page) + struct page *plaintext_page, gfp_t gfp_flags) { struct fscrypt_ctx *ctx; struct page *ciphertext_page = NULL; @@ -233,18 +235,19 @@ struct page *fscrypt_encrypt_page(struct inode *inode, BUG_ON(!PageLocked(plaintext_page)); - ctx = fscrypt_get_ctx(inode); + ctx = fscrypt_get_ctx(inode, gfp_flags); if (IS_ERR(ctx)) return (struct page *)ctx; /* The encryption operation will require a bounce page. */ - ciphertext_page = alloc_bounce_page(ctx); + ciphertext_page = alloc_bounce_page(ctx, gfp_flags); if (IS_ERR(ciphertext_page)) goto errout; ctx->w.control_page = plaintext_page; err = do_page_crypto(inode, FS_ENCRYPT, plaintext_page->index, - plaintext_page, ciphertext_page); + plaintext_page, ciphertext_page, + gfp_flags); if (err) { ciphertext_page = ERR_PTR(err); goto errout; @@ -275,7 +278,7 @@ int fscrypt_decrypt_page(struct page *page) BUG_ON(!PageLocked(page)); return do_page_crypto(page->mapping->host, - FS_DECRYPT, page->index, page, page); + FS_DECRYPT, page->index, page, page, GFP_NOFS); } EXPORT_SYMBOL(fscrypt_decrypt_page); @@ -289,11 +292,11 @@ int fscrypt_zeroout_range(struct inode *inode, pgoff_t lblk, BUG_ON(inode->i_sb->s_blocksize != PAGE_SIZE); - ctx = fscrypt_get_ctx(inode); + ctx = fscrypt_get_ctx(inode, GFP_NOFS); if (IS_ERR(ctx)) return PTR_ERR(ctx); - ciphertext_page = alloc_bounce_page(ctx); + ciphertext_page = alloc_bounce_page(ctx, GFP_NOWAIT); if (IS_ERR(ciphertext_page)) { err = PTR_ERR(ciphertext_page); goto errout; @@ -301,11 +304,12 @@ int fscrypt_zeroout_range(struct inode *inode, pgoff_t lblk, while (len--) { err = do_page_crypto(inode, FS_ENCRYPT, lblk, - ZERO_PAGE(0), ciphertext_page); + ZERO_PAGE(0), ciphertext_page, + GFP_NOFS); if (err) goto errout; - bio = bio_alloc(GFP_KERNEL, 1); + bio = bio_alloc(GFP_NOWAIT, 1); if (!bio) { err = -ENOMEM; goto errout; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 53fec0872e60..5dafb9cef12e 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -992,7 +992,7 @@ submit_and_realloc: if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { - ctx = fscrypt_get_ctx(inode); + ctx = fscrypt_get_ctx(inode, GFP_NOFS); if (IS_ERR(ctx)) goto set_error_page; @@ -1092,14 +1092,24 @@ int do_write_data_page(struct f2fs_io_info *fio) } if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { + gfp_t gfp_flags = GFP_NOFS; /* wait for GCed encrypted page writeback */ f2fs_wait_on_encrypted_page_writeback(F2FS_I_SB(inode), fio->old_blkaddr); - - fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page); +retry_encrypt: + fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page, + gfp_flags); if (IS_ERR(fio->encrypted_page)) { err = PTR_ERR(fio->encrypted_page); + if (err == -ENOMEM) { + /* flush pending ios and wait for a while */ + f2fs_flush_merged_bios(F2FS_I_SB(inode)); + congestion_wait(BLK_RW_ASYNC, HZ/50); + gfp_flags |= __GFP_NOFAIL; + err = 0; + goto retry_encrypt; + } goto out_writepage; } } diff --git a/include/linux/fscrypto.h b/include/linux/fscrypto.h index cd91f75de49b..6027f6bbb061 100644 --- a/include/linux/fscrypto.h +++ b/include/linux/fscrypto.h @@ -263,9 +263,9 @@ static inline void fscrypt_set_d_op(struct dentry *dentry) extern struct kmem_cache *fscrypt_info_cachep; int fscrypt_initialize(void); -extern struct fscrypt_ctx *fscrypt_get_ctx(struct inode *); +extern struct fscrypt_ctx *fscrypt_get_ctx(struct inode *, gfp_t); extern void fscrypt_release_ctx(struct fscrypt_ctx *); -extern struct page *fscrypt_encrypt_page(struct inode *, struct page *); +extern struct page *fscrypt_encrypt_page(struct inode *, struct page *, gfp_t); extern int fscrypt_decrypt_page(struct page *); extern void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *, struct bio *); extern void fscrypt_pullback_bio_page(struct page **, bool); @@ -299,7 +299,8 @@ extern int fscrypt_fname_usr_to_disk(struct inode *, const struct qstr *, #endif /* crypto.c */ -static inline struct fscrypt_ctx *fscrypt_notsupp_get_ctx(struct inode *i) +static inline struct fscrypt_ctx *fscrypt_notsupp_get_ctx(struct inode *i, + gfp_t f) { return ERR_PTR(-EOPNOTSUPP); } @@ -310,7 +311,7 @@ static inline void fscrypt_notsupp_release_ctx(struct fscrypt_ctx *c) } static inline struct page *fscrypt_notsupp_encrypt_page(struct inode *i, - struct page *p) + struct page *p, gfp_t f) { return ERR_PTR(-EOPNOTSUPP); } -- cgit v1.2.3 From 87243deb88671f70def4c52dfa7ca7830707bd31 Mon Sep 17 00:00:00 2001 From: Seth Forshee Date: Wed, 9 Mar 2016 09:18:07 -0600 Subject: debugfs: Make automount point inodes permanently empty Starting with 4.1 the tracing subsystem has its own filesystem which is automounted in the tracing subdirectory of debugfs. Prior to this debugfs could be bind mounted in a cloned mount namespace, but if tracefs has been mounted under debugfs this now fails because there is a locked child mount. This creates a regression for container software which bind mounts debugfs to satisfy the assumption of some userspace software. In other pseudo filesystems such as proc and sysfs we're already creating mountpoints like this in such a way that no dirents can be created in the directories, allowing them to be exceptions to some MNT_LOCKED tests. In fact we're already do this for the tracefs mountpoint in sysfs. Do the same in debugfs_create_automount(), since the intention here is clearly to create a mountpoint. This fixes the regression, as locked child mounts on permanently empty directories do not cause a bind mount to fail. Cc: stable@vger.kernel.org # v4.1+ Signed-off-by: Seth Forshee Acked-by: Serge Hallyn Signed-off-by: Greg Kroah-Hartman --- fs/debugfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index bece948b363d..8580831ed237 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -457,7 +457,7 @@ struct dentry *debugfs_create_automount(const char *name, if (unlikely(!inode)) return failed_creating(dentry); - inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; + make_empty_dir_inode(inode); inode->i_flags |= S_AUTOMOUNT; inode->i_private = data; dentry->d_fsdata = (void *)f; -- cgit v1.2.3 From 03a8bb0e53d9562276045bdfcf2b5de2e4cff5a1 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 12 Apr 2016 16:05:36 -0700 Subject: ext4/fscrypto: avoid RCU lookup in d_revalidate As Al pointed, d_revalidate should return RCU lookup before using d_inode. This was originally introduced by: commit 34286d666230 ("fs: rcu-walk aware d_revalidate method"). Reported-by: Al Viro Signed-off-by: Jaegeuk Kim Cc: Theodore Ts'o Cc: stable --- fs/crypto/crypto.c | 4 ++++ fs/ext4/crypto.c | 4 ++++ 2 files changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index da70520f3ab4..2fc8c43ce531 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -353,6 +354,9 @@ static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) struct fscrypt_info *ci; int dir_has_key, cached_with_key; + if (flags & LOOKUP_RCU) + return -ECHILD; + dir = dget_parent(dentry); if (!d_inode(dir)->i_sb->s_cop->is_encrypted(d_inode(dir))) { dput(dir); diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index db9ae6e18154..6a6c27373b54 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -32,6 +32,7 @@ #include #include #include +#include #include "ext4_extents.h" #include "xattr.h" @@ -482,6 +483,9 @@ static int ext4_d_revalidate(struct dentry *dentry, unsigned int flags) struct ext4_crypt_info *ci; int dir_has_key, cached_with_key; + if (flags & LOOKUP_RCU) + return -ECHILD; + dir = dget_parent(dentry); if (!ext4_encrypted_inode(d_inode(dir))) { dput(dir); -- cgit v1.2.3 From 34dbbcdbf63360661ff7bda6c5f52f99ac515f92 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 14 Apr 2016 11:22:00 -0700 Subject: Make file credentials available to the seqfile interfaces A lot of seqfile users seem to be using things like %pK that uses the credentials of the current process, but that is actually completely wrong for filesystem interfaces. The unix semantics for permission checking files is to check permissions at _open_ time, not at read or write time, and that is not just a small detail: passing off stdin/stdout/stderr to a suid application and making the actual IO happen in privileged context is a classic exploit technique. So if we want to be able to look at permissions at read time, we need to use the file open credentials, not the current ones. Normal file accesses can just use "f_cred" (or any of the helper functions that do that, like file_ns_capable()), but the seqfile interfaces do not have any such options. It turns out that seq_file _does_ save away the user_ns information of the file, though. Since user_ns is just part of the full credential information, replace that special case with saving off the cred pointer instead, and suddenly seq_file has all the permission information it needs. Signed-off-by: Linus Torvalds --- fs/seq_file.c | 7 ++++--- include/linux/seq_file.h | 13 ++++--------- 2 files changed, 8 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/seq_file.c b/fs/seq_file.c index e85664b7c7d9..19f532e7d35e 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -72,9 +72,10 @@ int seq_open(struct file *file, const struct seq_operations *op) mutex_init(&p->lock); p->op = op; -#ifdef CONFIG_USER_NS - p->user_ns = file->f_cred->user_ns; -#endif + + // No refcounting: the lifetime of 'p' is constrained + // to the lifetime of the file. + p->file = file; /* * Wrappers around seq_open(e.g. swaps_open) need to be diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index dde00defbaa5..f3d45dd42695 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -7,13 +7,10 @@ #include #include #include +#include +#include struct seq_operations; -struct file; -struct path; -struct inode; -struct dentry; -struct user_namespace; struct seq_file { char *buf; @@ -27,9 +24,7 @@ struct seq_file { struct mutex lock; const struct seq_operations *op; int poll_event; -#ifdef CONFIG_USER_NS - struct user_namespace *user_ns; -#endif + const struct file *file; void *private; }; @@ -147,7 +142,7 @@ int seq_release_private(struct inode *, struct file *); static inline struct user_namespace *seq_user_ns(struct seq_file *seq) { #ifdef CONFIG_USER_NS - return seq->user_ns; + return seq->file->f_cred->user_ns; #else extern struct user_namespace init_user_ns; return &init_user_ns; -- cgit v1.2.3 From 67245ff332064c01b760afa7a384ccda024bfd24 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 16 Apr 2016 15:16:07 -0700 Subject: devpts: clean up interface to pty drivers This gets rid of the horrible notion of having that struct inode *ptmx_inode be the linchpin of the interface between the pty code and devpts. By de-emphasizing the ptmx inode, a lot of things actually get cleaner, and we will have a much saner way forward. In particular, this will allow us to associate with any particular devpts instance at open-time, and not be artificially tied to one particular ptmx inode. The patch itself is actually fairly straightforward, and apart from some locking and return path cleanups it's pretty mechanical: - the interfaces that devpts exposes all take "struct pts_fs_info *" instead of "struct inode *ptmx_inode" now. NOTE! The "struct pts_fs_info" thing is a completely opaque structure as far as the pty driver is concerned: it's still declared entirely internally to devpts. So the pty code can't actually access it in any way, just pass it as a "cookie" to the devpts code. - the "look up the pts fs info" is now a single clear operation, that also does the reference count increment on the pts superblock. So "devpts_add/del_ref()" is gone, and replaced by a "lookup and get ref" operation (devpts_get_ref(inode)), along with a "put ref" op (devpts_put_ref()). - the pty master "tty->driver_data" field now contains the pts_fs_info, not the ptmx inode. - because we don't care about the ptmx inode any more as some kind of base index, the ref counting can now drop the inode games - it just gets the ref on the superblock. - the pts_fs_info now has a back-pointer to the super_block. That's so that we can easily look up the information we actually need. Although quite often, the pts fs info was actually all we wanted, and not having to look it up based on some magical inode makes things more straightforward. In particular, now that "devpts_get_ref(inode)" operation should really be the *only* place we need to look up what devpts instance we're associated with, and we do it exactly once, at ptmx_open() time. The other side of this is that one ptmx node could now be associated with multiple different devpts instances - you could have a single /dev/ptmx node, and then have multiple mount namespaces with their own instances of devpts mounted on /dev/pts/. And that's all perfectly sane in a model where we just look up the pts instance at open time. This will eventually allow us to get rid of our odd single-vs-multiple pts instance model, but this patch in itself changes no semantics, only an internal binding model. Cc: Eric Biederman Cc: Peter Anvin Cc: Andy Lutomirski Cc: Al Viro Cc: Peter Hurley Cc: Serge Hallyn Cc: Willy Tarreau Cc: Aurelien Jarno Cc: Alan Cox Cc: Jann Horn Cc: Greg KH Cc: Jiri Slaby Cc: Florian Weimer Signed-off-by: Linus Torvalds --- drivers/tty/pty.c | 63 ++++++++++++++++++++++------------------------- fs/devpts/inode.c | 49 ++++++++++++++++++------------------ include/linux/devpts_fs.h | 34 ++++++++----------------- 3 files changed, 64 insertions(+), 82 deletions(-) (limited to 'fs') diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c index e16a49b507ef..0058d9fbf931 100644 --- a/drivers/tty/pty.c +++ b/drivers/tty/pty.c @@ -663,14 +663,14 @@ static int pty_unix98_install(struct tty_driver *driver, struct tty_struct *tty) /* this is called once with whichever end is closed last */ static void pty_unix98_remove(struct tty_driver *driver, struct tty_struct *tty) { - struct inode *ptmx_inode; + struct pts_fs_info *fsi; if (tty->driver->subtype == PTY_TYPE_MASTER) - ptmx_inode = tty->driver_data; + fsi = tty->driver_data; else - ptmx_inode = tty->link->driver_data; - devpts_kill_index(ptmx_inode, tty->index); - devpts_del_ref(ptmx_inode); + fsi = tty->link->driver_data; + devpts_kill_index(fsi, tty->index); + devpts_put_ref(fsi); } static const struct tty_operations ptm_unix98_ops = { @@ -720,6 +720,7 @@ static const struct tty_operations pty_unix98_ops = { static int ptmx_open(struct inode *inode, struct file *filp) { + struct pts_fs_info *fsi; struct tty_struct *tty; struct inode *slave_inode; int retval; @@ -734,47 +735,41 @@ static int ptmx_open(struct inode *inode, struct file *filp) if (retval) return retval; + fsi = devpts_get_ref(inode, filp); + retval = -ENODEV; + if (!fsi) + goto out_free_file; + /* find a device that is not in use. */ mutex_lock(&devpts_mutex); - index = devpts_new_index(inode); - if (index < 0) { - retval = index; - mutex_unlock(&devpts_mutex); - goto err_file; - } - + index = devpts_new_index(fsi); mutex_unlock(&devpts_mutex); - mutex_lock(&tty_mutex); - tty = tty_init_dev(ptm_driver, index); + retval = index; + if (index < 0) + goto out_put_ref; - if (IS_ERR(tty)) { - retval = PTR_ERR(tty); - goto out; - } + mutex_lock(&tty_mutex); + tty = tty_init_dev(ptm_driver, index); /* The tty returned here is locked so we can safely drop the mutex */ mutex_unlock(&tty_mutex); - set_bit(TTY_PTY_LOCK, &tty->flags); /* LOCK THE SLAVE */ - tty->driver_data = inode; + retval = PTR_ERR(tty); + if (IS_ERR(tty)) + goto out; /* - * In the case where all references to ptmx inode are dropped and we - * still have /dev/tty opened pointing to the master/slave pair (ptmx - * is closed/released before /dev/tty), we must make sure that the inode - * is still valid when we call the final pty_unix98_shutdown, thus we - * hold an additional reference to the ptmx inode. For the same /dev/tty - * last close case, we also need to make sure the super_block isn't - * destroyed (devpts instance unmounted), before /dev/tty is closed and - * on its release devpts_kill_index is called. + * From here on out, the tty is "live", and the index and + * fsi will be killed/put by the tty_release() */ - devpts_add_ref(inode); + set_bit(TTY_PTY_LOCK, &tty->flags); /* LOCK THE SLAVE */ + tty->driver_data = fsi; tty_add_file(tty, filp); - slave_inode = devpts_pty_new(inode, + slave_inode = devpts_pty_new(fsi, MKDEV(UNIX98_PTY_SLAVE_MAJOR, index), index, tty->link); if (IS_ERR(slave_inode)) { @@ -793,12 +788,14 @@ static int ptmx_open(struct inode *inode, struct file *filp) return 0; err_release: tty_unlock(tty); + // This will also put-ref the fsi tty_release(inode, filp); return retval; out: - mutex_unlock(&tty_mutex); - devpts_kill_index(inode, index); -err_file: + devpts_kill_index(fsi, index); +out_put_ref: + devpts_put_ref(fsi); +out_free_file: tty_free_file(filp); return retval; } diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 655f21f99160..0af8e7d70d27 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -128,6 +128,7 @@ static const match_table_t tokens = { struct pts_fs_info { struct ida allocated_ptys; struct pts_mount_opts mount_opts; + struct super_block *sb; struct dentry *ptmx_dentry; }; @@ -358,7 +359,7 @@ static const struct super_operations devpts_sops = { .show_options = devpts_show_options, }; -static void *new_pts_fs_info(void) +static void *new_pts_fs_info(struct super_block *sb) { struct pts_fs_info *fsi; @@ -369,6 +370,7 @@ static void *new_pts_fs_info(void) ida_init(&fsi->allocated_ptys); fsi->mount_opts.mode = DEVPTS_DEFAULT_MODE; fsi->mount_opts.ptmxmode = DEVPTS_DEFAULT_PTMX_MODE; + fsi->sb = sb; return fsi; } @@ -384,7 +386,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent) s->s_op = &devpts_sops; s->s_time_gran = 1; - s->s_fs_info = new_pts_fs_info(); + s->s_fs_info = new_pts_fs_info(s); if (!s->s_fs_info) goto fail; @@ -524,17 +526,14 @@ static struct file_system_type devpts_fs_type = { * to the System V naming convention */ -int devpts_new_index(struct inode *ptmx_inode) +int devpts_new_index(struct pts_fs_info *fsi) { - struct super_block *sb = pts_sb_from_inode(ptmx_inode); - struct pts_fs_info *fsi; int index; int ida_ret; - if (!sb) + if (!fsi) return -ENODEV; - fsi = DEVPTS_SB(sb); retry: if (!ida_pre_get(&fsi->allocated_ptys, GFP_KERNEL)) return -ENOMEM; @@ -564,11 +563,8 @@ retry: return index; } -void devpts_kill_index(struct inode *ptmx_inode, int idx) +void devpts_kill_index(struct pts_fs_info *fsi, int idx) { - struct super_block *sb = pts_sb_from_inode(ptmx_inode); - struct pts_fs_info *fsi = DEVPTS_SB(sb); - mutex_lock(&allocated_ptys_lock); ida_remove(&fsi->allocated_ptys, idx); pty_count--; @@ -578,21 +574,25 @@ void devpts_kill_index(struct inode *ptmx_inode, int idx) /* * pty code needs to hold extra references in case of last /dev/tty close */ - -void devpts_add_ref(struct inode *ptmx_inode) +struct pts_fs_info *devpts_get_ref(struct inode *ptmx_inode, struct file *file) { - struct super_block *sb = pts_sb_from_inode(ptmx_inode); + struct super_block *sb; + struct pts_fs_info *fsi; + + sb = pts_sb_from_inode(ptmx_inode); + if (!sb) + return NULL; + fsi = DEVPTS_SB(sb); + if (!fsi) + return NULL; atomic_inc(&sb->s_active); - ihold(ptmx_inode); + return fsi; } -void devpts_del_ref(struct inode *ptmx_inode) +void devpts_put_ref(struct pts_fs_info *fsi) { - struct super_block *sb = pts_sb_from_inode(ptmx_inode); - - iput(ptmx_inode); - deactivate_super(sb); + deactivate_super(fsi->sb); } /** @@ -604,22 +604,21 @@ void devpts_del_ref(struct inode *ptmx_inode) * * The created inode is returned. Remove it from /dev/pts/ by devpts_pty_kill. */ -struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index, +struct inode *devpts_pty_new(struct pts_fs_info *fsi, dev_t device, int index, void *priv) { struct dentry *dentry; - struct super_block *sb = pts_sb_from_inode(ptmx_inode); + struct super_block *sb; struct inode *inode; struct dentry *root; - struct pts_fs_info *fsi; struct pts_mount_opts *opts; char s[12]; - if (!sb) + if (!fsi) return ERR_PTR(-ENODEV); + sb = fsi->sb; root = sb->s_root; - fsi = DEVPTS_SB(sb); opts = &fsi->mount_opts; inode = new_inode(sb); diff --git a/include/linux/devpts_fs.h b/include/linux/devpts_fs.h index e0ee0b3000b2..358a4db72a27 100644 --- a/include/linux/devpts_fs.h +++ b/include/linux/devpts_fs.h @@ -15,38 +15,24 @@ #include +struct pts_fs_info; + #ifdef CONFIG_UNIX98_PTYS -int devpts_new_index(struct inode *ptmx_inode); -void devpts_kill_index(struct inode *ptmx_inode, int idx); -void devpts_add_ref(struct inode *ptmx_inode); -void devpts_del_ref(struct inode *ptmx_inode); +/* Look up a pts fs info and get a ref to it */ +struct pts_fs_info *devpts_get_ref(struct inode *, struct file *); +void devpts_put_ref(struct pts_fs_info *); + +int devpts_new_index(struct pts_fs_info *); +void devpts_kill_index(struct pts_fs_info *, int); + /* mknod in devpts */ -struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index, - void *priv); +struct inode *devpts_pty_new(struct pts_fs_info *, dev_t, int, void *); /* get private structure */ void *devpts_get_priv(struct inode *pts_inode); /* unlink */ void devpts_pty_kill(struct inode *inode); -#else - -/* Dummy stubs in the no-pty case */ -static inline int devpts_new_index(struct inode *ptmx_inode) { return -EINVAL; } -static inline void devpts_kill_index(struct inode *ptmx_inode, int idx) { } -static inline void devpts_add_ref(struct inode *ptmx_inode) { } -static inline void devpts_del_ref(struct inode *ptmx_inode) { } -static inline struct inode *devpts_pty_new(struct inode *ptmx_inode, - dev_t device, int index, void *priv) -{ - return ERR_PTR(-EINVAL); -} -static inline void *devpts_get_priv(struct inode *pts_inode) -{ - return NULL; -} -static inline void devpts_pty_kill(struct inode *inode) { } - #endif -- cgit v1.2.3