From 560d388950ceda5e7c7cdef7f3d9a8ff297bbf9d Mon Sep 17 00:00:00 2001 From: Rabin Vincent Date: Wed, 3 May 2017 17:17:21 +0200 Subject: CIFS: silence lockdep splat in cifs_relock_file() cifs_relock_file() can perform a down_write() on the inode's lock_sem even though it was already performed in cifs_strict_readv(). Lockdep complains about this. AFAICS, there is no problem here, and lockdep just needs to be told that this nesting is OK. ============================================= [ INFO: possible recursive locking detected ] 4.11.0+ #20 Not tainted --------------------------------------------- cat/701 is trying to acquire lock: (&cifsi->lock_sem){++++.+}, at: cifs_reopen_file+0x7a7/0xc00 but task is already holding lock: (&cifsi->lock_sem){++++.+}, at: cifs_strict_readv+0x177/0x310 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&cifsi->lock_sem); lock(&cifsi->lock_sem); *** DEADLOCK *** May be due to missing lock nesting notation 1 lock held by cat/701: #0: (&cifsi->lock_sem){++++.+}, at: cifs_strict_readv+0x177/0x310 stack backtrace: CPU: 0 PID: 701 Comm: cat Not tainted 4.11.0+ #20 Call Trace: dump_stack+0x85/0xc2 __lock_acquire+0x17dd/0x2260 ? trace_hardirqs_on_thunk+0x1a/0x1c ? preempt_schedule_irq+0x6b/0x80 lock_acquire+0xcc/0x260 ? lock_acquire+0xcc/0x260 ? cifs_reopen_file+0x7a7/0xc00 down_read+0x2d/0x70 ? cifs_reopen_file+0x7a7/0xc00 cifs_reopen_file+0x7a7/0xc00 ? printk+0x43/0x4b cifs_readpage_worker+0x327/0x8a0 cifs_readpage+0x8c/0x2a0 generic_file_read_iter+0x692/0xd00 cifs_strict_readv+0x29f/0x310 generic_file_splice_read+0x11c/0x1c0 do_splice_to+0xa5/0xc0 splice_direct_to_actor+0xfa/0x350 ? generic_pipe_buf_nosteal+0x10/0x10 do_splice_direct+0xb5/0xe0 do_sendfile+0x278/0x3a0 SyS_sendfile64+0xc4/0xe0 entry_SYSCALL_64_fastpath+0x1f/0xbe Signed-off-by: Rabin Vincent Acked-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 6ef78ad838e6..0fd081bd2a2f 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -582,7 +582,7 @@ cifs_relock_file(struct cifsFileInfo *cfile) struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); int rc = 0; - down_read(&cinode->lock_sem); + down_read_nested(&cinode->lock_sem, SINGLE_DEPTH_NESTING); if (cinode->can_cache_brlcks) { /* can cache locks - no need to relock */ up_read(&cinode->lock_sem); -- cgit v1.2.3 From de1892b887eeb85ce458a93979c2108e6f329618 Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 4 May 2017 07:54:04 -0500 Subject: Don't delay freeing mids when blocked on slow socket write of request When processing responses, and in particular freeing mids (DeleteMidQEntry), which is very important since it also frees the associated buffers (cifs_buf_release), we can block a long time if (writes to) socket is slow due to low memory or networking issues. We can block in send (smb request) waiting for memory, and be blocked in processing responess (which could free memory if we let it) - since they both grab the server->srv_mutex. In practice, in the DeleteMidQEntry case - there is no reason we need to grab the srv_mutex so remove these around DeleteMidQEntry, and it allows us to free memory faster. Signed-off-by: Steve French Acked-by: Pavel Shilovsky --- fs/cifs/cifssmb.c | 7 ------- fs/cifs/smb2pdu.c | 7 ------- fs/cifs/transport.c | 2 -- 3 files changed, 16 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 4c01b3f9abf0..4de3186d8a71 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -697,9 +697,7 @@ cifs_echo_callback(struct mid_q_entry *mid) { struct TCP_Server_Info *server = mid->callback_data; - mutex_lock(&server->srv_mutex); DeleteMidQEntry(mid); - mutex_unlock(&server->srv_mutex); add_credits(server, 1, CIFS_ECHO_OP); } @@ -1599,9 +1597,7 @@ cifs_readv_callback(struct mid_q_entry *mid) } queue_work(cifsiod_wq, &rdata->work); - mutex_lock(&server->srv_mutex); DeleteMidQEntry(mid); - mutex_unlock(&server->srv_mutex); add_credits(server, 1, 0); } @@ -2058,7 +2054,6 @@ cifs_writev_callback(struct mid_q_entry *mid) { struct cifs_writedata *wdata = mid->callback_data; struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink); - struct TCP_Server_Info *server = tcon->ses->server; unsigned int written; WRITE_RSP *smb = (WRITE_RSP *)mid->resp_buf; @@ -2095,9 +2090,7 @@ cifs_writev_callback(struct mid_q_entry *mid) } queue_work(cifsiod_wq, &wdata->work); - mutex_lock(&server->srv_mutex); DeleteMidQEntry(mid); - mutex_unlock(&server->srv_mutex); add_credits(tcon->ses->server, 1, 0); } diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 48ff7703b919..e0517f499c38 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -2173,9 +2173,7 @@ smb2_echo_callback(struct mid_q_entry *mid) if (mid->mid_state == MID_RESPONSE_RECEIVED) credits_received = le16_to_cpu(rsp->hdr.sync_hdr.CreditRequest); - mutex_lock(&server->srv_mutex); DeleteMidQEntry(mid); - mutex_unlock(&server->srv_mutex); add_credits(server, credits_received, CIFS_ECHO_OP); } @@ -2433,9 +2431,7 @@ smb2_readv_callback(struct mid_q_entry *mid) cifs_stats_fail_inc(tcon, SMB2_READ_HE); queue_work(cifsiod_wq, &rdata->work); - mutex_lock(&server->srv_mutex); DeleteMidQEntry(mid); - mutex_unlock(&server->srv_mutex); add_credits(server, credits_received, 0); } @@ -2594,7 +2590,6 @@ smb2_writev_callback(struct mid_q_entry *mid) { struct cifs_writedata *wdata = mid->callback_data; struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink); - struct TCP_Server_Info *server = tcon->ses->server; unsigned int written; struct smb2_write_rsp *rsp = (struct smb2_write_rsp *)mid->resp_buf; unsigned int credits_received = 1; @@ -2634,9 +2629,7 @@ smb2_writev_callback(struct mid_q_entry *mid) cifs_stats_fail_inc(tcon, SMB2_WRITE_HE); queue_work(cifsiod_wq, &wdata->work); - mutex_lock(&server->srv_mutex); DeleteMidQEntry(mid); - mutex_unlock(&server->srv_mutex); add_credits(tcon->ses->server, credits_received, 0); } diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 4d64b5b8fc9c..de589d0d3739 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -613,9 +613,7 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server) } spin_unlock(&GlobalMid_Lock); - mutex_lock(&server->srv_mutex); DeleteMidQEntry(mid); - mutex_unlock(&server->srv_mutex); return rc; } -- cgit v1.2.3 From ecdcf622eb74b52cebde1387a7a1852a787d8050 Mon Sep 17 00:00:00 2001 From: Joe Perches via samba-technical Date: Sun, 7 May 2017 01:31:47 -0700 Subject: cifs: cifsacl: Use a temporary ops variable to reduce code length Create an ops variable to store tcon->ses->server->ops and cache indirections and reduce code size a trivial bit. $ size fs/cifs/cifsacl.o* text data bss dec hex filename 5338 136 8 5482 156a fs/cifs/cifsacl.o.new 5371 136 8 5515 158b fs/cifs/cifsacl.o.old Signed-off-by: Joe Perches Acked-by: Shirish Pargaonkar Signed-off-by: Steve French --- fs/cifs/cifsacl.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 15bac390dff9..b98436f5c7c7 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -1135,20 +1135,19 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, u32 acllen = 0; int rc = 0; struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); - struct cifs_tcon *tcon; + struct smb_version_operations *ops; cifs_dbg(NOISY, "converting ACL to mode for %s\n", path); if (IS_ERR(tlink)) return PTR_ERR(tlink); - tcon = tlink_tcon(tlink); - if (pfid && (tcon->ses->server->ops->get_acl_by_fid)) - pntsd = tcon->ses->server->ops->get_acl_by_fid(cifs_sb, pfid, - &acllen); - else if (tcon->ses->server->ops->get_acl) - pntsd = tcon->ses->server->ops->get_acl(cifs_sb, inode, path, - &acllen); + ops = tlink_tcon(tlink)->ses->server->ops; + + if (pfid && (ops->get_acl_by_fid)) + pntsd = ops->get_acl_by_fid(cifs_sb, pfid, &acllen); + else if (ops->get_acl) + pntsd = ops->get_acl(cifs_sb, inode, path, &acllen); else { cifs_put_tlink(tlink); return -EOPNOTSUPP; @@ -1181,23 +1180,23 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode, struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); - struct cifs_tcon *tcon; + struct smb_version_operations *ops; if (IS_ERR(tlink)) return PTR_ERR(tlink); - tcon = tlink_tcon(tlink); + + ops = tlink_tcon(tlink)->ses->server->ops; cifs_dbg(NOISY, "set ACL from mode for %s\n", path); /* Get the security descriptor */ - if (tcon->ses->server->ops->get_acl == NULL) { + if (ops->get_acl == NULL) { cifs_put_tlink(tlink); return -EOPNOTSUPP; } - pntsd = tcon->ses->server->ops->get_acl(cifs_sb, inode, path, - &secdesclen); + pntsd = ops->get_acl(cifs_sb, inode, path, &secdesclen); if (IS_ERR(pntsd)) { rc = PTR_ERR(pntsd); cifs_dbg(VFS, "%s: error %d getting sec desc\n", __func__, rc); @@ -1224,13 +1223,12 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode, cifs_dbg(NOISY, "build_sec_desc rc: %d\n", rc); - if (tcon->ses->server->ops->set_acl == NULL) + if (ops->set_acl == NULL) rc = -EOPNOTSUPP; if (!rc) { /* Set the security descriptor */ - rc = tcon->ses->server->ops->set_acl(pnntsd, secdesclen, inode, - path, aclflag); + rc = ops->set_acl(pnntsd, secdesclen, inode, path, aclflag); cifs_dbg(NOISY, "set_cifs_acl rc: %d\n", rc); } cifs_put_tlink(tlink); -- cgit v1.2.3 From cd1230070ae1c12fd34cf6a557bfa81bf9311009 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Fri, 12 May 2017 17:59:32 +0200 Subject: SMB2: Fix share type handling In fs/cifs/smb2pdu.h, we have: #define SMB2_SHARE_TYPE_DISK 0x01 #define SMB2_SHARE_TYPE_PIPE 0x02 #define SMB2_SHARE_TYPE_PRINT 0x03 Knowing that, with the current code, the SMB2_SHARE_TYPE_PRINT case can never trigger and printer share would be interpreted as disk share. So, test the ShareType value for equality instead. Fixes: faaf946a7d5b ("CIFS: Add tree connect/disconnect capability for SMB2") Signed-off-by: Christophe JAILLET Acked-by: Aurelien Aptel Signed-off-by: Steve French --- fs/cifs/smb2pdu.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index e0517f499c38..e4afdaae743f 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1240,15 +1240,19 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, goto tcon_exit; } - if (rsp->ShareType & SMB2_SHARE_TYPE_DISK) + switch (rsp->ShareType) { + case SMB2_SHARE_TYPE_DISK: cifs_dbg(FYI, "connection to disk share\n"); - else if (rsp->ShareType & SMB2_SHARE_TYPE_PIPE) { + break; + case SMB2_SHARE_TYPE_PIPE: tcon->ipc = true; cifs_dbg(FYI, "connection to pipe share\n"); - } else if (rsp->ShareType & SMB2_SHARE_TYPE_PRINT) { - tcon->print = true; + break; + case SMB2_SHARE_TYPE_PRINT: + tcon->ipc = true; cifs_dbg(FYI, "connection to printer\n"); - } else { + break; + default: cifs_dbg(VFS, "unknown share type %d\n", rsp->ShareType); rc = -EOPNOTSUPP; goto tcon_error_exit; -- cgit v1.2.3 From 4328fea77ca30ef6af938ae3f263a3d055a9c0e6 Mon Sep 17 00:00:00 2001 From: Karim Eshapa Date: Fri, 12 May 2017 01:53:38 +0200 Subject: fs: cifs: transport: Use time_after for time comparison Use time_after kernel macro for time comparison that has safety check. Signed-off-by: Karim Eshapa Signed-off-by: Steve French --- fs/cifs/transport.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index de589d0d3739..47a125ece11e 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -94,7 +94,7 @@ DeleteMidQEntry(struct mid_q_entry *midEntry) now = jiffies; /* commands taking longer than one second are indications that something is wrong, unless it is quite a slow link or server */ - if ((now - midEntry->when_alloc) > HZ) { + if (time_after(now, midEntry->when_alloc + HZ)) { if ((cifsFYI & CIFS_TIMER) && (midEntry->command != command)) { pr_debug(" CIFS slow rsp: cmd %d mid %llu", midEntry->command, midEntry->mid); -- cgit v1.2.3 From 67b4c889cc835a2a6e2ff4e20544a33e37e2875d Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 12 May 2017 20:59:10 -0500 Subject: [CIFS] Minor cleanup of xattr query function Some minor cleanup of cifs query xattr functions (will also make SMB3 xattr implementation cleaner as well). Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 2 +- fs/cifs/cifsproto.h | 3 +-- fs/cifs/cifssmb.c | 4 +++- fs/cifs/inode.c | 3 +-- fs/cifs/xattr.c | 6 ++---- 5 files changed, 8 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 8be55be70faf..bcc7d9acad64 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -418,7 +418,7 @@ struct smb_version_operations { int (*validate_negotiate)(const unsigned int, struct cifs_tcon *); ssize_t (*query_all_EAs)(const unsigned int, struct cifs_tcon *, const unsigned char *, const unsigned char *, char *, - size_t, const struct nls_table *, int); + size_t, struct cifs_sb_info *); int (*set_EA)(const unsigned int, struct cifs_tcon *, const char *, const char *, const void *, const __u16, const struct nls_table *, int); diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index e49958c3f8bb..6eb3147132e3 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -480,8 +480,7 @@ extern int CIFSSMBCopy(unsigned int xid, extern ssize_t CIFSSMBQAllEAs(const unsigned int xid, struct cifs_tcon *tcon, const unsigned char *searchName, const unsigned char *ea_name, char *EAData, - size_t bufsize, const struct nls_table *nls_codepage, - int remap_special_chars); + size_t bufsize, struct cifs_sb_info *cifs_sb); extern int CIFSSMBSetEA(const unsigned int xid, struct cifs_tcon *tcon, const char *fileName, const char *ea_name, const void *ea_value, const __u16 ea_value_len, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 4de3186d8a71..fbb0d4cbda41 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -6069,11 +6069,13 @@ ssize_t CIFSSMBQAllEAs(const unsigned int xid, struct cifs_tcon *tcon, const unsigned char *searchName, const unsigned char *ea_name, char *EAData, size_t buf_size, - const struct nls_table *nls_codepage, int remap) + struct cifs_sb_info *cifs_sb) { /* BB assumes one setup word */ TRANSACTION2_QPI_REQ *pSMB = NULL; TRANSACTION2_QPI_RSP *pSMBr = NULL; + int remap = cifs_remap(cifs_sb); + struct nls_table *nls_codepage = cifs_sb->local_nls; int rc = 0; int bytes_returned; int list_len; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index c3b2fa0b2ec8..4d1fcd76d022 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -563,8 +563,7 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path, rc = tcon->ses->server->ops->query_all_EAs(xid, tcon, path, "SETFILEBITS", ea_value, 4 /* size of buf */, - cifs_sb->local_nls, - cifs_remap(cifs_sb)); + cifs_sb); cifs_put_tlink(tlink); if (rc < 0) return (int)rc; diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 20af5187ba63..3cb5c9e2d4e7 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -235,8 +235,7 @@ static int cifs_xattr_get(const struct xattr_handler *handler, if (pTcon->ses->server->ops->query_all_EAs) rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon, - full_path, name, value, size, - cifs_sb->local_nls, cifs_remap(cifs_sb)); + full_path, name, value, size, cifs_sb); break; case XATTR_CIFS_ACL: { @@ -336,8 +335,7 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size) if (pTcon->ses->server->ops->query_all_EAs) rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon, - full_path, NULL, data, buf_size, - cifs_sb->local_nls, cifs_remap(cifs_sb)); + full_path, NULL, data, buf_size, cifs_sb); list_ea_exit: kfree(full_path); free_xid(xid); -- cgit v1.2.3 From f5705aa8cfed142d980ecac12bee0d81b756479e Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 13 May 2017 16:31:05 -0700 Subject: dax, xfs, ext4: compile out iomap-dax paths in the FS_DAX=n case Tetsuo reports: fs/built-in.o: In function `xfs_file_iomap_end': xfs_iomap.c:(.text+0xe0ef9): undefined reference to `put_dax' fs/built-in.o: In function `xfs_file_iomap_begin': xfs_iomap.c:(.text+0xe1a7f): undefined reference to `dax_get_by_host' make: *** [vmlinux] Error 1 $ grep DAX .config CONFIG_DAX=m # CONFIG_DEV_DAX is not set # CONFIG_FS_DAX is not set When FS_DAX=n we can/must throw away the dax code in filesystems. Implement 'fs_' versions of dax_get_by_host() and put_dax() that are nops in the FS_DAX=n case. Cc: Cc: Cc: Jan Kara Cc: "Theodore Ts'o" Cc: "Darrick J. Wong" Cc: Ross Zwisler Tested-by: Tony Luck Fixes: ef51042472f5 ("block, dax: move 'select DAX' from BLOCK to FS_DAX") Reported-by: Tetsuo Handa Signed-off-by: Dan Williams --- fs/ext2/inode.c | 4 ++-- fs/ext4/inode.c | 4 ++-- fs/xfs/xfs_iomap.c | 4 ++-- include/linux/dax.h | 34 +++++++++++++++++++++++++++------- 4 files changed, 33 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 26d77f9f8c12..2dcbd5698884 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -817,7 +817,7 @@ static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length, iomap->bdev = bdev; iomap->offset = (u64)first_block << blkbits; if (blk_queue_dax(bdev->bd_queue)) - iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); + iomap->dax_dev = fs_dax_get_by_host(bdev->bd_disk->disk_name); else iomap->dax_dev = NULL; @@ -841,7 +841,7 @@ static int ext2_iomap_end(struct inode *inode, loff_t offset, loff_t length, ssize_t written, unsigned flags, struct iomap *iomap) { - put_dax(iomap->dax_dev); + fs_put_dax(iomap->dax_dev); if (iomap->type == IOMAP_MAPPED && written < length && (flags & IOMAP_WRITE)) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5834c4d76be8..1bd0bfa547f6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3412,7 +3412,7 @@ retry: bdev = inode->i_sb->s_bdev; iomap->bdev = bdev; if (blk_queue_dax(bdev->bd_queue)) - iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); + iomap->dax_dev = fs_dax_get_by_host(bdev->bd_disk->disk_name); else iomap->dax_dev = NULL; iomap->offset = first_block << blkbits; @@ -3447,7 +3447,7 @@ static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, int blkbits = inode->i_blkbits; bool truncate = false; - put_dax(iomap->dax_dev); + fs_put_dax(iomap->dax_dev); if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT)) return 0; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index a63f61c256bd..94e5bdf7304c 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1068,7 +1068,7 @@ xfs_file_iomap_begin( /* optionally associate a dax device with the iomap bdev */ bdev = iomap->bdev; if (blk_queue_dax(bdev->bd_queue)) - iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); + iomap->dax_dev = fs_dax_get_by_host(bdev->bd_disk->disk_name); else iomap->dax_dev = NULL; @@ -1149,7 +1149,7 @@ xfs_file_iomap_end( unsigned flags, struct iomap *iomap) { - put_dax(iomap->dax_dev); + fs_put_dax(iomap->dax_dev); if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC) return xfs_file_iomap_end_delalloc(XFS_I(inode), offset, length, written, iomap); diff --git a/include/linux/dax.h b/include/linux/dax.h index 00ebac854bb7..5ec1f6c47716 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -18,6 +18,20 @@ struct dax_operations { void **, pfn_t *); }; +#if IS_ENABLED(CONFIG_DAX) +struct dax_device *dax_get_by_host(const char *host); +void put_dax(struct dax_device *dax_dev); +#else +static inline struct dax_device *dax_get_by_host(const char *host) +{ + return NULL; +} + +static inline void put_dax(struct dax_device *dax_dev) +{ +} +#endif + int bdev_dax_pgoff(struct block_device *, sector_t, size_t, pgoff_t *pgoff); #if IS_ENABLED(CONFIG_FS_DAX) int __bdev_dax_supported(struct super_block *sb, int blocksize); @@ -25,23 +39,29 @@ static inline int bdev_dax_supported(struct super_block *sb, int blocksize) { return __bdev_dax_supported(sb, blocksize); } + +static inline struct dax_device *fs_dax_get_by_host(const char *host) +{ + return dax_get_by_host(host); +} + +static inline void fs_put_dax(struct dax_device *dax_dev) +{ + put_dax(dax_dev); +} + #else static inline int bdev_dax_supported(struct super_block *sb, int blocksize) { return -EOPNOTSUPP; } -#endif -#if IS_ENABLED(CONFIG_DAX) -struct dax_device *dax_get_by_host(const char *host); -void put_dax(struct dax_device *dax_dev); -#else -static inline struct dax_device *dax_get_by_host(const char *host) +static inline struct dax_device *fs_dax_get_by_host(const char *host) { return NULL; } -static inline void put_dax(struct dax_device *dax_dev) +static inline void fs_put_dax(struct dax_device *dax_dev) { } #endif -- cgit v1.2.3 From 0daaecacb83bc6b656a56393ab77a31c28139bc7 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 12 May 2017 10:44:08 -0700 Subject: xfs: fix indlen accounting error on partial delalloc conversion The delalloc -> real block conversion path uses an incorrect calculation in the case where the middle part of a delalloc extent is being converted. This is documented as a rare situation because XFS generally attempts to maximize contiguity by converting as much of a delalloc extent as possible. If this situation does occur, the indlen reservation for the two new delalloc extents left behind by the conversion of the middle range is calculated and compared with the original reservation. If more blocks are required, the delta is allocated from the global block pool. This delta value can be characterized as the difference between the new total requirement (temp + temp2) and the currently available reservation minus those blocks that have already been allocated (startblockval(PREV.br_startblock) - allocated). The problem is that the current code does not account for previously allocated blocks correctly. It subtracts the current allocation count from the (new - old) delta rather than the old indlen reservation. This means that more indlen blocks than have been allocated end up stashed in the remaining extents and free space accounting is broken as a result. Fix up the calculation to subtract the allocated block count from the original extent indlen and thus correctly allocate the reservation delta based on the difference between the new total requirement and the unused blocks from the original reservation. Also remove a bogus assert that contradicts the fact that the new indlen reservation can be larger than the original indlen reservation. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index f02eb7673392..8adb91b05588 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -2065,8 +2065,10 @@ xfs_bmap_add_extent_delay_real( } temp = xfs_bmap_worst_indlen(bma->ip, temp); temp2 = xfs_bmap_worst_indlen(bma->ip, temp2); - diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) - - (bma->cur ? bma->cur->bc_private.b.allocated : 0)); + diff = (int)(temp + temp2 - + (startblockval(PREV.br_startblock) - + (bma->cur ? + bma->cur->bc_private.b.allocated : 0))); if (diff > 0) { error = xfs_mod_fdblocks(bma->ip->i_mount, -((int64_t)diff), false); @@ -2123,7 +2125,6 @@ xfs_bmap_add_extent_delay_real( temp = da_new; if (bma->cur) temp += bma->cur->bc_private.b.allocated; - ASSERT(temp <= da_old); if (temp < da_old) xfs_mod_fdblocks(bma->ip->i_mount, (int64_t)(da_old - temp), false); -- cgit v1.2.3 From 6eadbf4c8ba816c10d1c97bed9aa861d9fd17809 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 12 May 2017 10:44:08 -0700 Subject: xfs: BMAPX shouldn't barf on inline-format directories When we're fulfilling a BMAPX request, jump out early if the data fork is in local format. This prevents us from hitting a debugging check in bmapi_read and barfing errors back to userspace. The on-disk extent count check later isn't sufficient for IF_DELALLOC mode because da extents are in memory and not on disk. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_bmap_util.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 2b954308a1d6..2e8851ee6759 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -582,9 +582,13 @@ xfs_getbmap( } break; default: + /* Local format data forks report no extents. */ + if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + bmv->bmv_entries = 0; + return 0; + } if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS && - ip->i_d.di_format != XFS_DINODE_FMT_BTREE && - ip->i_d.di_format != XFS_DINODE_FMT_LOCAL) + ip->i_d.di_format != XFS_DINODE_FMT_BTREE) return -EINVAL; if (xfs_get_extsz_hint(ip) || -- cgit v1.2.3 From 6e747506dde195d3d05fe2bb8ef78aceba28a5e3 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 12 May 2017 10:44:11 -0700 Subject: xfs: fix warnings about unused stack variables Reduce stack usage and get rid of compiler warnings by eliminating unused variables. Signed-off-by: Darrick J. Wong Reviewed-by: Carlos Maiolino --- fs/xfs/libxfs/xfs_bmap.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 8adb91b05588..a7048eafa8e6 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1280,7 +1280,6 @@ xfs_bmap_read_extents( xfs_bmbt_rec_t *frp; xfs_fsblock_t nextbno; xfs_extnum_t num_recs; - xfs_extnum_t start; num_recs = xfs_btree_get_numrecs(block); if (unlikely(i + num_recs > room)) { @@ -1303,7 +1302,6 @@ xfs_bmap_read_extents( * Copy records into the extent records. */ frp = XFS_BMBT_REC_ADDR(mp, block, 1); - start = i; for (j = 0; j < num_recs; j++, i++, frp++) { xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i); trp->l0 = be64_to_cpu(frp->l0); -- cgit v1.2.3 From 892d2a5f705723b2cb488bfb38bcbdcf83273184 Mon Sep 17 00:00:00 2001 From: Zorro Lang Date: Mon, 15 May 2017 08:40:02 -0700 Subject: xfs: bad assertion for delalloc an extent that start at i_size By run fsstress long enough time enough in RHEL-7, I find an assertion failure (harder to reproduce on linux-4.11, but problem is still there): XFS: Assertion failed: (iflags & BMV_IF_DELALLOC) != 0, file: fs/xfs/xfs_bmap_util.c The assertion is in xfs_getbmap() funciton: if (map[i].br_startblock == DELAYSTARTBLOCK && --> map[i].br_startoff <= XFS_B_TO_FSB(mp, XFS_ISIZE(ip))) ASSERT((iflags & BMV_IF_DELALLOC) != 0); When map[i].br_startoff == XFS_B_TO_FSB(mp, XFS_ISIZE(ip)), the startoff is just at EOF. But we only need to make sure delalloc extents that are within EOF, not include EOF. Signed-off-by: Zorro Lang Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_bmap_util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 2e8851ee6759..9e3cc2146d5b 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -716,7 +716,7 @@ xfs_getbmap( * extents. */ if (map[i].br_startblock == DELAYSTARTBLOCK && - map[i].br_startoff <= XFS_B_TO_FSB(mp, XFS_ISIZE(ip))) + map[i].br_startoff < XFS_B_TO_FSB(mp, XFS_ISIZE(ip))) ASSERT((iflags & BMV_IF_DELALLOC) != 0); if (map[i].br_startblock == HOLESTARTBLOCK && -- cgit v1.2.3 From ea9a46e1c49251331dbfda19ced7114337966178 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 12 May 2017 10:44:10 -0700 Subject: xfs: only return detailed fsmap info if the caller has CAP_SYS_ADMIN There were a number of handwaving complaints that one could "possibly" use inode numbers and extent maps to fingerprint a filesystem hosting multiple containers and somehow use the information to guess at the contents of other containers and attack them. Despite the total lack of any demonstration that this is actually possible, it's easier to restrict access now and broaden it later, so use the rmapbt fsmap backends only if the caller has CAP_SYS_ADMIN. Unprivileged users will just have to make do with only getting the free space and static metadata placement information. Signed-off-by: Darrick J. Wong Reviewed-by: Carlos Maiolino --- fs/xfs/xfs_fsmap.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 3683819887a5..814ed729881d 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -828,6 +828,7 @@ xfs_getfsmap( struct xfs_fsmap dkeys[2]; /* per-dev keys */ struct xfs_getfsmap_dev handlers[XFS_GETFSMAP_DEVS]; struct xfs_getfsmap_info info = { NULL }; + bool use_rmap; int i; int error = 0; @@ -837,12 +838,14 @@ xfs_getfsmap( !xfs_getfsmap_is_valid_device(mp, &head->fmh_keys[1])) return -EINVAL; + use_rmap = capable(CAP_SYS_ADMIN) && + xfs_sb_version_hasrmapbt(&mp->m_sb); head->fmh_entries = 0; /* Set up our device handlers. */ memset(handlers, 0, sizeof(handlers)); handlers[0].dev = new_encode_dev(mp->m_ddev_targp->bt_dev); - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + if (use_rmap) handlers[0].fn = xfs_getfsmap_datadev_rmapbt; else handlers[0].fn = xfs_getfsmap_datadev_bnobt; -- cgit v1.2.3 From 69c8ebf83213e6165b13d94ec599b861467ee2dc Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 16 May 2017 12:22:22 +0200 Subject: fuseblk: Fix warning in super_setup_bdi_name() Commit 5f7f7543f52e "fuse: Convert to separately allocated bdi" didn't properly handle fuseblk filesystem. When fuse_bdi_init() is called for that filesystem type, sb->s_bdi is already initialized (by set_bdev_super()) to point to block device's bdi and consequently super_setup_bdi_name() complains about this fact when reseting bdi to the private one. Fix the problem by properly dropping bdi reference in fuse_bdi_init() before creating a private bdi in super_setup_bdi_name(). Fixes: 5f7f7543f52e ("fuse: Convert to separately allocated bdi") Reported-by: Rakesh Pandit Tested-by: Rakesh Pandit Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- fs/fuse/inode.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 73cf05135252..9da1a61276d1 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -972,8 +972,15 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) int err; char *suffix = ""; - if (sb->s_bdev) + if (sb->s_bdev) { suffix = "-fuseblk"; + /* + * sb->s_bdi points to blkdev's bdi however we want to redirect + * it to our private bdi... + */ + bdi_put(sb->s_bdi); + sb->s_bdi = &noop_backing_dev_info; + } err = super_setup_bdi_name(sb, "%u:%u%s", MAJOR(fc->dev), MINOR(fc->dev), suffix); if (err) -- cgit v1.2.3 From 3ecb3ac7b950ff8f6c6a61e8b7b0d6e3546429a0 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 15 May 2017 19:16:15 -0700 Subject: xfs: avoid mount-time deadlock in CoW extent recovery If a malicious user corrupts the refcount btree to cause a cycle between different levels of the tree, the next mount attempt will deadlock in the CoW recovery routine while grabbing buffer locks. We can use the ability to re-grab a buffer that was previous locked to a transaction to avoid deadlocks, so do that here. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/libxfs/xfs_refcount.c | 43 +++++++++++++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index b177ef33cd4c..82a38d86ebad 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1629,13 +1629,28 @@ xfs_refcount_recover_cow_leftovers( if (mp->m_sb.sb_agblocks >= XFS_REFC_COW_START) return -EOPNOTSUPP; - error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); + INIT_LIST_HEAD(&debris); + + /* + * In this first part, we use an empty transaction to gather up + * all the leftover CoW extents so that we can subsequently + * delete them. The empty transaction is used to avoid + * a buffer lock deadlock if there happens to be a loop in the + * refcountbt because we're allowed to re-grab a buffer that is + * already attached to our transaction. When we're done + * recording the CoW debris we cancel the (empty) transaction + * and everything goes away cleanly. + */ + error = xfs_trans_alloc_empty(mp, &tp); if (error) return error; - cur = xfs_refcountbt_init_cursor(mp, NULL, agbp, agno, NULL); + + error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp); + if (error) + goto out_trans; + cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno, NULL); /* Find all the leftover CoW staging extents. */ - INIT_LIST_HEAD(&debris); memset(&low, 0, sizeof(low)); memset(&high, 0, sizeof(high)); low.rc.rc_startblock = XFS_REFC_COW_START; @@ -1645,10 +1660,11 @@ xfs_refcount_recover_cow_leftovers( if (error) goto out_cursor; xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); - xfs_buf_relse(agbp); + xfs_trans_brelse(tp, agbp); + xfs_trans_cancel(tp); /* Now iterate the list to free the leftovers */ - list_for_each_entry(rr, &debris, rr_list) { + list_for_each_entry_safe(rr, n, &debris, rr_list) { /* Set up transaction. */ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp); if (error) @@ -1676,8 +1692,16 @@ xfs_refcount_recover_cow_leftovers( error = xfs_trans_commit(tp); if (error) goto out_free; + + list_del(&rr->rr_list); + kmem_free(rr); } + return error; +out_defer: + xfs_defer_cancel(&dfops); +out_trans: + xfs_trans_cancel(tp); out_free: /* Free the leftover list */ list_for_each_entry_safe(rr, n, &debris, rr_list) { @@ -1688,11 +1712,6 @@ out_free: out_cursor: xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); - xfs_buf_relse(agbp); - goto out_free; - -out_defer: - xfs_defer_cancel(&dfops); - xfs_trans_cancel(tp); - goto out_free; + xfs_trans_brelse(tp, agbp); + goto out_trans; } -- cgit v1.2.3 From 42c99fc4c7069371da7b04b9099319dd1c633ee2 Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Fri, 5 May 2017 18:28:44 +0100 Subject: ceph: check that the new inode size is within limits in ceph_fallocate() Currently the ceph client doesn't respect the rlimit in fallocate. This means that a user can allocate a file with size > RLIMIT_FSIZE. This patch adds the call to inode_newsize_ok() to verify filesystem limits and ulimits. This should make ceph successfully run xfstest generic/228. Signed-off-by: Luis Henriques Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/file.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 3fdde0b283c9..29308a80d66f 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1671,8 +1671,12 @@ static long ceph_fallocate(struct file *file, int mode, } size = i_size_read(inode); - if (!(mode & FALLOC_FL_KEEP_SIZE)) + if (!(mode & FALLOC_FL_KEEP_SIZE)) { endoff = offset + length; + ret = inode_newsize_ok(inode, endoff); + if (ret) + goto unlock; + } if (fi->fmode & CEPH_FILE_MODE_LAZY) want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; -- cgit v1.2.3 From a4d768e702de224cc85e0c8eac9311763403b368 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 22 May 2017 19:54:10 -0700 Subject: xfs: fix unaligned access in xfs_btree_visit_blocks This structure copy was throwing unaligned access warnings on sparc64: Kernel unaligned access at TPC[1043c088] xfs_btree_visit_blocks+0x88/0xe0 [xfs] xfs_btree_copy_ptrs does a memcpy, which avoids it. Signed-off-by: Eric Sandeen Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_btree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 5392674bf893..3a673ba201aa 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -4395,7 +4395,7 @@ xfs_btree_visit_blocks( xfs_btree_readahead_ptr(cur, ptr, 1); /* save for the next iteration of the loop */ - lptr = *ptr; + xfs_btree_copy_ptrs(cur, &lptr, ptr, 1); } /* for each buffer in the level */ -- cgit v1.2.3 From 8affebe16d79ebefb1d9d6d56a46dc89716f9453 Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Tue, 23 May 2017 08:30:46 -0700 Subject: xfs: fix off-by-one on max nr_pages in xfs_find_get_desired_pgoff() xfs_find_get_desired_pgoff() is used to search for offset of hole or data in page range [index, end] (both inclusive), and the max number of pages to search should be at least one, if end == index. Otherwise the only page is missed and no hole or data is found, which is not correct. When block size is smaller than page size, this can be demonstrated by preallocating a file with size smaller than page size and writing data to the last block. E.g. run this xfs_io command on a 1k block size XFS on x86_64 host. # xfs_io -fc "falloc 0 3k" -c "pwrite 2k 1k" \ -c "seek -d 0" /mnt/xfs/testfile wrote 1024/1024 bytes at offset 2048 1 KiB, 1 ops; 0.0000 sec (33.675 MiB/sec and 34482.7586 ops/sec) Whence Result DATA EOF Data at offset 2k was missed, and lseek(2) returned ENXIO. This is uncovered by generic/285 subtest 07 and 08 on ppc64 host, where pagesize is 64k. Because a recent change to generic/285 reduced the preallocated file size to smaller than 64k. Cc: stable@vger.kernel.org # v3.7+ Signed-off-by: Eryu Guan Reviewed-by: Jan Kara Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 35703a801372..aefa2134a8cb 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1049,7 +1049,7 @@ xfs_find_get_desired_pgoff( unsigned nr_pages; unsigned int i; - want = min_t(pgoff_t, end - index, PAGEVEC_SIZE); + want = min_t(pgoff_t, end - index, PAGEVEC_SIZE - 1) + 1; nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, want); /* -- cgit v1.2.3 From 5375023ae1266553a7baa0845e82917d8803f48c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 18 May 2017 16:36:22 -0700 Subject: xfs: Fix missed holes in SEEK_HOLE implementation XFS SEEK_HOLE implementation could miss a hole in an unwritten extent as can be seen by the following command: xfs_io -c "falloc 0 256k" -c "pwrite 0 56k" -c "pwrite 128k 8k" -c "seek -h 0" file wrote 57344/57344 bytes at offset 0 56 KiB, 14 ops; 0.0000 sec (49.312 MiB/sec and 12623.9856 ops/sec) wrote 8192/8192 bytes at offset 131072 8 KiB, 2 ops; 0.0000 sec (70.383 MiB/sec and 18018.0180 ops/sec) Whence Result HOLE 139264 Where we can see that hole at offset 56k was just ignored by SEEK_HOLE implementation. The bug is in xfs_find_get_desired_pgoff() which does not properly detect the case when pages are not contiguous. Fix the problem by properly detecting when found page has larger offset than expected. CC: stable@vger.kernel.org Fixes: d126d43f631f996daeee5006714fed914be32368 Signed-off-by: Jan Kara Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_file.c | 29 +++++++++-------------------- 1 file changed, 9 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index aefa2134a8cb..f1517e9928c7 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1076,17 +1076,6 @@ xfs_find_get_desired_pgoff( break; } - /* - * At lease we found one page. If this is the first time we - * step into the loop, and if the first page index offset is - * greater than the given search offset, a hole was found. - */ - if (type == HOLE_OFF && lastoff == startoff && - lastoff < page_offset(pvec.pages[0])) { - found = true; - break; - } - for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; loff_t b_offset; @@ -1098,18 +1087,18 @@ xfs_find_get_desired_pgoff( * file mapping. However, page->index will not change * because we have a reference on the page. * - * Searching done if the page index is out of range. - * If the current offset is not reaches the end of - * the specified search range, there should be a hole - * between them. + * If current page offset is beyond where we've ended, + * we've found a hole. */ - if (page->index > end) { - if (type == HOLE_OFF && lastoff < endoff) { - *offset = lastoff; - found = true; - } + if (type == HOLE_OFF && lastoff < endoff && + lastoff < page_offset(pvec.pages[i])) { + found = true; + *offset = lastoff; goto out; } + /* Searching done if the page index is out of range. */ + if (page->index > end) + goto out; lock_page(page); /* -- cgit v1.2.3 From d7fd24257aa60316bf81093f7f909dc9475ae974 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 18 May 2017 16:36:23 -0700 Subject: xfs: Fix off-by-in in loop termination in xfs_find_get_desired_pgoff() There is an off-by-one error in loop termination conditions in xfs_find_get_desired_pgoff() since 'end' may index a page beyond end of desired range if 'endoff' is page aligned. It doesn't have any visible effects but still it is good to fix it. Signed-off-by: Jan Kara Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index f1517e9928c7..dc0e4cb7029b 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1043,7 +1043,7 @@ xfs_find_get_desired_pgoff( index = startoff >> PAGE_SHIFT; endoff = XFS_FSB_TO_B(mp, map->br_startoff + map->br_blockcount); - end = endoff >> PAGE_SHIFT; + end = (endoff - 1) >> PAGE_SHIFT; do { int want; unsigned nr_pages; -- cgit v1.2.3 From a54fba8f5a0dc36161cacdf2aa90f007f702ec1a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 18 May 2017 16:36:24 -0700 Subject: xfs: Move handling of missing page into one place in xfs_find_get_desired_pgoff() Currently several places in xfs_find_get_desired_pgoff() handle the case of a missing page. Make them all handled in one place after the loop has terminated. Signed-off-by: Jan Kara Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_file.c | 38 ++++++++------------------------------ 1 file changed, 8 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index dc0e4cb7029b..5fb5a0958a14 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1052,29 +1052,8 @@ xfs_find_get_desired_pgoff( want = min_t(pgoff_t, end - index, PAGEVEC_SIZE - 1) + 1; nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, want); - /* - * No page mapped into given range. If we are searching holes - * and if this is the first time we got into the loop, it means - * that the given offset is landed in a hole, return it. - * - * If we have already stepped through some block buffers to find - * holes but they all contains data. In this case, the last - * offset is already updated and pointed to the end of the last - * mapped page, if it does not reach the endpoint to search, - * that means there should be a hole between them. - */ - if (nr_pages == 0) { - /* Data search found nothing */ - if (type == DATA_OFF) - break; - - ASSERT(type == HOLE_OFF); - if (lastoff == startoff || lastoff < endoff) { - found = true; - *offset = lastoff; - } + if (nr_pages == 0) break; - } for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; @@ -1140,21 +1119,20 @@ xfs_find_get_desired_pgoff( /* * The number of returned pages less than our desired, search - * done. In this case, nothing was found for searching data, - * but we found a hole behind the last offset. + * done. */ - if (nr_pages < want) { - if (type == HOLE_OFF) { - *offset = lastoff; - found = true; - } + if (nr_pages < want) break; - } index = pvec.pages[i - 1]->index + 1; pagevec_release(&pvec); } while (index <= end); + /* No page at lastoff and we are not done - we found a hole. */ + if (type == HOLE_OFF && lastoff < endoff) { + *offset = lastoff; + found = true; + } out: pagevec_release(&pvec); return found; -- cgit v1.2.3 From 80b79dd0e2f29f06a6a54a5755c718f1c7ebb136 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Sat, 27 May 2017 06:07:18 -0400 Subject: fs: locks: Fix some troubles at kernel-doc comments There are a few syntax violations that cause outputs of a few comments to not be properly parsed in ReST format. No functional changes. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jeff Layton --- fs/locks.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index af2031a1fcff..4a4543a7f9c1 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1858,8 +1858,8 @@ EXPORT_SYMBOL(generic_setlease); * * Call this to establish a lease on the file. The "lease" argument is not * used for F_UNLCK requests and may be NULL. For commands that set or alter - * an existing lease, the (*lease)->fl_lmops->lm_break operation must be set; - * if not, this function will return -ENOLCK (and generate a scary-looking + * an existing lease, the ``(*lease)->fl_lmops->lm_break`` operation must be + * set; if not, this function will return -ENOLCK (and generate a scary-looking * stack trace). * * The "priv" pointer is passed directly to the lm_setup function as-is. It @@ -1972,15 +1972,13 @@ EXPORT_SYMBOL(locks_lock_inode_wait); * @cmd: the type of lock to apply. * * Apply a %FL_FLOCK style lock to an open file descriptor. - * The @cmd can be one of + * The @cmd can be one of: * - * %LOCK_SH -- a shared lock. - * - * %LOCK_EX -- an exclusive lock. - * - * %LOCK_UN -- remove an existing lock. - * - * %LOCK_MAND -- a `mandatory' flock. This exists to emulate Windows Share Modes. + * - %LOCK_SH -- a shared lock. + * - %LOCK_EX -- an exclusive lock. + * - %LOCK_UN -- remove an existing lock. + * - %LOCK_MAND -- a 'mandatory' flock. + * This exists to emulate Windows Share Modes. * * %LOCK_MAND can be combined with %LOCK_READ or %LOCK_WRITE to allow other * processes read and write access respectively. -- cgit v1.2.3 From a75d30c772078546ac00399a94ecdc82df1a4d72 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 27 May 2017 06:07:19 -0400 Subject: fs/locks: pass kernel struct flock to fcntl_getlk/setlk This will make it easier to implement a sane compat fcntl syscall. [ jlayton: fix undeclared identifiers in 32-bit fcntl64 syscall handler ] Signed-off-by: Christoph Hellwig Reviewed-by: Jeff Layton Signed-off-by: Jeff Layton --- fs/fcntl.c | 27 +++++++++++++++---- fs/locks.c | 79 +++++++++++++++--------------------------------------- include/linux/fs.h | 8 +++--- 3 files changed, 48 insertions(+), 66 deletions(-) (limited to 'fs') diff --git a/fs/fcntl.c b/fs/fcntl.c index f4e7267d117f..ed4283d500a3 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -246,6 +246,8 @@ static int f_getowner_uids(struct file *filp, unsigned long arg) static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, struct file *filp) { + void __user *argp = (void __user *)arg; + struct flock flock; long err = -EINVAL; switch (cmd) { @@ -273,7 +275,11 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, case F_OFD_GETLK: #endif case F_GETLK: - err = fcntl_getlk(filp, cmd, (struct flock __user *) arg); + if (copy_from_user(&flock, argp, sizeof(flock))) + return -EFAULT; + err = fcntl_getlk(filp, cmd, &flock); + if (!err && copy_to_user(argp, &flock, sizeof(flock))) + return -EFAULT; break; #if BITS_PER_LONG != 32 /* 32-bit arches must use fcntl64() */ @@ -283,7 +289,9 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, /* Fallthrough */ case F_SETLK: case F_SETLKW: - err = fcntl_setlk(fd, filp, cmd, (struct flock __user *) arg); + if (copy_from_user(&flock, argp, sizeof(flock))) + return -EFAULT; + err = fcntl_setlk(fd, filp, cmd, &flock); break; case F_GETOWN: /* @@ -383,7 +391,9 @@ out: SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, unsigned long, arg) { + void __user *argp = (void __user *)arg; struct fd f = fdget_raw(fd); + struct flock64 flock; long err = -EBADF; if (!f.file) @@ -401,14 +411,21 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, switch (cmd) { case F_GETLK64: case F_OFD_GETLK: - err = fcntl_getlk64(f.file, cmd, (struct flock64 __user *) arg); + err = -EFAULT; + if (copy_from_user(&flock, argp, sizeof(flock))) + break; + err = fcntl_getlk64(f.file, cmd, &flock); + if (!err && copy_to_user(argp, &flock, sizeof(flock))) + err = -EFAULT; break; case F_SETLK64: case F_SETLKW64: case F_OFD_SETLK: case F_OFD_SETLKW: - err = fcntl_setlk64(fd, f.file, cmd, - (struct flock64 __user *) arg); + err = -EFAULT; + if (copy_from_user(&flock, argp, sizeof(flock))) + break; + err = fcntl_setlk64(fd, f.file, cmd, &flock); break; default: err = do_fcntl(fd, cmd, arg, f.file); diff --git a/fs/locks.c b/fs/locks.c index 4a4543a7f9c1..afefeb4ad6de 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2084,26 +2084,22 @@ static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl) /* Report the first existing lock that would conflict with l. * This implements the F_GETLK command of fcntl(). */ -int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock __user *l) +int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock *flock) { struct file_lock file_lock; - struct flock flock; int error; - error = -EFAULT; - if (copy_from_user(&flock, l, sizeof(flock))) - goto out; error = -EINVAL; - if ((flock.l_type != F_RDLCK) && (flock.l_type != F_WRLCK)) + if (flock->l_type != F_RDLCK && flock->l_type != F_WRLCK) goto out; - error = flock_to_posix_lock(filp, &file_lock, &flock); + error = flock_to_posix_lock(filp, &file_lock, flock); if (error) goto out; if (cmd == F_OFD_GETLK) { error = -EINVAL; - if (flock.l_pid != 0) + if (flock->l_pid != 0) goto out; cmd = F_GETLK; @@ -2115,15 +2111,12 @@ int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock __user *l) if (error) goto out; - flock.l_type = file_lock.fl_type; + flock->l_type = file_lock.fl_type; if (file_lock.fl_type != F_UNLCK) { - error = posix_lock_to_flock(&flock, &file_lock); + error = posix_lock_to_flock(flock, &file_lock); if (error) goto rel_priv; } - error = -EFAULT; - if (!copy_to_user(l, &flock, sizeof(flock))) - error = 0; rel_priv: locks_release_private(&file_lock); out: @@ -2216,26 +2209,16 @@ check_fmode_for_setlk(struct file_lock *fl) * This implements both the F_SETLK and F_SETLKW commands of fcntl(). */ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, - struct flock __user *l) + struct flock *flock) { struct file_lock *file_lock = locks_alloc_lock(); - struct flock flock; - struct inode *inode; + struct inode *inode = locks_inode(filp); struct file *f; int error; if (file_lock == NULL) return -ENOLCK; - inode = locks_inode(filp); - - /* - * This might block, so we do it before checking the inode. - */ - error = -EFAULT; - if (copy_from_user(&flock, l, sizeof(flock))) - goto out; - /* Don't allow mandatory locks on files that may be memory mapped * and shared. */ @@ -2244,7 +2227,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, goto out; } - error = flock_to_posix_lock(filp, file_lock, &flock); + error = flock_to_posix_lock(filp, file_lock, flock); if (error) goto out; @@ -2259,7 +2242,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, switch (cmd) { case F_OFD_SETLK: error = -EINVAL; - if (flock.l_pid != 0) + if (flock->l_pid != 0) goto out; cmd = F_SETLK; @@ -2268,7 +2251,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, break; case F_OFD_SETLKW: error = -EINVAL; - if (flock.l_pid != 0) + if (flock->l_pid != 0) goto out; cmd = F_SETLKW; @@ -2313,26 +2296,22 @@ out: /* Report the first existing lock that would conflict with l. * This implements the F_GETLK command of fcntl(). */ -int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 __user *l) +int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 *flock) { struct file_lock file_lock; - struct flock64 flock; int error; - error = -EFAULT; - if (copy_from_user(&flock, l, sizeof(flock))) - goto out; error = -EINVAL; - if ((flock.l_type != F_RDLCK) && (flock.l_type != F_WRLCK)) + if (flock->l_type != F_RDLCK && flock->l_type != F_WRLCK) goto out; - error = flock64_to_posix_lock(filp, &file_lock, &flock); + error = flock64_to_posix_lock(filp, &file_lock, flock); if (error) goto out; if (cmd == F_OFD_GETLK) { error = -EINVAL; - if (flock.l_pid != 0) + if (flock->l_pid != 0) goto out; cmd = F_GETLK64; @@ -2344,13 +2323,9 @@ int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 __user *l) if (error) goto out; - flock.l_type = file_lock.fl_type; + flock->l_type = file_lock.fl_type; if (file_lock.fl_type != F_UNLCK) - posix_lock_to_flock64(&flock, &file_lock); - - error = -EFAULT; - if (!copy_to_user(l, &flock, sizeof(flock))) - error = 0; + posix_lock_to_flock64(flock, &file_lock); locks_release_private(&file_lock); out: @@ -2361,26 +2336,16 @@ out: * This implements both the F_SETLK and F_SETLKW commands of fcntl(). */ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, - struct flock64 __user *l) + struct flock64 *flock) { struct file_lock *file_lock = locks_alloc_lock(); - struct flock64 flock; - struct inode *inode; + struct inode *inode = locks_inode(filp); struct file *f; int error; if (file_lock == NULL) return -ENOLCK; - /* - * This might block, so we do it before checking the inode. - */ - error = -EFAULT; - if (copy_from_user(&flock, l, sizeof(flock))) - goto out; - - inode = locks_inode(filp); - /* Don't allow mandatory locks on files that may be memory mapped * and shared. */ @@ -2389,7 +2354,7 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, goto out; } - error = flock64_to_posix_lock(filp, file_lock, &flock); + error = flock64_to_posix_lock(filp, file_lock, flock); if (error) goto out; @@ -2404,7 +2369,7 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, switch (cmd) { case F_OFD_SETLK: error = -EINVAL; - if (flock.l_pid != 0) + if (flock->l_pid != 0) goto out; cmd = F_SETLK64; @@ -2413,7 +2378,7 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, break; case F_OFD_SETLKW: error = -EINVAL; - if (flock.l_pid != 0) + if (flock->l_pid != 0) goto out; cmd = F_SETLKW64; diff --git a/include/linux/fs.h b/include/linux/fs.h index 803e5a9b2654..aa4affb38c39 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1038,14 +1038,14 @@ static inline struct inode *locks_inode(const struct file *f) } #ifdef CONFIG_FILE_LOCKING -extern int fcntl_getlk(struct file *, unsigned int, struct flock __user *); +extern int fcntl_getlk(struct file *, unsigned int, struct flock *); extern int fcntl_setlk(unsigned int, struct file *, unsigned int, - struct flock __user *); + struct flock *); #if BITS_PER_LONG == 32 -extern int fcntl_getlk64(struct file *, unsigned int, struct flock64 __user *); +extern int fcntl_getlk64(struct file *, unsigned int, struct flock64 *); extern int fcntl_setlk64(unsigned int, struct file *, unsigned int, - struct flock64 __user *); + struct flock64 *); #endif extern int fcntl_setlease(unsigned int fd, struct file *filp, long arg); -- cgit v1.2.3 From 94073ad77fff221b5e66b8b9863a546ba212d6a3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 27 May 2017 06:07:20 -0400 Subject: fs/locks: don't mess with the address limit in compat_fcntl64 Instead write a proper compat syscall that calls common helpers. [ jlayton: fix pointer dereferencing in fixup_compat_flock ] Signed-off-by: Christoph Hellwig Reviewed-by: Jeff Layton Signed-off-by: Jeff Layton --- fs/fcntl.c | 118 +++++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 67 insertions(+), 51 deletions(-) (limited to 'fs') diff --git a/fs/fcntl.c b/fs/fcntl.c index ed4283d500a3..bbf80344c125 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -506,76 +506,92 @@ convert_fcntl_cmd(unsigned int cmd) return cmd; } +/* + * GETLK was successful and we need to return the data, but it needs to fit in + * the compat structure. + * l_start shouldn't be too big, unless the original start + end is greater than + * COMPAT_OFF_T_MAX, in which case the app was asking for trouble, so we return + * -EOVERFLOW in that case. l_len could be too big, in which case we just + * truncate it, and only allow the app to see that part of the conflicting lock + * that might make sense to it anyway + */ +static int fixup_compat_flock(struct flock *flock) +{ + if (flock->l_start > COMPAT_OFF_T_MAX) + return -EOVERFLOW; + if (flock->l_len > COMPAT_OFF_T_MAX) + flock->l_len = COMPAT_OFF_T_MAX; + return 0; +} + COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, compat_ulong_t, arg) { - mm_segment_t old_fs; - struct flock f; - long ret; - unsigned int conv_cmd; + struct fd f = fdget_raw(fd); + struct flock flock; + long err = -EBADF; + + if (!f.file) + return err; + + if (unlikely(f.file->f_mode & FMODE_PATH)) { + if (!check_fcntl_cmd(cmd)) + goto out_put; + } + + err = security_file_fcntl(f.file, cmd, arg); + if (err) + goto out_put; switch (cmd) { case F_GETLK: + err = get_compat_flock(&flock, compat_ptr(arg)); + if (err) + break; + err = fcntl_getlk(f.file, convert_fcntl_cmd(cmd), &flock); + if (err) + break; + err = fixup_compat_flock(&flock); + if (err) + return err; + err = put_compat_flock(&flock, compat_ptr(arg)); + break; + case F_GETLK64: + case F_OFD_GETLK: + err = get_compat_flock64(&flock, compat_ptr(arg)); + if (err) + break; + err = fcntl_getlk(f.file, convert_fcntl_cmd(cmd), &flock); + if (err) + break; + err = fixup_compat_flock(&flock); + if (err) + return err; + err = put_compat_flock64(&flock, compat_ptr(arg)); + break; case F_SETLK: case F_SETLKW: - ret = get_compat_flock(&f, compat_ptr(arg)); - if (ret != 0) + err = get_compat_flock(&flock, compat_ptr(arg)); + if (err) break; - old_fs = get_fs(); - set_fs(KERNEL_DS); - ret = sys_fcntl(fd, cmd, (unsigned long)&f); - set_fs(old_fs); - if (cmd == F_GETLK && ret == 0) { - /* GETLK was successful and we need to return the data... - * but it needs to fit in the compat structure. - * l_start shouldn't be too big, unless the original - * start + end is greater than COMPAT_OFF_T_MAX, in which - * case the app was asking for trouble, so we return - * -EOVERFLOW in that case. - * l_len could be too big, in which case we just truncate it, - * and only allow the app to see that part of the conflicting - * lock that might make sense to it anyway - */ - - if (f.l_start > COMPAT_OFF_T_MAX) - ret = -EOVERFLOW; - if (f.l_len > COMPAT_OFF_T_MAX) - f.l_len = COMPAT_OFF_T_MAX; - if (ret == 0) - ret = put_compat_flock(&f, compat_ptr(arg)); - } + err = fcntl_setlk(fd, f.file, convert_fcntl_cmd(cmd), &flock); break; - - case F_GETLK64: case F_SETLK64: case F_SETLKW64: - case F_OFD_GETLK: case F_OFD_SETLK: case F_OFD_SETLKW: - ret = get_compat_flock64(&f, compat_ptr(arg)); - if (ret != 0) + err = get_compat_flock64(&flock, compat_ptr(arg)); + if (err) break; - old_fs = get_fs(); - set_fs(KERNEL_DS); - conv_cmd = convert_fcntl_cmd(cmd); - ret = sys_fcntl(fd, conv_cmd, (unsigned long)&f); - set_fs(old_fs); - if ((conv_cmd == F_GETLK || conv_cmd == F_OFD_GETLK) && ret == 0) { - /* need to return lock information - see above for commentary */ - if (f.l_start > COMPAT_LOFF_T_MAX) - ret = -EOVERFLOW; - if (f.l_len > COMPAT_LOFF_T_MAX) - f.l_len = COMPAT_LOFF_T_MAX; - if (ret == 0) - ret = put_compat_flock64(&f, compat_ptr(arg)); - } + err = fcntl_setlk(fd, f.file, convert_fcntl_cmd(cmd), &flock); break; - default: - ret = sys_fcntl(fd, cmd, arg); + err = do_fcntl(fd, cmd, arg, f.file); break; } - return ret; +out_put: + fdput(f); + return err; } COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, -- cgit v1.2.3 From 393cc3f51135ea2520521f776ef3afdf3395c797 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Tue, 13 Jun 2017 13:35:50 +0200 Subject: fs/fcntl: f_setown, allow returning error Allow f_setown to return an error value. We will fail in the next patch with EINVAL for bad input to f_setown, so tile the path for the later patch. Signed-off-by: Jiri Slaby Reviewed-by: Jeff Layton Cc: Jeff Layton Cc: "J. Bruce Fields" Cc: Alexander Viro Cc: linux-fsdevel@vger.kernel.org Signed-off-by: Jeff Layton --- fs/fcntl.c | 7 ++++--- include/linux/fs.h | 2 +- net/socket.c | 3 +-- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/fcntl.c b/fs/fcntl.c index bbf80344c125..313eba860346 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -109,7 +109,7 @@ void __f_setown(struct file *filp, struct pid *pid, enum pid_type type, } EXPORT_SYMBOL(__f_setown); -void f_setown(struct file *filp, unsigned long arg, int force) +int f_setown(struct file *filp, unsigned long arg, int force) { enum pid_type type; struct pid *pid; @@ -123,6 +123,8 @@ void f_setown(struct file *filp, unsigned long arg, int force) pid = find_vpid(who); __f_setown(filp, pid, type, force); rcu_read_unlock(); + + return 0; } EXPORT_SYMBOL(f_setown); @@ -305,8 +307,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, force_successful_syscall_return(); break; case F_SETOWN: - f_setown(filp, arg, 1); - err = 0; + err = f_setown(filp, arg, 1); break; case F_GETOWN_EX: err = f_getown_ex(filp, arg); diff --git a/include/linux/fs.h b/include/linux/fs.h index aa4affb38c39..25ee1ff6d45b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1249,7 +1249,7 @@ extern void fasync_free(struct fasync_struct *); extern void kill_fasync(struct fasync_struct **, int, int); extern void __f_setown(struct file *filp, struct pid *, enum pid_type, int force); -extern void f_setown(struct file *filp, unsigned long arg, int force); +extern int f_setown(struct file *filp, unsigned long arg, int force); extern void f_delown(struct file *filp); extern pid_t f_getown(struct file *filp); extern int send_sigurg(struct fown_struct *fown); diff --git a/net/socket.c b/net/socket.c index c2564eb25c6b..a30a1e324390 100644 --- a/net/socket.c +++ b/net/socket.c @@ -950,8 +950,7 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) err = -EFAULT; if (get_user(pid, (int __user *)argp)) break; - f_setown(sock->file, pid, 1); - err = 0; + err = f_setown(sock->file, pid, 1); break; case FIOGETOWN: case SIOCGPGRP: -- cgit v1.2.3 From fc3dc67471461c0efcb1ed22fb7595121d65fad9 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Tue, 13 Jun 2017 13:35:51 +0200 Subject: fs/fcntl: f_setown, avoid undefined behaviour fcntl(0, F_SETOWN, 0x80000000) triggers: UBSAN: Undefined behaviour in fs/fcntl.c:118:7 negation of -2147483648 cannot be represented in type 'int': CPU: 1 PID: 18261 Comm: syz-executor Not tainted 4.8.1-0-syzkaller #1 ... Call Trace: ... [] ? f_setown+0x1d8/0x200 [] ? SyS_fcntl+0x999/0xf30 [] ? entry_SYSCALL_64_fastpath+0x23/0xc1 Fix that by checking the arg parameter properly (against INT_MAX) before "who = -who". And return immediatelly with -EINVAL in case it is wrong. Note that according to POSIX we can return EINVAL: http://pubs.opengroup.org/onlinepubs/9699919799/functions/fcntl.html [EINVAL] The cmd argument is F_SETOWN and the value of the argument is not valid as a process or process group identifier. [v2] returns an error, v1 used to fail silently [v3] implement proper check for the bad value INT_MIN Signed-off-by: Jiri Slaby Cc: Jeff Layton Cc: "J. Bruce Fields" Cc: Alexander Viro Cc: linux-fsdevel@vger.kernel.org Signed-off-by: Jeff Layton --- fs/fcntl.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/fcntl.c b/fs/fcntl.c index 313eba860346..693322e28751 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -116,6 +116,10 @@ int f_setown(struct file *filp, unsigned long arg, int force) int who = arg; type = PIDTYPE_PID; if (who < 0) { + /* avoid overflow below */ + if (who == INT_MIN) + return -EINVAL; + type = PIDTYPE_PGID; who = -who; } -- cgit v1.2.3 From f73127356f344483c82632accda2e72b7e0e5f25 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 14 Jun 2017 09:11:54 -0400 Subject: fs/fcntl: return -ESRCH in f_setown when pid/pgid can't be found The current implementation of F_SETOWN doesn't properly vet the argument passed in and only returns an error if INT_MIN is passed in. If the argument doesn't specify a valid pid/pgid, then we just end up cleaning out the file->f_owner structure. What we really want is to only clean that out only in the case where userland passed in an argument of 0. For anything else, we want to return ESRCH if it doesn't refer to a valid pid. The relevant POSIX spec page is here: http://pubs.opengroup.org/onlinepubs/9699919799/functions/fcntl.html Cc: Jiri Slaby Cc: zhong jiang Signed-off-by: Jeff Layton --- fs/fcntl.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/fcntl.c b/fs/fcntl.c index 693322e28751..afed3b364979 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -112,8 +112,9 @@ EXPORT_SYMBOL(__f_setown); int f_setown(struct file *filp, unsigned long arg, int force) { enum pid_type type; - struct pid *pid; - int who = arg; + struct pid *pid = NULL; + int who = arg, ret = 0; + type = PIDTYPE_PID; if (who < 0) { /* avoid overflow below */ @@ -123,12 +124,19 @@ int f_setown(struct file *filp, unsigned long arg, int force) type = PIDTYPE_PGID; who = -who; } + rcu_read_lock(); - pid = find_vpid(who); - __f_setown(filp, pid, type, force); + if (who) { + pid = find_vpid(who); + if (!pid) + ret = -ESRCH; + } + + if (!ret) + __f_setown(filp, pid, type, force); rcu_read_unlock(); - return 0; + return ret; } EXPORT_SYMBOL(f_setown); -- cgit v1.2.3