From e2690695ce5085677b84fbf2e38d2ed57cad39cd Mon Sep 17 00:00:00 2001 From: Peter Huewe Date: Sun, 15 Apr 2012 02:20:27 +0200 Subject: fuse: Convert to kstrtoul_from_user This patch replaces the code for getting an number from a userspace buffer by a simple call to kstroul_from_user. This makes it easier to read and less error prone. Signed-off-by: Peter Huewe Signed-off-by: Miklos Szeredi --- fs/fuse/control.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 42593c587d48..03ff5b1eba93 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -75,19 +75,13 @@ static ssize_t fuse_conn_limit_write(struct file *file, const char __user *buf, unsigned global_limit) { unsigned long t; - char tmp[32]; unsigned limit = (1 << 16) - 1; int err; - if (*ppos || count >= sizeof(tmp) - 1) - return -EINVAL; - - if (copy_from_user(tmp, buf, count)) + if (*ppos) return -EINVAL; - tmp[count] = '\0'; - - err = strict_strtoul(tmp, 0, &t); + err = kstrtoul_from_user(buf, count, 0, &t); if (err) return err; -- cgit v1.2.3 From 05ba1f0823004e947748523782e9c2f07f3bff0d Mon Sep 17 00:00:00 2001 From: Anatol Pomozov Date: Sun, 22 Apr 2012 18:45:24 -0700 Subject: fuse: add FALLOCATE operation fallocate filesystem operation preallocates media space for the given file. If fallocate returns success then any subsequent write to the given range never fails with 'not enough space' error. Signed-off-by: Anatol Pomozov Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 33 +++++++++++++++++++++++++++++++++ include/linux/fuse.h | 14 +++++++++++++- 2 files changed, 46 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 504e61b7fd75..e3fee88831d4 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2171,6 +2171,37 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, return ret; } +long fuse_file_fallocate(struct file *file, int mode, loff_t offset, + loff_t length) +{ + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = ff->fc; + struct fuse_req *req; + struct fuse_fallocate_in inarg = { + .fh = ff->fh, + .offset = offset, + .length = length, + .mode = mode + }; + int err; + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + req->in.h.opcode = FUSE_FALLOCATE; + req->in.h.nodeid = ff->nodeid; + req->in.numargs = 1; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + + return err; +} +EXPORT_SYMBOL_GPL(fuse_file_fallocate); + static const struct file_operations fuse_file_operations = { .llseek = fuse_file_llseek, .read = do_sync_read, @@ -2188,6 +2219,7 @@ static const struct file_operations fuse_file_operations = { .unlocked_ioctl = fuse_file_ioctl, .compat_ioctl = fuse_file_compat_ioctl, .poll = fuse_file_poll, + .fallocate = fuse_file_fallocate, }; static const struct file_operations fuse_direct_io_file_operations = { @@ -2204,6 +2236,7 @@ static const struct file_operations fuse_direct_io_file_operations = { .unlocked_ioctl = fuse_file_ioctl, .compat_ioctl = fuse_file_compat_ioctl, .poll = fuse_file_poll, + .fallocate = fuse_file_fallocate, /* no splice_read */ }; diff --git a/include/linux/fuse.h b/include/linux/fuse.h index 8f2ab8fef929..9303348965fb 100644 --- a/include/linux/fuse.h +++ b/include/linux/fuse.h @@ -54,6 +54,9 @@ * 7.18 * - add FUSE_IOCTL_DIR flag * - add FUSE_NOTIFY_DELETE + * + * 7.19 + * - add FUSE_FALLOCATE 
*/ #ifndef _LINUX_FUSE_H @@ -85,7 +88,7 @@ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 18 +#define FUSE_KERNEL_MINOR_VERSION 19 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 @@ -278,6 +281,7 @@ enum fuse_opcode { FUSE_POLL = 40, FUSE_NOTIFY_REPLY = 41, FUSE_BATCH_FORGET = 42, + FUSE_FALLOCATE = 43, /* CUSE specific operations */ CUSE_INIT = 4096, @@ -571,6 +575,14 @@ struct fuse_notify_poll_wakeup_out { __u64 kh; }; +struct fuse_fallocate_in { + __u64 fh; + __u64 offset; + __u64 length; + __u32 mode; + __u32 padding; +}; + struct fuse_in_header { __u32 len; __u32 opcode; -- cgit v1.2.3 From 519c6040ce04474bc893774f866fd8d907b20429 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 26 Apr 2012 10:56:36 +0200 Subject: fuse: optimize fallocate on permanent failure If userspace filesystem doesn't support fallocate, remember this and don't send request next time. Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 7 +++++++ fs/fuse/fuse_i.h | 3 +++ 2 files changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index e3fee88831d4..bbfd571b37e1 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2185,6 +2185,9 @@ long fuse_file_fallocate(struct file *file, int mode, loff_t offset, }; int err; + if (fc->no_fallocate) + return -EOPNOTSUPP; + req = fuse_get_req(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -2196,6 +2199,10 @@ long fuse_file_fallocate(struct file *file, int mode, loff_t offset, req->in.args[0].value = &inarg; fuse_request_send(fc, req); err = req->out.h.error; + if (err == -ENOSYS) { + fc->no_fallocate = 1; + err = -EOPNOTSUPP; + } fuse_put_request(fc, req); return err; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 572cefc78012..f38fb795f03c 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -478,6 +478,9 @@ struct fuse_conn { /** Are BSD file locking primitives not implemented by fs? */ unsigned no_flock:1; + /** Is fallocate not implemented by fs? */ + unsigned no_fallocate:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; -- cgit v1.2.3 From 45c72cd73c788dd18c8113d4a404d6b4a01decf1 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Thu, 10 May 2012 19:49:38 +0400 Subject: fuse: fix stat call on 32 bit platforms Now we store attr->ino at inode->i_ino, return attr->ino at the first time and then return inode->i_ino if the attribute timeout isn't expired. That's wrong on 32 bit platforms because attr->ino is 64 bit and inode->i_ino is 32 bit in this case. Fix this by saving 64 bit ino in fuse_inode structure and returning it every time we call getattr. Also squash attr->ino into inode->i_ino explicitly. 
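As an illustration (not part of the patch), a minimal user-space model of the
squashing scheme may help. The fold mirrors the fuse_squash_ino() helper added
below; the sample inode number is made up:

    #include <stdint.h>
    #include <stdio.h>

    /*
     * Stand-alone model: when ino_t is only 32 bits wide, XOR the high
     * half of the 64-bit FUSE inode number into the low half so that
     * truncation is less likely to map distinct inode numbers to the
     * same value.  The kernel applies the fold only when
     * sizeof(ino_t) < sizeof(u64).
     */
    static uint32_t squash_ino(uint64_t ino64)
    {
            return (uint32_t)ino64 ^ (uint32_t)(ino64 >> 32);
    }

    int main(void)
    {
            uint64_t sample = 0x123456780000002aULL;    /* made-up value */

            printf("0x%llx -> 0x%x\n", (unsigned long long)sample,
                   (unsigned)squash_ino(sample));
            return 0;
    }

The full 64-bit value is still saved in the new orig_ino field and reported by
getattr; only inode->i_ino receives the squashed value.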
Signed-off-by: Pavel Shilovsky Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 1 + fs/fuse/fuse_i.h | 3 +++ fs/fuse/inode.c | 17 ++++++++++++++++- 3 files changed, 20 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index df5ac048dc74..bc438320cac5 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -863,6 +863,7 @@ int fuse_update_attributes(struct inode *inode, struct kstat *stat, if (stat) { generic_fillattr(inode, stat); stat->mode = fi->orig_i_mode; + stat->ino = fi->orig_ino; } } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index f38fb795f03c..771fb6322c07 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -82,6 +82,9 @@ struct fuse_inode { preserve the original mode */ umode_t orig_i_mode; + /** 64 bit inode number */ + u64 orig_ino; + /** Version of last attribute change */ u64 attr_version; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 26783eb2b1fc..a59cf5e673d7 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -91,6 +91,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) fi->nlookup = 0; fi->attr_version = 0; fi->writectr = 0; + fi->orig_ino = 0; INIT_LIST_HEAD(&fi->write_files); INIT_LIST_HEAD(&fi->queued_writes); INIT_LIST_HEAD(&fi->writepages); @@ -139,6 +140,18 @@ static int fuse_remount_fs(struct super_block *sb, int *flags, char *data) return 0; } +/* + * ino_t is 32-bits on 32-bit arch. We have to squash the 64-bit value down + * so that it will fit. + */ +static ino_t fuse_squash_ino(u64 ino64) +{ + ino_t ino = (ino_t) ino64; + if (sizeof(ino_t) < sizeof(u64)) + ino ^= ino64 >> (sizeof(u64) - sizeof(ino_t)) * 8; + return ino; +} + void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, u64 attr_valid) { @@ -148,7 +161,7 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, fi->attr_version = ++fc->attr_version; fi->i_time = attr_valid; - inode->i_ino = attr->ino; + inode->i_ino = fuse_squash_ino(attr->ino); inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); set_nlink(inode, attr->nlink); inode->i_uid = attr->uid; @@ -174,6 +187,8 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, fi->orig_i_mode = inode->i_mode; if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS)) inode->i_mode &= ~S_ISVTX; + + fi->orig_ino = attr->ino; } void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, -- cgit v1.2.3 From 203627bbc90377c509e32450c67c5d957ba2d989 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 10 May 2012 19:49:38 +0400 Subject: fuse: fix blksize calculation Don't use inode->i_blkbits which might be stale, instead calculate the blksize information from the freshly obtained attributes. 
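For context, a stand-alone sketch of the selection logic (illustrative names
only; the kernel code below uses ilog2() and the superblock's
s_blocksize_bits directly):

    #include <stdint.h>
    #include <stdio.h>

    /*
     * Illustrative only: prefer the block size reported in the GETATTR
     * reply, falling back to the superblock block size when the
     * filesystem reports 0.  st_blksize is then 1 << blkbits.
     */
    static unsigned int pick_blkbits(uint32_t attr_blksize,
                                     unsigned int sb_blocksize_bits)
    {
            unsigned int blkbits = sb_blocksize_bits;

            if (attr_blksize != 0) {
                    blkbits = 0;
                    while (blkbits + 1 < 32 &&
                           (1u << (blkbits + 1)) <= attr_blksize)
                            blkbits++;      /* open-coded ilog2() */
            }
            return blkbits;
    }

    int main(void)
    {
            printf("%u\n", 1u << pick_blkbits(4096, 12));   /* prints 4096 */
            return 0;
    }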
Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index bc438320cac5..334e0b18a014 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -775,6 +775,8 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr, struct kstat *stat) { + unsigned int blkbits; + stat->dev = inode->i_sb->s_dev; stat->ino = attr->ino; stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); @@ -790,7 +792,13 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr, stat->ctime.tv_nsec = attr->ctimensec; stat->size = attr->size; stat->blocks = attr->blocks; - stat->blksize = (1 << inode->i_blkbits); + + if (attr->blksize != 0) + blkbits = ilog2(attr->blksize); + else + blkbits = inode->i_sb->s_blocksize_bits; + + stat->blksize = 1 << blkbits; } static int fuse_do_getattr(struct inode *inode, struct kstat *stat, -- cgit v1.2.3 From 0013fb4ca3171c64a4a5d3851fb591bb575e7f04 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Thu, 31 May 2012 13:03:26 +0400 Subject: CIFS: Fix possible wrong memory allocation when cifs_reconnect sets maxBuf to 0 and we try to calculate a size of memory we need to store locks. Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/file.c | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 253170dfa716..de8abb6f7b56 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -876,7 +876,7 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile) struct cifsLockInfo *li, *tmp; struct cifs_tcon *tcon; struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); - unsigned int num, max_num; + unsigned int num, max_num, max_buf; LOCKING_ANDX_RANGE *buf, *cur; int types[] = {LOCKING_ANDX_LARGE_FILES, LOCKING_ANDX_SHARED_LOCK | LOCKING_ANDX_LARGE_FILES}; @@ -892,8 +892,19 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile) return rc; } - max_num = (tcon->ses->server->maxBuf - sizeof(struct smb_hdr)) / - sizeof(LOCKING_ANDX_RANGE); + /* + * Accessing maxBuf is racy with cifs_reconnect - need to store value + * and check it for zero before using. + */ + max_buf = tcon->ses->server->maxBuf; + if (!max_buf) { + mutex_unlock(&cinode->lock_mutex); + FreeXid(xid); + return -EINVAL; + } + + max_num = (max_buf - sizeof(struct smb_hdr)) / + sizeof(LOCKING_ANDX_RANGE); buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); if (!buf) { mutex_unlock(&cinode->lock_mutex); @@ -1218,7 +1229,7 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, int xid) int types[] = {LOCKING_ANDX_LARGE_FILES, LOCKING_ANDX_SHARED_LOCK | LOCKING_ANDX_LARGE_FILES}; unsigned int i; - unsigned int max_num, num; + unsigned int max_num, num, max_buf; LOCKING_ANDX_RANGE *buf, *cur; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); @@ -1228,8 +1239,16 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, int xid) INIT_LIST_HEAD(&tmp_llist); - max_num = (tcon->ses->server->maxBuf - sizeof(struct smb_hdr)) / - sizeof(LOCKING_ANDX_RANGE); + /* + * Accessing maxBuf is racy with cifs_reconnect - need to store value + * and check it for zero before using. 
+ */ + max_buf = tcon->ses->server->maxBuf; + if (!max_buf) + return -EINVAL; + + max_num = (max_buf - sizeof(struct smb_hdr)) / + sizeof(LOCKING_ANDX_RANGE); buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); if (!buf) return -ENOMEM; -- cgit v1.2.3 From ea319d57d3372a9dbee0b3807d75bb36b8d54adc Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Thu, 31 May 2012 13:59:36 +0400 Subject: CIFS: Improve identation in cifs_unlock_range Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/file.c | 75 +++++++++++++++++++++++++++------------------------------- 1 file changed, 35 insertions(+), 40 deletions(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index de8abb6f7b56..513adbc211d7 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1266,46 +1266,7 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, int xid) continue; if (types[i] != li->type) continue; - if (!cinode->can_cache_brlcks) { - cur->Pid = cpu_to_le16(li->pid); - cur->LengthLow = cpu_to_le32((u32)li->length); - cur->LengthHigh = - cpu_to_le32((u32)(li->length>>32)); - cur->OffsetLow = cpu_to_le32((u32)li->offset); - cur->OffsetHigh = - cpu_to_le32((u32)(li->offset>>32)); - /* - * We need to save a lock here to let us add - * it again to the file's list if the unlock - * range request fails on the server. - */ - list_move(&li->llist, &tmp_llist); - if (++num == max_num) { - stored_rc = cifs_lockv(xid, tcon, - cfile->netfid, - li->type, num, - 0, buf); - if (stored_rc) { - /* - * We failed on the unlock range - * request - add all locks from - * the tmp list to the head of - * the file's list. - */ - cifs_move_llist(&tmp_llist, - &cfile->llist); - rc = stored_rc; - } else - /* - * The unlock range request - * succeed - free the tmp list. - */ - cifs_free_llist(&tmp_llist); - cur = buf; - num = 0; - } else - cur++; - } else { + if (cinode->can_cache_brlcks) { /* * We can cache brlock requests - simply remove * a lock from the file's list. @@ -1313,7 +1274,41 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, int xid) list_del(&li->llist); cifs_del_lock_waiters(li); kfree(li); + continue; } + cur->Pid = cpu_to_le16(li->pid); + cur->LengthLow = cpu_to_le32((u32)li->length); + cur->LengthHigh = cpu_to_le32((u32)(li->length>>32)); + cur->OffsetLow = cpu_to_le32((u32)li->offset); + cur->OffsetHigh = cpu_to_le32((u32)(li->offset>>32)); + /* + * We need to save a lock here to let us add it again to + * the file's list if the unlock range request fails on + * the server. + */ + list_move(&li->llist, &tmp_llist); + if (++num == max_num) { + stored_rc = cifs_lockv(xid, tcon, cfile->netfid, + li->type, num, 0, buf); + if (stored_rc) { + /* + * We failed on the unlock range + * request - add all locks from the tmp + * list to the head of the file's list. + */ + cifs_move_llist(&tmp_llist, + &cfile->llist); + rc = stored_rc; + } else + /* + * The unlock range request succeed - + * free the tmp list. 
+ */ + cifs_free_llist(&tmp_llist); + cur = buf; + num = 0; + } else + cur++; } if (num) { stored_rc = cifs_lockv(xid, tcon, cfile->netfid, -- cgit v1.2.3 From 7f0adb53bcf5bdb92236cda8ec92ea5e40993028 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Mon, 28 May 2012 15:50:10 +0400 Subject: CIFS: Make accessing is_valid_oplock/dump_detail ops struct field safe Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/connect.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index ccafdedd0dbc..3647b1a4540b 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1058,13 +1058,15 @@ cifs_demultiplex_thread(void *p) if (mid_entry != NULL) { if (!mid_entry->multiRsp || mid_entry->multiEnd) mid_entry->callback(mid_entry); - } else if (!server->ops->is_oplock_break(buf, server)) { + } else if (!server->ops->is_oplock_break || + !server->ops->is_oplock_break(buf, server)) { cERROR(1, "No task to wake, unknown frame received! " "NumMids %d", atomic_read(&midCount)); cifs_dump_mem("Received Data is: ", buf, HEADER_SIZE(server)); #ifdef CONFIG_CIFS_DEBUG2 - server->ops->dump_detail(buf); + if (server->ops->dump_detail) + server->ops->dump_detail(buf); cifs_dump_mids(server); #endif /* CIFS_DEBUG2 */ -- cgit v1.2.3 From 88257360605f9362dc4d79326c268dd334f61c90 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Wed, 23 May 2012 14:01:59 +0400 Subject: CIFS: Move get_next_mid to ops struct Reviewed-by: Jeff Layton Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 7 +++++ fs/cifs/cifsproto.h | 1 - fs/cifs/cifssmb.c | 8 ++--- fs/cifs/connect.c | 2 +- fs/cifs/misc.c | 89 +---------------------------------------------------- fs/cifs/smb1ops.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/cifs/transport.c | 2 +- 7 files changed, 103 insertions(+), 95 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 20350a93ed99..6df0cbe1cbc9 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -174,6 +174,7 @@ struct smb_version_operations { void (*add_credits)(struct TCP_Server_Info *, const unsigned int); void (*set_credits)(struct TCP_Server_Info *, const int); int * (*get_credits_field)(struct TCP_Server_Info *); + __u64 (*get_next_mid)(struct TCP_Server_Info *); /* data offset from read response message */ unsigned int (*read_data_offset)(char *); /* data length from read response message */ @@ -399,6 +400,12 @@ set_credits(struct TCP_Server_Info *server, const int val) server->ops->set_credits(server, val); } +static inline __u64 +get_next_mid(struct TCP_Server_Info *server) +{ + return server->ops->get_next_mid(server); +} + /* * Macros to allow the TCP_Server_Info->net field and related code to drop out * when CONFIG_NET_NS isn't set. 
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 5ec21ecf7980..0a6cbfe2761e 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -114,7 +114,6 @@ extern int small_smb_init_no_tc(const int smb_cmd, const int wct, void **request_buf); extern int CIFS_SessSetup(unsigned int xid, struct cifs_ses *ses, const struct nls_table *nls_cp); -extern __u64 GetNextMid(struct TCP_Server_Info *server); extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601); extern u64 cifs_UnixTimeToNT(struct timespec); extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index b5ad716b2642..5b400730c213 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -268,7 +268,7 @@ small_smb_init_no_tc(const int smb_command, const int wct, return rc; buffer = (struct smb_hdr *)*request_buf; - buffer->Mid = GetNextMid(ses->server); + buffer->Mid = get_next_mid(ses->server); if (ses->capabilities & CAP_UNICODE) buffer->Flags2 |= SMBFLG2_UNICODE; if (ses->capabilities & CAP_STATUS32) @@ -402,7 +402,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifs_ses *ses) cFYI(1, "secFlags 0x%x", secFlags); - pSMB->hdr.Mid = GetNextMid(server); + pSMB->hdr.Mid = get_next_mid(server); pSMB->hdr.Flags2 |= (SMBFLG2_UNICODE | SMBFLG2_ERR_STATUS); if ((secFlags & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5) @@ -782,7 +782,7 @@ CIFSSMBLogoff(const int xid, struct cifs_ses *ses) return rc; } - pSMB->hdr.Mid = GetNextMid(ses->server); + pSMB->hdr.Mid = get_next_mid(ses->server); if (ses->server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) @@ -4762,7 +4762,7 @@ getDFSRetry: /* server pointer checked in called function, but should never be null here anyway */ - pSMB->hdr.Mid = GetNextMid(ses->server); + pSMB->hdr.Mid = get_next_mid(ses->server); pSMB->hdr.Tid = ses->ipc_tid; pSMB->hdr.Uid = ses->Suid; if (ses->capabilities & CAP_STATUS32) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 3647b1a4540b..78db68a5cf44 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3940,7 +3940,7 @@ CIFSTCon(unsigned int xid, struct cifs_ses *ses, header_assemble(smb_buffer, SMB_COM_TREE_CONNECT_ANDX, NULL /*no tid */ , 4 /*wct */ ); - smb_buffer->Mid = GetNextMid(ses->server); + smb_buffer->Mid = get_next_mid(ses->server); smb_buffer->Uid = ses->Suid; pSMB = (TCONX_REQ *) smb_buffer; pSMBr = (TCONX_RSP *) smb_buffer_response; diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index e2552d2b2e42..557506ae1e2a 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -212,93 +212,6 @@ cifs_small_buf_release(void *buf_to_free) return; } -/* - * Find a free multiplex id (SMB mid). Otherwise there could be - * mid collisions which might cause problems, demultiplexing the - * wrong response to this request. Multiplex ids could collide if - * one of a series requests takes much longer than the others, or - * if a very large number of long lived requests (byte range - * locks or FindNotify requests) are pending. No more than - * 64K-1 requests can be outstanding at one time. If no - * mids are available, return zero. A future optimization - * could make the combination of mids and uid the key we use - * to demultiplex on (rather than mid alone). - * In addition to the above check, the cifs demultiplex - * code already used the command code as a secondary - * check of the frame and if signing is negotiated the - * response would be discarded if the mid were the same - * but the signature was wrong. 
Since the mid is not put in the - * pending queue until later (when it is about to be dispatched) - * we do have to limit the number of outstanding requests - * to somewhat less than 64K-1 although it is hard to imagine - * so many threads being in the vfs at one time. - */ -__u64 GetNextMid(struct TCP_Server_Info *server) -{ - __u64 mid = 0; - __u16 last_mid, cur_mid; - bool collision; - - spin_lock(&GlobalMid_Lock); - - /* mid is 16 bit only for CIFS/SMB */ - cur_mid = (__u16)((server->CurrentMid) & 0xffff); - /* we do not want to loop forever */ - last_mid = cur_mid; - cur_mid++; - - /* - * This nested loop looks more expensive than it is. - * In practice the list of pending requests is short, - * fewer than 50, and the mids are likely to be unique - * on the first pass through the loop unless some request - * takes longer than the 64 thousand requests before it - * (and it would also have to have been a request that - * did not time out). - */ - while (cur_mid != last_mid) { - struct mid_q_entry *mid_entry; - unsigned int num_mids; - - collision = false; - if (cur_mid == 0) - cur_mid++; - - num_mids = 0; - list_for_each_entry(mid_entry, &server->pending_mid_q, qhead) { - ++num_mids; - if (mid_entry->mid == cur_mid && - mid_entry->mid_state == MID_REQUEST_SUBMITTED) { - /* This mid is in use, try a different one */ - collision = true; - break; - } - } - - /* - * if we have more than 32k mids in the list, then something - * is very wrong. Possibly a local user is trying to DoS the - * box by issuing long-running calls and SIGKILL'ing them. If - * we get to 2^16 mids then we're in big trouble as this - * function could loop forever. - * - * Go ahead and assign out the mid in this situation, but force - * an eventual reconnect to clean out the pending_mid_q. - */ - if (num_mids > 32768) - server->tcpStatus = CifsNeedReconnect; - - if (!collision) { - mid = (__u64)cur_mid; - server->CurrentMid = mid; - break; - } - cur_mid++; - } - spin_unlock(&GlobalMid_Lock); - return mid; -} - /* NB: MID can not be set if treeCon not passed in, in that case it is responsbility of caller to set the mid */ void @@ -334,7 +247,7 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ , /* Uid is not converted */ buffer->Uid = treeCon->ses->Suid; - buffer->Mid = GetNextMid(treeCon->ses->server); + buffer->Mid = get_next_mid(treeCon->ses->server); } if (treeCon->Flags & SMB_SHARE_IS_IN_DFS) buffer->Flags2 |= SMBFLG2_DFS; diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index d9d615fbed3f..6dec38f5522d 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -125,6 +125,94 @@ cifs_get_credits_field(struct TCP_Server_Info *server) return &server->credits; } +/* + * Find a free multiplex id (SMB mid). Otherwise there could be + * mid collisions which might cause problems, demultiplexing the + * wrong response to this request. Multiplex ids could collide if + * one of a series requests takes much longer than the others, or + * if a very large number of long lived requests (byte range + * locks or FindNotify requests) are pending. No more than + * 64K-1 requests can be outstanding at one time. If no + * mids are available, return zero. A future optimization + * could make the combination of mids and uid the key we use + * to demultiplex on (rather than mid alone). 
+ * In addition to the above check, the cifs demultiplex + * code already used the command code as a secondary + * check of the frame and if signing is negotiated the + * response would be discarded if the mid were the same + * but the signature was wrong. Since the mid is not put in the + * pending queue until later (when it is about to be dispatched) + * we do have to limit the number of outstanding requests + * to somewhat less than 64K-1 although it is hard to imagine + * so many threads being in the vfs at one time. + */ +static __u64 +cifs_get_next_mid(struct TCP_Server_Info *server) +{ + __u64 mid = 0; + __u16 last_mid, cur_mid; + bool collision; + + spin_lock(&GlobalMid_Lock); + + /* mid is 16 bit only for CIFS/SMB */ + cur_mid = (__u16)((server->CurrentMid) & 0xffff); + /* we do not want to loop forever */ + last_mid = cur_mid; + cur_mid++; + + /* + * This nested loop looks more expensive than it is. + * In practice the list of pending requests is short, + * fewer than 50, and the mids are likely to be unique + * on the first pass through the loop unless some request + * takes longer than the 64 thousand requests before it + * (and it would also have to have been a request that + * did not time out). + */ + while (cur_mid != last_mid) { + struct mid_q_entry *mid_entry; + unsigned int num_mids; + + collision = false; + if (cur_mid == 0) + cur_mid++; + + num_mids = 0; + list_for_each_entry(mid_entry, &server->pending_mid_q, qhead) { + ++num_mids; + if (mid_entry->mid == cur_mid && + mid_entry->mid_state == MID_REQUEST_SUBMITTED) { + /* This mid is in use, try a different one */ + collision = true; + break; + } + } + + /* + * if we have more than 32k mids in the list, then something + * is very wrong. Possibly a local user is trying to DoS the + * box by issuing long-running calls and SIGKILL'ing them. If + * we get to 2^16 mids then we're in big trouble as this + * function could loop forever. + * + * Go ahead and assign out the mid in this situation, but force + * an eventual reconnect to clean out the pending_mid_q. + */ + if (num_mids > 32768) + server->tcpStatus = CifsNeedReconnect; + + if (!collision) { + mid = (__u64)cur_mid; + server->CurrentMid = mid; + break; + } + cur_mid++; + } + spin_unlock(&GlobalMid_Lock); + return mid; +} + struct smb_version_operations smb1_operations = { .send_cancel = send_nt_cancel, .compare_fids = cifs_compare_fids, @@ -133,6 +221,7 @@ struct smb_version_operations smb1_operations = { .add_credits = cifs_add_credits, .set_credits = cifs_set_credits, .get_credits_field = cifs_get_credits_field, + .get_next_mid = cifs_get_next_mid, .read_data_offset = cifs_read_data_offset, .read_data_length = cifs_read_data_length, .map_error = map_smb_to_linux_error, diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 1b36ffe6a47b..3097ee58fd7d 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -779,7 +779,7 @@ send_lock_cancel(const unsigned int xid, struct cifs_tcon *tcon, pSMB->LockType = LOCKING_ANDX_CANCEL_LOCK|LOCKING_ANDX_LARGE_FILES; pSMB->Timeout = 0; - pSMB->hdr.Mid = GetNextMid(ses->server); + pSMB->hdr.Mid = get_next_mid(ses->server); return SendReceive(xid, ses, in_buf, out_buf, &bytes_returned, 0); -- cgit v1.2.3 From 4d5a0565cebf12c2ef854e4ac1961f13a710a950 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Mon, 30 Apr 2012 11:17:25 +0200 Subject: Btrfs: remove call to btrfs_header_nritems with no effect This is a leftover from cleanup patch 559af821. Before the cleanup, btrfs_header_nritems was called inside an if condition. 
As it has no side effects we need to preserve here, it should simply be dropped. Signed-off-by: Jan Schmidt --- fs/btrfs/ctree.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index d7a96cfdc50a..836e4e03edca 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1650,8 +1650,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BTRFS_NODEPTRS_PER_BLOCK(root) / 4) return 0; - btrfs_header_nritems(mid); - left = read_node_slot(root, parent, pslot - 1); if (left) { btrfs_tree_lock(left); @@ -1681,7 +1679,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, wret = push_node_left(trans, root, left, mid, 1); if (wret < 0) ret = wret; - btrfs_header_nritems(mid); } /* -- cgit v1.2.3 From 0640113be25d283e0ff77a9f041e1242182387f0 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 4 Jun 2012 11:00:45 -0700 Subject: vfs: Fix /proc//fdinfo/ file handling Cyrill Gorcunov reports that I broke the fdinfo files with commit 30a08bf2d31d ("proc: move fd symlink i_mode calculations into tid_fd_revalidate()"), and he's quite right. The tid_fd_revalidate() function is not just used for the /fd symlinks, it's also used for the /fdinfo/ files, and the permission model for those are different. So do the dynamic symlink permission handling just for symlinks, making the fdinfo files once more appear as the proper regular files they are. Of course, Al Viro argued (probably correctly) that we shouldn't do the symlink permission games at all, and make the symlinks always just be the normal 'lrwxrwxrwx'. That would have avoided this issue too, but since somebody noticed that the permissions had changed (which was the reason for that original commit 30a08bf2d31d in the first place), people do apparently use this feature. [ Basically, you can use the symlink permission data as a cheap "fdinfo" replacement, since you see whether the file is open for reading and/or writing by just looking at st_mode of the symlink. So the feature does make sense, even if the pain it has caused means we probably shouldn't have done it to begin with. 
] Reported-and-tested-by: Cyrill Gorcunov Signed-off-by: Linus Torvalds --- fs/proc/base.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 616f41a7cde6..437195f204e1 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1803,7 +1803,7 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) rcu_read_lock(); file = fcheck_files(files, fd); if (file) { - unsigned i_mode, f_mode = file->f_mode; + unsigned f_mode = file->f_mode; rcu_read_unlock(); put_files_struct(files); @@ -1819,12 +1819,14 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) inode->i_gid = GLOBAL_ROOT_GID; } - i_mode = S_IFLNK; - if (f_mode & FMODE_READ) - i_mode |= S_IRUSR | S_IXUSR; - if (f_mode & FMODE_WRITE) - i_mode |= S_IWUSR | S_IXUSR; - inode->i_mode = i_mode; + if (S_ISLNK(inode->i_mode)) { + unsigned i_mode = S_IFLNK; + if (f_mode & FMODE_READ) + i_mode |= S_IRUSR | S_IXUSR; + if (f_mode & FMODE_WRITE) + i_mode |= S_IWUSR | S_IXUSR; + inode->i_mode = i_mode; + } security_task_to_inode(task, inode); put_task_struct(task); @@ -1859,6 +1861,7 @@ static struct dentry *proc_fd_instantiate(struct inode *dir, ei = PROC_I(inode); ei->fd = fd; + inode->i_mode = S_IFLNK; inode->i_op = &proc_pid_link_inode_operations; inode->i_size = 64; ei->op.proc_get_link = proc_fd_link; -- cgit v1.2.3 From 1549210fcc17e9ae20c09ac8cd4c48a8dfd431bd Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 5 Jun 2012 09:16:47 -0400 Subject: NFSv4: Fix an Oops in the open recovery code The open recovery code does not need to request a new value for the mdsthreshold, and so does not allocate a struct nfs4_threshold. The problem is that encode_getfattr_open() will still request an mdsthreshold, and so we end up Oopsing in decode_attr_mdsthreshold. This patch fixes encode_getfattr_open so that it doesn't request an mdsthreshold when the caller isn't asking for one. It also fixes decode_attr_mdsthreshold so that it errors if the server returns an mdsthreshold that we didn't ask for (instead of Oopsing). Signed-off-by: Trond Myklebust Cc: Andy Adamson --- fs/nfs/nfs4_fs.h | 2 +- fs/nfs/nfs4proc.c | 22 +++++++++++++++++++++- fs/nfs/nfs4xdr.c | 12 ++++++++---- include/linux/nfs_xdr.h | 1 + 4 files changed, 31 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index c6827f93ab57..cc5900ac61b5 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -295,7 +295,7 @@ is_ds_client(struct nfs_client *clp) extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[]; -extern const u32 nfs4_fattr_bitmap[2]; +extern const u32 nfs4_fattr_bitmap[3]; extern const u32 nfs4_statfs_bitmap[2]; extern const u32 nfs4_pathconf_bitmap[2]; extern const u32 nfs4_fsinfo_bitmap[3]; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index d48dbefa0e71..ad1515521a6a 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -116,7 +116,7 @@ static int nfs4_map_errors(int err) /* * This is our standard bitmap for GETATTR requests. 
*/ -const u32 nfs4_fattr_bitmap[2] = { +const u32 nfs4_fattr_bitmap[3] = { FATTR4_WORD0_TYPE | FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE @@ -133,6 +133,24 @@ const u32 nfs4_fattr_bitmap[2] = { | FATTR4_WORD1_TIME_MODIFY }; +static const u32 nfs4_pnfs_open_bitmap[3] = { + FATTR4_WORD0_TYPE + | FATTR4_WORD0_CHANGE + | FATTR4_WORD0_SIZE + | FATTR4_WORD0_FSID + | FATTR4_WORD0_FILEID, + FATTR4_WORD1_MODE + | FATTR4_WORD1_NUMLINKS + | FATTR4_WORD1_OWNER + | FATTR4_WORD1_OWNER_GROUP + | FATTR4_WORD1_RAWDEV + | FATTR4_WORD1_SPACE_USED + | FATTR4_WORD1_TIME_ACCESS + | FATTR4_WORD1_TIME_METADATA + | FATTR4_WORD1_TIME_MODIFY, + FATTR4_WORD2_MDSTHRESHOLD +}; + const u32 nfs4_statfs_bitmap[2] = { FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE @@ -844,6 +862,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, p->o_arg.name = &dentry->d_name; p->o_arg.server = server; p->o_arg.bitmask = server->attr_bitmask; + p->o_arg.open_bitmap = &nfs4_fattr_bitmap[0]; p->o_arg.claim = NFS4_OPEN_CLAIM_NULL; if (attrs != NULL && attrs->ia_valid != 0) { __be32 verf[2]; @@ -1820,6 +1839,7 @@ static int _nfs4_do_open(struct inode *dir, opendata->f_attr.mdsthreshold = pnfs_mdsthreshold_alloc(); if (!opendata->f_attr.mdsthreshold) goto err_opendata_put; + opendata->o_arg.open_bitmap = &nfs4_pnfs_open_bitmap[0]; } if (dentry->d_inode != NULL) opendata->state = nfs4_get_open_state(dentry->d_inode, sp); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index ee4a74db95d0..9ca1428da9d9 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1198,12 +1198,13 @@ static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct c } static void encode_getfattr_open(struct xdr_stream *xdr, const u32 *bitmask, + const u32 *open_bitmap, struct compound_hdr *hdr) { encode_getattr_three(xdr, - bitmask[0] & nfs4_fattr_bitmap[0], - bitmask[1] & nfs4_fattr_bitmap[1], - bitmask[2] & FATTR4_WORD2_MDSTHRESHOLD, + bitmask[0] & open_bitmap[0], + bitmask[1] & open_bitmap[1], + bitmask[2] & open_bitmap[2], hdr); } @@ -2221,7 +2222,7 @@ static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct xdr_stream *xdr, encode_putfh(xdr, args->fh, &hdr); encode_open(xdr, args, &hdr); encode_getfh(xdr, &hdr); - encode_getfattr_open(xdr, args->bitmask, &hdr); + encode_getfattr_open(xdr, args->bitmask, args->open_bitmap, &hdr); encode_nops(&hdr); } @@ -4360,6 +4361,9 @@ static int decode_attr_mdsthreshold(struct xdr_stream *xdr, if (unlikely(bitmap[2] & (FATTR4_WORD2_MDSTHRESHOLD - 1U))) return -EIO; if (likely(bitmap[2] & FATTR4_WORD2_MDSTHRESHOLD)) { + /* Did the server return an unrequested attribute? */ + if (unlikely(res == NULL)) + return -EREMOTEIO; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index d1a7bf51c326..7519baef025b 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -348,6 +348,7 @@ struct nfs_openargs { const struct qstr * name; const struct nfs_server *server; /* Needed for ID mapping */ const u32 * bitmask; + const u32 * open_bitmap; __u32 claim; struct nfs4_sequence_args seq_args; }; -- cgit v1.2.3 From 029c53473727f21c1dd73237e8d630a6f007a2fe Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 5 Jun 2012 09:35:44 -0400 Subject: NFSv4: Fix up decode_attr_mdsthreshold Fix an incorrect use of 'likely()'. The FATTR4_WORD2_MDSTHRESHOLD bit is only expected in NFSv4.1 OPEN calls, and so is actually rather _unlikely_. 
decode_attr_mdsthreshold needs to clear FATTR4_WORD2_MDSTHRESHOLD from the attribute bitmap after it has decoded the data. Signed-off-by: Trond Myklebust Cc: Andy Adamson --- fs/nfs/nfs4xdr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 9ca1428da9d9..18fae29b0301 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -4360,7 +4360,7 @@ static int decode_attr_mdsthreshold(struct xdr_stream *xdr, if (unlikely(bitmap[2] & (FATTR4_WORD2_MDSTHRESHOLD - 1U))) return -EIO; - if (likely(bitmap[2] & FATTR4_WORD2_MDSTHRESHOLD)) { + if (bitmap[2] & FATTR4_WORD2_MDSTHRESHOLD) { /* Did the server return an unrequested attribute? */ if (unlikely(res == NULL)) return -EREMOTEIO; @@ -4376,6 +4376,7 @@ static int decode_attr_mdsthreshold(struct xdr_stream *xdr, __func__); status = decode_first_threshold_item4(xdr, res); + bitmap[2] &= ~FATTR4_WORD2_MDSTHRESHOLD; } return status; out_overflow: -- cgit v1.2.3 From 08106ac7c892cad769c5026cb9dd877a39aed603 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 5 Jun 2012 10:08:24 -0400 Subject: NFSv4.1: Convert a trivial printk into a dprintk There is no need to bug the user about the server returning an error on destroy_session. The error will be handled by the state manager, without any need for further input from anyone else. So convert that printk into a debugging dprintk. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ad1515521a6a..7c218fd1ec21 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5766,8 +5766,7 @@ int nfs4_proc_destroy_session(struct nfs4_session *session, status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); if (status) - printk(KERN_WARNING - "NFS: Got error %d from the server on DESTROY_SESSION. " + dprintk("NFS: Got error %d from the server on DESTROY_SESSION. " "Session has been destroyed regardless...\n", status); dprintk("<-- nfs4_proc_destroy_session\n"); -- cgit v1.2.3 From bda197f5d71d56b95a84c0ccbcb8ce2691106cca Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 5 Jun 2012 10:22:14 -0400 Subject: NFSv4.1: Ensure we clear session state flags after a session creation Both nfs4_reset_session and nfs41_init_clientid need to clear all the session related state flags on success. 
Signed-off-by: Trond Myklebust --- fs/nfs/nfs4state.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index c679b9ecef63..f38300e9f171 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -244,6 +244,16 @@ static int nfs4_begin_drain_session(struct nfs_client *clp) return nfs4_wait_on_slot_tbl(&ses->fc_slot_table); } +static void nfs41_finish_session_reset(struct nfs_client *clp) +{ + clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); + clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); + /* create_session negotiated new slot table */ + clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state); + clear_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state); + nfs41_setup_state_renewal(clp); +} + int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) { int status; @@ -259,8 +269,7 @@ do_confirm: status = nfs4_proc_create_session(clp, cred); if (status != 0) goto out; - clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); - nfs41_setup_state_renewal(clp); + nfs41_finish_session_reset(clp); nfs_mark_client_ready(clp, NFS_CS_READY); out: return status; @@ -1772,16 +1781,9 @@ static int nfs4_reset_session(struct nfs_client *clp) status = nfs4_handle_reclaim_lease_error(clp, status); goto out; } - clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); - /* create_session negotiated new slot table */ - clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state); - clear_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state); + nfs41_finish_session_reset(clp); dprintk("%s: session reset was successful for server %s!\n", __func__, clp->cl_hostname); - - /* Let the state manager reestablish state */ - if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) - nfs41_setup_state_renewal(clp); out: if (cred) put_rpccred(cred); -- cgit v1.2.3 From cdf66442fab82916fe38f928b4f91815195a294c Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Tue, 5 Jun 2012 14:59:54 -0400 Subject: NFS4: Set parsed mount data version to 4 This patch only affects mounting through "-t nfs4" since it doesn't set up an nfs version to use in the mount data. The nfs client was trying to mount using NFS v0, causing either a BUG() or a protocol not supported message. Signed-off-by: Bryan Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index ff656c022684..bdd673141e9e 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2637,6 +2637,8 @@ static int nfs4_validate_mount_data(void *options, if (data == NULL) goto out_no_data; + args->version = 4; + switch (data->version) { case 1: if (data->host_addrlen > sizeof(args->nfs_server.address)) -- cgit v1.2.3 From 9bce008bae8b57bc7b007bcc2071d1247a527120 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 5 Jun 2012 18:32:03 -0400 Subject: NFS: Fix a commit bug The new commit code fails to copy the verifier into the wb_verf field of _all_ the nfs_page structures; it only copies it into the first entry. The consequence is that most requests end up failing to match in nfs_commit_release. Fix is to copy the verifier into the req->wb_verf field in nfs_write_completion. 
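To illustrate the matching that was silently failing, here is a simplified
model (not the kernel code) of how the remembered write verifier is compared
with the verifier returned by COMMIT for each request:

    #include <stdio.h>
    #include <string.h>

    /*
     * Simplified model: each unstable WRITE returns an 8-byte verifier
     * that must be remembered per request; after COMMIT, a mismatch
     * between the remembered verifier and the COMMIT verifier means the
     * server may have lost the data and the request must be rewritten.
     */
    struct verifier { unsigned char data[8]; };

    static int needs_resend(const struct verifier *remembered,
                            const struct verifier *commit_verf)
    {
            return memcmp(remembered->data, commit_verf->data,
                          sizeof(remembered->data)) != 0;
    }

    int main(void)
    {
            struct verifier a = { { 1, 2, 3, 4, 5, 6, 7, 8 } };
            struct verifier b = { { 0 } };  /* e.g. never filled in */

            printf("%d %d\n", needs_resend(&a, &a), needs_resend(&b, &a));
            return 0;
    }

With only the first request's wb_verf filled in, every other request fails
this comparison and gets resent even though its data is safe on the server.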
Signed-off-by: Trond Myklebust Cc: Fred Isaman --- fs/nfs/direct.c | 4 ++-- fs/nfs/write.c | 7 ++++--- include/linux/nfs_xdr.h | 2 ++ 3 files changed, 8 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 23d170bc44f4..b5385a7efd56 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -710,12 +710,12 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) bit = NFS_IOHDR_NEED_RESCHED; else if (dreq->flags == 0) { - memcpy(&dreq->verf, &req->wb_verf, + memcpy(&dreq->verf, hdr->verf, sizeof(dreq->verf)); bit = NFS_IOHDR_NEED_COMMIT; dreq->flags = NFS_ODIRECT_DO_COMMIT; } else if (dreq->flags == NFS_ODIRECT_DO_COMMIT) { - if (memcmp(&dreq->verf, &req->wb_verf, sizeof(dreq->verf))) { + if (memcmp(&dreq->verf, hdr->verf, sizeof(dreq->verf))) { dreq->flags = NFS_ODIRECT_RESCHED_WRITES; bit = NFS_IOHDR_NEED_RESCHED; } else diff --git a/fs/nfs/write.c b/fs/nfs/write.c index e6fe3d69d14c..4d6861c0dc14 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -80,6 +80,7 @@ struct nfs_write_header *nfs_writehdr_alloc(void) INIT_LIST_HEAD(&hdr->rpc_list); spin_lock_init(&hdr->lock); atomic_set(&hdr->refcnt, 0); + hdr->verf = &p->verf; } return p; } @@ -619,6 +620,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr) goto next; } if (test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) { + memcpy(&req->wb_verf, hdr->verf, sizeof(req->wb_verf)); nfs_mark_request_commit(req, hdr->lseg, &cinfo); goto next; } @@ -1255,15 +1257,14 @@ static void nfs_writeback_release_common(void *calldata) struct nfs_write_data *data = calldata; struct nfs_pgio_header *hdr = data->header; int status = data->task.tk_status; - struct nfs_page *req = hdr->req; if ((status >= 0) && nfs_write_need_commit(data)) { spin_lock(&hdr->lock); if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags)) ; /* Do nothing */ else if (!test_and_set_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) - memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf)); - else if (memcmp(&req->wb_verf, &data->verf, sizeof(req->wb_verf))) + memcpy(hdr->verf, &data->verf, sizeof(*hdr->verf)); + else if (memcmp(hdr->verf, &data->verf, sizeof(*hdr->verf))) set_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags); spin_unlock(&hdr->lock); } diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 7519baef025b..8aadd90b808a 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1237,6 +1237,7 @@ struct nfs_pgio_header { struct list_head rpc_list; atomic_t refcnt; struct nfs_page *req; + struct nfs_writeverf *verf; struct pnfs_layout_segment *lseg; loff_t io_start; const struct rpc_call_ops *mds_ops; @@ -1274,6 +1275,7 @@ struct nfs_write_data { struct nfs_write_header { struct nfs_pgio_header header; struct nfs_write_data rpc_data; + struct nfs_writeverf verf; }; struct nfs_mds_commit_info { -- cgit v1.2.3 From f25efd851cc8c0f63d74334bc784a437f4df8ee4 Mon Sep 17 00:00:00 2001 From: Steve Dickson Date: Wed, 6 Jun 2012 14:12:07 -0400 Subject: NFS: Map minor mismatch error to protocol not support error. Sservers that only have NFSv4.1 support the NFS4ERR_MINOR_VERS_MISMATCH error is return on v4.0 mounts. Mapping that error to EPROTONOSUPPORT will cause the mount to back off to v3 instead of failing. 
Signed-off-by: Steve Dickson Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 7c218fd1ec21..bfc919fd3f06 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -105,6 +105,8 @@ static int nfs4_map_errors(int err) return -EINVAL; case -NFS4ERR_SHARE_DENIED: return -EACCES; + case -NFS4ERR_MINOR_VERS_MISMATCH: + return -EPROTONOSUPPORT; default: dprintk("%s could not handle NFSv4 error %d\n", __func__, -err); -- cgit v1.2.3 From 818039c7d597db3b1d30964a8f9489ac42c0642d Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 6 Jun 2012 16:03:10 +0300 Subject: UBIFS: fix debugfs-less systems support Commit "f70b7e5 UBIFS: remove Kconfig debugging option" broke UBIFS and it refuses to initialize if debugfs (CONFIG_DEBUG_FS) is disabled. I incorrectly assumed that debugfs files creation function will return success if debugfs is disabled, but they actually return -ENODEV. This patch fixes the issue. Reported-by: Paul Parsons Signed-off-by: Artem Bityutskiy Tested-by: Paul Parsons --- fs/ubifs/debug.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 685a83756b2b..84a7e6f3c046 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -2918,6 +2918,9 @@ int dbg_debugfs_init_fs(struct ubifs_info *c) struct dentry *dent; struct ubifs_debug_info *d = c->dbg; + if (!IS_ENABLED(DEBUG_FS)) + return 0; + n = snprintf(d->dfs_dir_name, UBIFS_DFS_DIR_LEN + 1, UBIFS_DFS_DIR_NAME, c->vi.ubi_num, c->vi.vol_id); if (n == UBIFS_DFS_DIR_LEN) { @@ -3010,7 +3013,8 @@ out: */ void dbg_debugfs_exit_fs(struct ubifs_info *c) { - debugfs_remove_recursive(c->dbg->dfs_dir); + if (IS_ENABLED(DEBUG_FS)) + debugfs_remove_recursive(c->dbg->dfs_dir); } struct ubifs_global_debug_info ubifs_dbg; @@ -3095,6 +3099,9 @@ int dbg_debugfs_init(void) const char *fname; struct dentry *dent; + if (!IS_ENABLED(DEBUG_FS)) + return 0; + fname = "ubifs"; dent = debugfs_create_dir(fname, NULL); if (IS_ERR_OR_NULL(dent)) @@ -3159,7 +3166,8 @@ out: */ void dbg_debugfs_exit(void) { - debugfs_remove_recursive(dfs_rootdir); + if (IS_ENABLED(DEBUG_FS)) + debugfs_remove_recursive(dfs_rootdir); } /** -- cgit v1.2.3 From f07936f2c446bd5310e669c8e6eab3f812ea665a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 7 Jun 2012 10:21:03 -0400 Subject: NFS: Remove incorrect BUG_ON in nfs_found_client It is perfectly valid for nfs_get_client() to return a nfs_client that is in the process of setting up the NFSv4.1 session. 
Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 7d108753af81..17ba6b995659 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -544,8 +544,6 @@ nfs_found_client(const struct nfs_client_initdata *cl_init, smp_rmb(); - BUG_ON(clp->cl_cons_state != NFS_CS_READY); - dprintk("<-- %s found nfs_client %p for %s\n", __func__, clp, cl_init->hostname ?: ""); return clp; -- cgit v1.2.3 From fb47ddc9d5ded3d202335ea29743b9242d54eb5a Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Thu, 7 Jun 2012 11:42:12 -0400 Subject: NFS4: Fix open bug when pnfs module blacklisted Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 29fd23c0efdc..64f90d845f6a 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -365,7 +365,7 @@ static inline bool pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src, struct nfs_server *nfss) { - return (dst && src && src->bm != 0 && + return (dst && src && src->bm != 0 && nfss->pnfs_curr_ld && nfss->pnfs_curr_ld->id == src->l_type); } -- cgit v1.2.3 From 02c67525cf325131aa06c6b0c6fd457bd57161e3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 7 Jun 2012 13:45:53 -0400 Subject: NFSv4.1: Convert another trivial printk into a dprintk Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index bfc919fd3f06..5881f2cc5d81 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5297,7 +5297,7 @@ static int _nfs4_proc_destroy_clientid(struct nfs_client *clp, status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); if (status) - pr_warn("NFS: Got error %d from the server %s on " + dprintk("NFS: Got error %d from the server %s on " "DESTROY_CLIENTID.", status, clp->cl_hostname); return status; } -- cgit v1.2.3 From 40af1bbdca47e5c8a2044039bb78ca8fd8b20f94 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 7 Jun 2012 14:21:14 -0700 Subject: mm: correctly synchronize rss-counters at exit/exec mm->rss_stat counters have per-task delta: task->rss_stat. Before changing task->mm pointer the kernel must flush this delta with sync_mm_rss(). do_exit() already calls sync_mm_rss() to flush the rss-counters before committing the rss statistics into task->signal->maxrss, taskstats, audit and other stuff. Unfortunately the kernel does this before calling mm_release(), which can call put_user() for processing task->clear_child_tid. So at this point we can trigger page-faults and task->rss_stat becomes non-zero again. As a result mm->rss_stat becomes inconsistent and check_mm() will print something like this: | BUG: Bad rss-counter state mm:ffff88020813c380 idx:1 val:-1 | BUG: Bad rss-counter state mm:ffff88020813c380 idx:2 val:1 This patch moves sync_mm_rss() into mm_release(), and moves mm_release() out of do_exit() and calls it earlier. After mm_release() there should be no pagefaults. 
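To make the ordering problem concrete, here is a minimal sketch of the
delta-and-flush pattern involved (the field names are assumptions for
illustration, not the kernel's actual layout):

    /*
     * Each task batches RSS updates in a private delta and must flush it
     * into the shared mm counter before task and mm part ways.  A page
     * fault taken after the flush (such as the put_user() for
     * clear_child_tid in mm_release()) re-dirties the delta, and that
     * late increment never reaches the shared counter, leaving it
     * inconsistent.
     */
    struct mm_rss   { long file_pages; };
    struct task_rss { long file_pages; };

    static void sync_rss(struct mm_rss *mm, struct task_rss *tsk)
    {
            mm->file_pages += tsk->file_pages;
            tsk->file_pages = 0;
    }

    int main(void)
    {
            struct mm_rss mm = { 0 };
            struct task_rss tsk = { 3 };

            sync_rss(&mm, &tsk);    /* mm.file_pages == 3 */
            tsk.file_pages++;       /* late fault after the flush: this
                                       delta is never folded back in */
            return 0;
    }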
[akpm@linux-foundation.org: tweak comment] Signed-off-by: Konstantin Khlebnikov Reported-by: Markus Trippelsdorf Cc: Hugh Dickins Cc: KAMEZAWA Hiroyuki Cc: Oleg Nesterov Cc: [3.4.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 1 - kernel/exit.c | 13 ++++++++----- kernel/fork.c | 8 ++++++++ 3 files changed, 16 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index a79786a8d2c8..b926ed19301e 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -819,7 +819,6 @@ static int exec_mmap(struct mm_struct *mm) /* Notify parent that we're no longer interested in the old VM */ tsk = current; old_mm = current->mm; - sync_mm_rss(old_mm); mm_release(tsk, old_mm); if (old_mm) { diff --git a/kernel/exit.c b/kernel/exit.c index 34867cc5b42a..804fb6bb8161 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -423,6 +423,7 @@ void daemonize(const char *name, ...) * user space pages. We don't need them, and if we didn't close them * they would be locked into memory. */ + mm_release(current, current->mm); exit_mm(current); /* * We don't want to get frozen, in case system-wide hibernation @@ -640,7 +641,6 @@ static void exit_mm(struct task_struct * tsk) struct mm_struct *mm = tsk->mm; struct core_state *core_state; - mm_release(tsk, mm); if (!mm) return; /* @@ -960,9 +960,13 @@ void do_exit(long code) preempt_count()); acct_update_integrals(tsk); - /* sync mm's RSS info before statistics gathering */ - if (tsk->mm) - sync_mm_rss(tsk->mm); + + /* Set exit_code before complete_vfork_done() in mm_release() */ + tsk->exit_code = code; + + /* Release mm and sync mm's RSS info before statistics gathering */ + mm_release(tsk, tsk->mm); + group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { hrtimer_cancel(&tsk->signal->real_timer); @@ -975,7 +979,6 @@ void do_exit(long code) tty_audit_exit(); audit_free(tsk); - tsk->exit_code = code; taskstats_exit(tsk, group_dead); exit_mm(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index ab5211b9e622..0560781c6904 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -619,6 +619,14 @@ void mmput(struct mm_struct *mm) module_put(mm->binfmt->module); mmdrop(mm); } + + /* + * Final rss-counter synchronization. After this point there must be + * no pagefaults into this mm from the current context. Otherwise + * mm->rss_stat will be inconsistent. + */ + if (mm) + sync_mm_rss(mm); } EXPORT_SYMBOL_GPL(mmput); -- cgit v1.2.3 From b0dd6b70f0fda17ae9762fbb72d98e40a4f66556 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 7 Jun 2012 18:56:06 -0400 Subject: ext4: fix the free blocks calculation for ext3 file systems w/ uninit_bg Ext3 filesystems that are converted to use as many ext4 file system features as possible will enable uninit_bg to speed up e2fsck times. These file systems will have a native ext3 layout of inode tables and block allocation bitmaps (as opposed to ext4's flex_bg layout). Unfortunately, in these cases, when first allocating a block in an uninitialized block group, ext4 would incorrectly calculate the number of free blocks in that block group, and then errorneously report that the file system was corrupt: EXT4-fs error (device vdd): ext4_mb_generate_buddy:741: group 30, 32254 clusters in bitmap, 32258 in gd This problem can be reproduced via: mke2fs -q -t ext4 -O ^flex_bg /dev/vdd 5g mount -t ext4 /dev/vdd /mnt fallocate -l 4600m /mnt/test The problem was caused by a bone headed mistake in the check to see if a particular metadata block was part of the block group. 
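The subtraction was simply reversed. A toy illustration with made-up block
numbers (the real code works in clusters via EXT4_B2C()):

    #include <stdio.h>

    /*
     * Toy numbers only.  The cluster offset of a metadata block inside
     * its group should be (block - start); the buggy code computed
     * (start - block), which wraps around for unsigned values whenever
     * the metadata block sits after the first block of the group.
     */
    int main(void)
    {
            unsigned long long start = 983040;      /* first block of group */
            unsigned long long bitmap = 983041;     /* block bitmap         */

            printf("correct offset: %llu\n", bitmap - start);
            printf("buggy offset:   %llu\n", start - bitmap);   /* wraps */
            return 0;
    }

With the huge wrapped value the per-group overhead accounting goes wrong, so
the computed free-cluster count disagrees with the bitmap and triggers the
error shown above.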
Many thanks to Kees Cook for finding and bisecting the buggy commit which introduced this bug (commit fd034a84e1, present since v3.2). Reported-by: Sander Eikelenboom Reported-by: Kees Cook Signed-off-by: "Theodore Ts'o" Tested-by: Kees Cook Cc: stable@kernel.org --- fs/ext4/balloc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 99b6324290db..cee7812cc3cf 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -90,8 +90,8 @@ unsigned ext4_num_overhead_clusters(struct super_block *sb, * unusual file system layouts. */ if (ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), block_group)) { - block_cluster = EXT4_B2C(sbi, (start - - ext4_block_bitmap(sb, gdp))); + block_cluster = EXT4_B2C(sbi, + ext4_block_bitmap(sb, gdp) - start); if (block_cluster < num_clusters) block_cluster = -1; else if (block_cluster == num_clusters) { @@ -102,7 +102,7 @@ unsigned ext4_num_overhead_clusters(struct super_block *sb, if (ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), block_group)) { inode_cluster = EXT4_B2C(sbi, - start - ext4_inode_bitmap(sb, gdp)); + ext4_inode_bitmap(sb, gdp) - start); if (inode_cluster < num_clusters) inode_cluster = -1; else if (inode_cluster == num_clusters) { @@ -114,7 +114,7 @@ unsigned ext4_num_overhead_clusters(struct super_block *sb, itbl_blk = ext4_inode_table(sb, gdp); for (i = 0; i < sbi->s_itb_per_group; i++) { if (ext4_block_in_group(sb, itbl_blk + i, block_group)) { - c = EXT4_B2C(sbi, start - itbl_blk + i); + c = EXT4_B2C(sbi, itbl_blk + i - start); if ((c < num_clusters) || (c == inode_cluster) || (c == block_cluster) || (c == itbl_cluster)) continue; -- cgit v1.2.3 From b22b1f178f6799278d3178d894f37facb2085765 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Thu, 7 Jun 2012 19:04:19 -0400 Subject: ext4: don't set i_flags in EXT4_IOC_SETFLAGS Commit 7990696 uses the ext4_{set,clear}_inode_flags() functions to change the i_flags automatically but fails to remove the error setting of i_flags. So we still have the problem of trashing state flags. Fix this by removing the assignment. Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/ioctl.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 8ad112ae0ade..e34deac3f366 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -123,7 +123,6 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) else ext4_clear_inode_flag(inode, i); } - ei->i_flags = flags; ext4_set_inode_flags(inode); inode->i_ctime = ext4_current_time(inode); -- cgit v1.2.3 From 48d212a2eecaca2e1875925837ad27b2f43f48a3 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 7 Jun 2012 17:54:07 -0700 Subject: Revert "mm: correctly synchronize rss-counters at exit/exec" This reverts commit 40af1bbdca47e5c8a2044039bb78ca8fd8b20f94. It's horribly and utterly broken for at least the following reasons: - calling sync_mm_rss() from mmput() is fundamentally wrong, because there's absolutely no reason to believe that the task that does the mmput() always does it on its own VM. Example: fork, ptrace, /proc - you name it. - calling it *after* having done mmdrop() on it is doubly insane, since the mm struct may well be gone now. - testing mm against NULL before you call it is insane too, since a NULL mm there would have caused oopses long before. .. and those are just the three bugs I found before I decided to give up looking for me and revert it asap. 
I should have caught it before I even took it, but I trusted Andrew too much. Cc: Konstantin Khlebnikov Cc: Markus Trippelsdorf Cc: Hugh Dickins Cc: KAMEZAWA Hiroyuki Cc: Oleg Nesterov Cc: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 1 + kernel/exit.c | 13 +++++-------- kernel/fork.c | 8 -------- 3 files changed, 6 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index b926ed19301e..a79786a8d2c8 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -819,6 +819,7 @@ static int exec_mmap(struct mm_struct *mm) /* Notify parent that we're no longer interested in the old VM */ tsk = current; old_mm = current->mm; + sync_mm_rss(old_mm); mm_release(tsk, old_mm); if (old_mm) { diff --git a/kernel/exit.c b/kernel/exit.c index 804fb6bb8161..34867cc5b42a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -423,7 +423,6 @@ void daemonize(const char *name, ...) * user space pages. We don't need them, and if we didn't close them * they would be locked into memory. */ - mm_release(current, current->mm); exit_mm(current); /* * We don't want to get frozen, in case system-wide hibernation @@ -641,6 +640,7 @@ static void exit_mm(struct task_struct * tsk) struct mm_struct *mm = tsk->mm; struct core_state *core_state; + mm_release(tsk, mm); if (!mm) return; /* @@ -960,13 +960,9 @@ void do_exit(long code) preempt_count()); acct_update_integrals(tsk); - - /* Set exit_code before complete_vfork_done() in mm_release() */ - tsk->exit_code = code; - - /* Release mm and sync mm's RSS info before statistics gathering */ - mm_release(tsk, tsk->mm); - + /* sync mm's RSS info before statistics gathering */ + if (tsk->mm) + sync_mm_rss(tsk->mm); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { hrtimer_cancel(&tsk->signal->real_timer); @@ -979,6 +975,7 @@ void do_exit(long code) tty_audit_exit(); audit_free(tsk); + tsk->exit_code = code; taskstats_exit(tsk, group_dead); exit_mm(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index 0560781c6904..ab5211b9e622 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -619,14 +619,6 @@ void mmput(struct mm_struct *mm) module_put(mm->binfmt->module); mmdrop(mm); } - - /* - * Final rss-counter synchronization. After this point there must be - * no pagefaults into this mm from the current context. Otherwise - * mm->rss_stat will be inconsistent. - */ - if (mm) - sync_mm_rss(mm); } EXPORT_SYMBOL_GPL(mmput); -- cgit v1.2.3 From 2d0dbc6ae8a5194aaecb9cfffb9053f38fce8b86 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 8 Jun 2012 10:58:09 -0400 Subject: NFSv4: Fix unnecessary delegation returns in nfs4_do_open While nfs4_do_open() expects the fmode argument to be restricted to combinations of FMODE_READ and FMODE_WRITE, both nfs4_atomic_open() and nfs4_proc_create will pass the nfs_open_context->mode, which contains the full fmode_t. This patch ensures that nfs4_do_open strips the other fmode_t bits, fixing a problem in which the nfs4_do_open call would result in an unnecessary delegation return. 
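A small user-space sketch of what the added masking does (the flag values below are illustrative stand-ins for the kernel's fmode_t bits, not taken from this patch):

	#include <stdio.h>

	/* Stand-in values for illustration; the real fmode_t flags are kernel-defined. */
	#define FMODE_READ	0x1
	#define FMODE_WRITE	0x2
	#define FMODE_EXEC	0x20	/* example of an unrelated bit carried in the open context's mode */

	int main(void)
	{
		unsigned int fmode = FMODE_READ | FMODE_EXEC;	/* full open-context mode */

		fmode &= FMODE_READ | FMODE_WRITE;	/* keep only the bits nfs4_do_open cares about */
		printf("fmode after masking: 0x%x\n", fmode);	/* prints 0x1 */
		return 0;
	}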
Reported-by: Fred Isaman Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org --- fs/nfs/nfs4proc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 5881f2cc5d81..f23977e4ac0f 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1902,6 +1902,7 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, struct nfs4_state *res; int status; + fmode &= FMODE_READ|FMODE_WRITE; do { status = _nfs4_do_open(dir, dentry, fmode, flags, sattr, cred, &res, ctx_th); -- cgit v1.2.3 From 32ba9c3fcab960f0b0d332c86ebcd2c4870d9bb8 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 8 Jun 2012 10:34:03 -0700 Subject: Revert "vfs: stop d_splice_alias creating directory aliases" This reverts commit 7732a557b1342c6e6966efb5f07effcf99f56167 (and commit 3f50fff4dace23d3cfeb195d5cd4ee813cee68b7, which was a follow-up cleanup). We're chasing an elusive bug that Dave Jones can apparently reproduce using his system call fuzzer tool, and that looks like some kind of locking ordering problem on the directory i_mutex chain. Our i_mutex locking is rather complex, and depends on the topological ordering of the directories, which is why we have been very wary of splicing directory entries around. Of course, we really don't want to ever see aliased unconnected directories anyway, so none of this should ever happen, but this revert aims to basically get us back to a known older state. Bruce points to some of the previous discussion at http://marc.info/?i=<20110310105821.GE22723@ZenIV.linux.org.uk> and in particular a long post from Neil: http://marc.info/?i=<20110311150749.2fa2be66@notabene.brown> It should be noted that it's possible that Dave's problems come from other changes altohgether, including possibly just the fact that Dave constantly is teachning his fuzzer new tricks. So what appears to be a new bug could in fact be an old one that just gets newly triggered, but reverting these patches as "still under heavy discussion" is the right thing regardless. Requested-by: Al Viro Acked-by: J. Bruce Fields Signed-off-by: Linus Torvalds --- fs/dcache.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 85c9e2bff8e6..40469044088d 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -683,6 +683,8 @@ EXPORT_SYMBOL(dget_parent); /** * d_find_alias - grab a hashed alias of inode * @inode: inode in question + * @want_discon: flag, used by d_splice_alias, to request + * that only a DISCONNECTED alias be returned. * * If inode has a hashed alias, or is a directory and has any alias, * acquire the reference to alias and return it. Otherwise return NULL. @@ -691,9 +693,10 @@ EXPORT_SYMBOL(dget_parent); * of a filesystem. * * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer - * any other hashed alias over that. + * any other hashed alias over that one unless @want_discon is set, + * in which case only return an IS_ROOT, DCACHE_DISCONNECTED alias. 
*/ -static struct dentry *__d_find_alias(struct inode *inode) +static struct dentry *__d_find_alias(struct inode *inode, int want_discon) { struct dentry *alias, *discon_alias; @@ -705,7 +708,7 @@ again: if (IS_ROOT(alias) && (alias->d_flags & DCACHE_DISCONNECTED)) { discon_alias = alias; - } else { + } else if (!want_discon) { __dget_dlock(alias); spin_unlock(&alias->d_lock); return alias; @@ -736,7 +739,7 @@ struct dentry *d_find_alias(struct inode *inode) if (!list_empty(&inode->i_dentry)) { spin_lock(&inode->i_lock); - de = __d_find_alias(inode); + de = __d_find_alias(inode, 0); spin_unlock(&inode->i_lock); } return de; @@ -1647,8 +1650,9 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) if (inode && S_ISDIR(inode->i_mode)) { spin_lock(&inode->i_lock); - new = __d_find_any_alias(inode); + new = __d_find_alias(inode, 1); if (new) { + BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED)); spin_unlock(&inode->i_lock); security_d_instantiate(new, inode); d_move(new, dentry); @@ -2478,7 +2482,7 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode) struct dentry *alias; /* Does an aliased dentry already exist? */ - alias = __d_find_alias(inode); + alias = __d_find_alias(inode, 0); if (alias) { actual = alias; write_seqlock(&rename_lock); -- cgit v1.2.3 From ead188f9f930fb5d7f0c49315a7fce3d8bd16b7e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 8 Jun 2012 17:07:36 +0200 Subject: writeback: Fix lock imbalance in writeback_sb_inodes() Fix bug introduced by 169ebd90. We have to have wb_list_lock locked when restarting writeback loop after having waited for inode writeback. Bug description by Ted Tso: I can reproduce this fairly easily by using ext4 w/o a journal, running under KVM with 1024megs memory, with fsstress (xfstests #13): [ 45.153294] ===================================== [ 45.154784] [ BUG: bad unlock balance detected! ] [ 45.155591] 3.5.0-rc1-00002-gb22b1f1 #124 Not tainted [ 45.155591] ------------------------------------- [ 45.155591] flush-254:16/2499 is trying to release lock (&(&wb->list_lock)->rlock) at: [ 45.155591] [] writeback_sb_inodes+0x160/0x327 [ 45.155591] but there are no more locks to release! Reported-by: Theodore Ts'o Tested-by: Theodore Ts'o Signed-off-by: Jan Kara Signed-off-by: Fengguang Wu --- fs/fs-writeback.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 8d2fb8c88cf3..41a3ccff18d8 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -664,6 +664,7 @@ static long writeback_sb_inodes(struct super_block *sb, /* Wait for I_SYNC. This function drops i_lock... */ inode_sleep_on_writeback(inode); /* Inode may be gone, start again */ + spin_lock(&wb->list_lock); continue; } inode->i_state |= I_SYNC; -- cgit v1.2.3 From 906369e43c29001c39c7dfed8a01b9dff24ace75 Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Fri, 8 Jun 2012 16:48:33 -0400 Subject: NFS: fix directio refcount bug on commit This reverts a hunk from commit 04277086577 "NFS: Clean up - Simplify reference counting in fs/nfs/direct.c" The cleanups in that patch affect the write path, but by the time processing hits commit the removed reference has been added back by nfs_scan_commit_list(). Without this reversion, any page that is sent to commit holds on to an unbalanced reference that is never freed. The immediate effect is an imbalance over the wire between OPENs and CLOSEs. 
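To make the imbalance concrete, a toy model (illustrative only, not NFS code) of a reference count that picks up one extra, never-matched get per commit:

	#include <stdio.h>

	static int refs;

	static void get_ref(void) { refs++; }
	static void put_ref(void) { refs--; }

	int main(void)
	{
		get_ref();	/* reference taken when the request is set up */
		get_ref();	/* extra get: the commit list already re-added one */
		put_ref();	/* the single matching put at completion */
		printf("refs after completion: %d (a leak unless this is 0)\n", refs);
		return 0;
	}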
Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index b5385a7efd56..05099890a929 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -517,9 +517,9 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data) nfs_list_remove_request(req); if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) { /* Note the rewrite will go through mds */ - kref_get(&req->wb_kref); nfs_mark_request_commit(req, NULL, &cinfo); - } + } else + nfs_release_request(req); nfs_unlock_and_release_request(req); } -- cgit v1.2.3 From c5afc8da5b881633717bfc0510792428aa01fa3f Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Fri, 8 Jun 2012 11:32:56 -0400 Subject: NFS: Use the NFS_DEFAULT_VERSION for v2 and v3 mounts Older versions of nfs-utils don't always pass a "vers=" mount option for NFS. This could lead to attempts at using NFS v0 due to a zeroed out nfs_parsed_mount_data struct. I solve this by setting the default NFS version to NFS_DEFAULT_VERSION in the v2 and v3 cases (v4 has already been taken care of by a similar patch). Reported-by: Joerg Roedel Signed-off-by: Bryan Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index bdd673141e9e..906f09c7d842 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1867,6 +1867,7 @@ static int nfs23_validate_mount_data(void *options, if (data == NULL) goto out_no_data; + args->version = NFS_DEFAULT_VERSION; switch (data->version) { case 1: data->namlen = 0; -- cgit v1.2.3 From 64f9a83665e4f168ff50ce1db9eb175f954048fa Mon Sep 17 00:00:00 2001 From: Sachin Prabhu Date: Thu, 31 May 2012 17:45:00 +0100 Subject: NFSv2: EOF incorrectly set on short read In cases where the server returns fewer bytes than those requested, we can incorrectly set the eof flag for the file. Fixing this allows the request to be retried with updated offset and count arguments. Signed-off-by: Sachin Prabhu Signed-off-by: Trond Myklebust --- fs/nfs/proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index a706b6bcc286..617c7419a08e 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -651,7 +651,7 @@ static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data) /* Emulate the eof flag, which isn't normally needed in NFSv2 * as it is guaranteed to always return the file attributes */ - if (data->args.offset + data->args.count >= data->res.fattr->size) + if (data->args.offset + data->res.count >= data->res.fattr->size) data->res.eof = 1; } return 0; -- cgit v1.2.3 From 2669940db8d02147181e338bb6db98394569f31a Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Wed, 30 May 2012 16:12:24 -0400 Subject: NFSv4 do not send an empty SETATTR compound The ATTR_OPEN check from commit 536e43d12b9517bbbf6114cd1a12be27857a4d7a can result in an ia_valid with only ATTR_FILE set, and no NFS_VALID_ATTRS attributes to request from the server.
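A sketch of the resulting control flow (the attribute bits shown are illustrative stand-ins, not the kernel's real ATTR_* values):

	#include <stdio.h>

	/* Stand-in bits for illustration only. */
	#define ATTR_MTIME	0x01
	#define ATTR_CTIME	0x02
	#define ATTR_OPEN	0x04
	#define ATTR_FILE	0x08

	int main(void)
	{
		unsigned int ia_valid = ATTR_MTIME | ATTR_CTIME | ATTR_OPEN | ATTR_FILE;

		/* open(O_TRUNC): these bits are implied by the OPEN and need no SETATTR */
		if (ia_valid & ATTR_OPEN)
			ia_valid &= ~(ATTR_MTIME | ATTR_CTIME | ATTR_OPEN);

		/* if only ATTR_FILE remains there is nothing to ask the server for */
		if ((ia_valid & ~ATTR_FILE) == 0)
			printf("no attributes left to set - skip the SETATTR RPC\n");
		return 0;
	}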
Signed-off-by: Andy Adamson Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index f23977e4ac0f..15fc7e4664ed 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2549,6 +2549,14 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, nfs_fattr_init(fattr); + /* Deal with open(O_TRUNC) */ + if (sattr->ia_valid & ATTR_OPEN) + sattr->ia_valid &= ~(ATTR_MTIME|ATTR_CTIME|ATTR_OPEN); + + /* Optimization: if the end result is no change, don't RPC */ + if ((sattr->ia_valid & ~(ATTR_FILE)) == 0) + return 0; + /* Search for an existing open(O_WRITE) file */ if (sattr->ia_valid & ATTR_FILE) { struct nfs_open_context *ctx; @@ -2560,10 +2568,6 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, } } - /* Deal with open(O_TRUNC) */ - if (sattr->ia_valid & ATTR_OPEN) - sattr->ia_valid &= ~(ATTR_MTIME|ATTR_CTIME|ATTR_OPEN); - status = nfs4_do_setattr(inode, cred, fattr, sattr, state); if (status == 0) nfs_setattr_update_inode(inode, sattr); -- cgit v1.2.3 From b84297197ce60bb5da14cbd5516cf5130b0cd380 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 11 Jun 2012 16:46:55 -0700 Subject: exofs: fix sparse non-ANSI function warning Fix sparse non-ANSI function warning: fs/exofs/sys.c:112:28: warning: non-ANSI function declaration of function 'exofs_sysfs_dbg_print' Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- fs/exofs/sys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exofs/sys.c b/fs/exofs/sys.c index e32bc919e4e3..5a7b691e748b 100644 --- a/fs/exofs/sys.c +++ b/fs/exofs/sys.c @@ -109,7 +109,7 @@ static struct kobj_type odev_ktype = { static struct kobj_type uuid_ktype = { }; -void exofs_sysfs_dbg_print() +void exofs_sysfs_dbg_print(void) { #ifdef CONFIG_EXOFS_DEBUG struct kobject *k_name, *k_tmp; -- cgit v1.2.3 From 0439f31c35d1da0b28988b308ea455e38e6a350d Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 12 Jun 2012 10:37:08 +0300 Subject: NFSv4.1: integer overflow in decode_cb_sequence_args() This seems like it could overflow on 32 bits. Use kmalloc_array() which has overflow protection built in. Signed-off-by: Dan Carpenter Signed-off-by: Trond Myklebust --- fs/nfs/callback_xdr.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 95bfc243992c..27c2969a9d02 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -455,9 +455,9 @@ static __be32 decode_cb_sequence_args(struct svc_rqst *rqstp, args->csa_nrclists = ntohl(*p++); args->csa_rclists = NULL; if (args->csa_nrclists) { - args->csa_rclists = kmalloc(args->csa_nrclists * - sizeof(*args->csa_rclists), - GFP_KERNEL); + args->csa_rclists = kmalloc_array(args->csa_nrclists, + sizeof(*args->csa_rclists), + GFP_KERNEL); if (unlikely(args->csa_rclists == NULL)) goto out; -- cgit v1.2.3 From e216c8c771c9a77f14d7e8b4131846b038f6c145 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 12 Jun 2012 10:37:39 +0300 Subject: NFS: add an endian notation for sparse This is supposed to be a __be32 value. 
Sparse complains a lot: fs/nfs/callback_xdr.c:699:30: warning: incorrect type in initializer (different base types) fs/nfs/callback_xdr.c:699:30: expected unsigned int [unsigned] status fs/nfs/callback_xdr.c:699:30: got restricted __be32 const [usertype] csr_status fs/nfs/callback_xdr.c:715:9: warning: cast to restricted __be32 fs/nfs/callback_xdr.c:716:16: warning: incorrect type in return expression (different base types) fs/nfs/callback_xdr.c:716:16: expected restricted __be32 fs/nfs/callback_xdr.c:716:16: got unsigned int [unsigned] status Signed-off-by: Dan Carpenter Signed-off-by: Trond Myklebust --- fs/nfs/callback_xdr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 27c2969a9d02..e64b01d2a338 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -696,7 +696,7 @@ static __be32 encode_cb_sequence_res(struct svc_rqst *rqstp, const struct cb_sequenceres *res) { __be32 *p; - unsigned status = res->csr_status; + __be32 status = res->csr_status; if (unlikely(status != 0)) goto out; -- cgit v1.2.3 From 201e4aca5aa179e6c69a4dcd36a3562e56b8d670 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Sat, 26 May 2012 06:07:49 -0700 Subject: pstore/ram: Should update old dmesg buffer before reading Without the update, we'll only see the new dmesg buffer after the reboot, but previously we could see it right away. Making an oops visible in pstore filesystem before reboot is a somewhat dubious feature, but removing it wasn't an intentional change, so let's restore it. For this we have to make persistent_ram_save_old() safe for calling multiple times, and also extern it. Signed-off-by: Anton Vorontsov Acked-by: Kees Cook Signed-off-by: Greg Kroah-Hartman --- fs/pstore/ram.c | 2 ++ fs/pstore/ram_core.c | 15 ++++++++------- include/linux/pstore_ram.h | 1 + 3 files changed, 11 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 9123cce28c1e..16ff7332eae0 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -106,6 +106,8 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, time->tv_sec = 0; time->tv_nsec = 0; + /* Update old/shadowed buffer. 
*/ + persistent_ram_save_old(prz); size = persistent_ram_old_size(prz); *buf = kmalloc(size, GFP_KERNEL); if (*buf == NULL) diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index 31f8d184f3a0..235513c46aaf 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -250,23 +250,24 @@ static void notrace persistent_ram_update(struct persistent_ram_zone *prz, persistent_ram_update_ecc(prz, start, count); } -static void __init -persistent_ram_save_old(struct persistent_ram_zone *prz) +void persistent_ram_save_old(struct persistent_ram_zone *prz) { struct persistent_ram_buffer *buffer = prz->buffer; size_t size = buffer_size(prz); size_t start = buffer_start(prz); - char *dest; - persistent_ram_ecc_old(prz); + if (!size) + return; - dest = kmalloc(size, GFP_KERNEL); - if (dest == NULL) { + if (!prz->old_log) { + persistent_ram_ecc_old(prz); + prz->old_log = kmalloc(size, GFP_KERNEL); + } + if (!prz->old_log) { pr_err("persistent_ram: failed to allocate buffer\n"); return; } - prz->old_log = dest; prz->old_log_size = size; memcpy(prz->old_log, &buffer->data[start], size - start); memcpy(prz->old_log + size - start, &buffer->data[0], start); diff --git a/include/linux/pstore_ram.h b/include/linux/pstore_ram.h index 7ed7fd4dba49..4491e8ff36e6 100644 --- a/include/linux/pstore_ram.h +++ b/include/linux/pstore_ram.h @@ -75,6 +75,7 @@ struct persistent_ram_zone *persistent_ram_init_ringbuffer(struct device *dev, int persistent_ram_write(struct persistent_ram_zone *prz, const void *s, unsigned int count); +void persistent_ram_save_old(struct persistent_ram_zone *prz); size_t persistent_ram_old_size(struct persistent_ram_zone *prz); void *persistent_ram_old(struct persistent_ram_zone *prz); void persistent_ram_free_old(struct persistent_ram_zone *prz); -- cgit v1.2.3 From 25b63da64708212985c06c7f8b089d356efdd9cf Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Sat, 26 May 2012 06:07:50 -0700 Subject: pstore/ram_core: Do not reset restored zone's position and size Otherwise, the files will survive just one reboot, and on a subsequent boot they will disappear. Signed-off-by: Anton Vorontsov Signed-off-by: Greg Kroah-Hartman --- fs/pstore/ram_core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index 235513c46aaf..f6650d12c0c1 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -406,6 +406,7 @@ static int __init persistent_ram_post_init(struct persistent_ram_zone *prz, bool " size %zu, start %zu\n", buffer_size(prz), buffer_start(prz)); persistent_ram_save_old(prz); + return 0; } } else { pr_info("persistent_ram: no valid data in buffer" -- cgit v1.2.3 From fce397930475f7efc712a1345dc0dad269a10544 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Sat, 26 May 2012 06:07:51 -0700 Subject: pstore/ram_core: Factor persistent_ram_zap() out of post_init() A handy function that we will use outside of ram_core soon. But so far just factor it out and start using it in post_init(). 
Signed-off-by: Anton Vorontsov Acked-by: Kees Cook Signed-off-by: Greg Kroah-Hartman --- fs/pstore/ram_core.c | 11 ++++++++--- include/linux/pstore_ram.h | 1 + 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index f6650d12c0c1..c5fbdbbf81ac 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -320,6 +320,13 @@ void persistent_ram_free_old(struct persistent_ram_zone *prz) prz->old_log_size = 0; } +void persistent_ram_zap(struct persistent_ram_zone *prz) +{ + atomic_set(&prz->buffer->start, 0); + atomic_set(&prz->buffer->size, 0); + persistent_ram_update_header_ecc(prz); +} + static void *persistent_ram_vmap(phys_addr_t start, size_t size) { struct page **pages; @@ -414,8 +421,7 @@ static int __init persistent_ram_post_init(struct persistent_ram_zone *prz, bool } prz->buffer->sig = PERSISTENT_RAM_SIG; - atomic_set(&prz->buffer->start, 0); - atomic_set(&prz->buffer->size, 0); + persistent_ram_zap(prz); return 0; } @@ -450,7 +456,6 @@ struct persistent_ram_zone * __init persistent_ram_new(phys_addr_t start, goto err; persistent_ram_post_init(prz, ecc); - persistent_ram_update_header_ecc(prz); return prz; err: diff --git a/include/linux/pstore_ram.h b/include/linux/pstore_ram.h index 4491e8ff36e6..3b823d49a85a 100644 --- a/include/linux/pstore_ram.h +++ b/include/linux/pstore_ram.h @@ -69,6 +69,7 @@ struct persistent_ram_zone * __init persistent_ram_new(phys_addr_t start, size_t size, bool ecc); void persistent_ram_free(struct persistent_ram_zone *prz); +void persistent_ram_zap(struct persistent_ram_zone *prz); struct persistent_ram_zone *persistent_ram_init_ringbuffer(struct device *dev, bool ecc); -- cgit v1.2.3 From 93cce049682a1aebd49766f29af363e5b8770aed Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Sat, 26 May 2012 06:07:52 -0700 Subject: pstore/ram: Should zap persistent zone on unlink Otherwise, unlinked file will reappear on the next boot. Reported-by: Kees Cook Signed-off-by: Anton Vorontsov Acked-by: Kees Cook Signed-off-by: Greg Kroah-Hartman --- fs/pstore/ram.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 16ff7332eae0..453030f9c5bc 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -186,6 +186,7 @@ static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, return -EINVAL; persistent_ram_free_old(cxt->przs[id]); + persistent_ram_zap(cxt->przs[id]); return 0; } -- cgit v1.2.3 From 364ed2f4653d7c86ebedcc116a9cb34fd272867c Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Sat, 26 May 2012 06:07:53 -0700 Subject: pstore/inode: Make pstore_fill_super() static There's no reason to extern it. The patch fixes the annoying sparse warning: CHECK fs/pstore/inode.c fs/pstore/inode.c:264:5: warning: symbol 'pstore_fill_super' was not declared. Should it be static? 
Signed-off-by: Anton Vorontsov Acked-by: Kees Cook Signed-off-by: Greg Kroah-Hartman --- fs/pstore/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index aeb19e68e086..11a2aa2a56c4 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -258,7 +258,7 @@ fail: return rc; } -int pstore_fill_super(struct super_block *sb, void *data, int silent) +static int pstore_fill_super(struct super_block *sb, void *data, int silent) { struct inode *inode; -- cgit v1.2.3 From f617e2fd52484fb74236a597d0f9068ec7d9d2dd Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Thu, 14 Jun 2012 16:10:13 +0200 Subject: Btrfs: remove obsolete btrfs_next_leaf call from __resolve_indirect_ref When resolving indirect refs, we used to call btrfs_next_leaf in case we didn't find an exact match. While we should find exact matches most of the time, in case we don't, we must continue searching. Treating those matches differently depending on the level we're searching doesn't make sense. Even worse, we might end up searching for a key larger than the largest, in which case there is no next_leaf and subsequent jobs would fail. This commit drops the bogus lines. Signed-off-by: Jan Schmidt --- fs/btrfs/backref.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 3f75895c919b..579131de1b3b 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -294,16 +294,8 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, goto out; } - if (level == 0) { - if (ret == 1 && path->slots[0] >= btrfs_header_nritems(eb)) { - ret = btrfs_next_leaf(root, path); - if (ret) - goto out; - eb = path->nodes[0]; - } - + if (level == 0) btrfs_item_key_to_cpu(eb, &key, path->slots[0]); - } ret = add_all_parents(root, path, parents, level, &key, ref->wanted_disk_byte, extent_item_pos); -- cgit v1.2.3 From 8ba97a15e7d4f70b9af71fa1db86a28dd17ad1b2 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Mon, 4 Jun 2012 16:54:57 +0200 Subject: Btrfs: use btrfs_read_lock_root_node in get_old_root get_old_root could race with root node updates because we weren't locking the node early enough. Use btrfs_read_lock_root_node to grab the root locked in the very beginning and release the lock as soon as possible (just like btrfs_search_slot does). Signed-off-by: Jan Schmidt --- fs/btrfs/ctree.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 836e4e03edca..2cde7b0a0106 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1143,6 +1143,13 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, return eb_rewin; } +/* + * get_old_root() rewinds the state of @root's root node to the given @time_seq + * value. If there are no changes, the current root->root_node is returned. If + * anything changed in between, there's a fresh buffer allocated on which the + * rewind operations are done. In any case, the returned buffer is read locked. + * Returns NULL on error (with no locks held).
+ */ static inline struct extent_buffer * get_old_root(struct btrfs_root *root, u64 time_seq) { @@ -1151,6 +1158,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq) struct tree_mod_root *old_root; u64 old_generation; + eb = btrfs_read_lock_root_node(root); tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq); if (!tm) return root->node; @@ -1173,15 +1181,21 @@ get_old_root(struct btrfs_root *root, u64 time_seq) /* there's a root replace operation for the current root */ eb = alloc_dummy_extent_buffer(tm->index << PAGE_CACHE_SHIFT, root->nodesize); + } + btrfs_tree_read_unlock(root->node); + free_extent_buffer(root->node); + if (!eb) + return NULL; + btrfs_tree_read_lock(eb); + if (old_root->logical != root->node->start) { btrfs_set_header_bytenr(eb, eb->start); btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(eb, root->root_key.objectid); } - if (!eb) - return NULL; btrfs_set_header_level(eb, old_root->level); btrfs_set_header_generation(eb, old_generation); __tree_mod_log_rewind(eb, time_seq, tm); + extent_buffer_get(eb); return eb; } @@ -2612,9 +2626,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key, again: b = get_old_root(root, time_seq); - extent_buffer_get(b); level = btrfs_header_level(b); - btrfs_tree_read_lock(b); p->locks[level] = BTRFS_READ_LOCK; while (b) { -- cgit v1.2.3 From a95236d99fa56766f11056903439f55fe5038bcf Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Tue, 5 Jun 2012 16:41:24 +0200 Subject: Btrfs: fix return value for __tree_mod_log_oldest_root In __tree_mod_log_oldest_root() we must return the found operation even if it's not a ROOT_REPLACE operation. Otherwise, the caller assumes that there are no operations to be rewinded and returns immediately. The code in the caller is modified to improve readability. Signed-off-by: Jan Schmidt --- fs/btrfs/ctree.c | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 2cde7b0a0106..50d7c99ddce7 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1023,6 +1023,10 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info, looped = 1; } + /* if there's no old root to return, return what we found instead */ + if (!found) + found = tm; + return found; } @@ -1155,18 +1159,24 @@ get_old_root(struct btrfs_root *root, u64 time_seq) { struct tree_mod_elem *tm; struct extent_buffer *eb; - struct tree_mod_root *old_root; + struct tree_mod_root *old_root = NULL; u64 old_generation; + u64 logical; eb = btrfs_read_lock_root_node(root); tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq); if (!tm) return root->node; - old_root = &tm->old_root; - old_generation = tm->generation; + if (tm->op == MOD_LOG_ROOT_REPLACE) { + old_root = &tm->old_root; + old_generation = tm->generation; + logical = old_root->logical; + } else { + logical = root->node->start; + } - tm = tree_mod_log_search(root->fs_info, old_root->logical, time_seq); + tm = tree_mod_log_search(root->fs_info, logical, time_seq); /* * there was an item in the log when __tree_mod_log_oldest_root * returned. 
this one must not go away, because the time_seq passed to @@ -1174,26 +1184,23 @@ get_old_root(struct btrfs_root *root, u64 time_seq) */ BUG_ON(!tm); - if (old_root->logical == root->node->start) { - /* there are logged operations for the current root */ - eb = btrfs_clone_extent_buffer(root->node); - } else { - /* there's a root replace operation for the current root */ + if (old_root) eb = alloc_dummy_extent_buffer(tm->index << PAGE_CACHE_SHIFT, root->nodesize); - } + else + eb = btrfs_clone_extent_buffer(root->node); btrfs_tree_read_unlock(root->node); free_extent_buffer(root->node); if (!eb) return NULL; btrfs_tree_read_lock(eb); - if (old_root->logical != root->node->start) { + if (old_root) { btrfs_set_header_bytenr(eb, eb->start); btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(eb, root->root_key.objectid); + btrfs_set_header_level(eb, old_root->level); + btrfs_set_header_generation(eb, old_generation); } - btrfs_set_header_level(eb, old_root->level); - btrfs_set_header_generation(eb, old_generation); __tree_mod_log_rewind(eb, time_seq, tm); extent_buffer_get(eb); -- cgit v1.2.3 From 3d7806eca43e73a9721d2e09369200ed93036bd0 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Mon, 11 Jun 2012 08:29:29 +0200 Subject: Btrfs: add btrfs_next_old_leaf To make sense of the tree mod log, the backref walker not only needs btrfs_search_old_slot, but it also called btrfs_next_leaf, which in turn was calling btrfs_search_slot. This obviously didn't give the correct result. This commit adds btrfs_next_old_leaf, a drop-in replacement for btrfs_next_leaf with a time_seq parameter. If it is zero, it behaves exactly like btrfs_next_leaf. If it is non-zero, it will use btrfs_search_old_slot with this time_seq parameter. Signed-off-by: Jan Schmidt --- fs/btrfs/backref.c | 7 ++++--- fs/btrfs/ctree.c | 11 ++++++++++- fs/btrfs/ctree.h | 2 ++ 3 files changed, 16 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 579131de1b3b..8f7d1237b7a0 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -179,7 +179,8 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id, static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, struct ulist *parents, int level, - struct btrfs_key *key, u64 wanted_disk_byte, + struct btrfs_key *key, u64 time_seq, + u64 wanted_disk_byte, const u64 *extent_item_pos) { int ret; @@ -212,7 +213,7 @@ add_parent: */ while (1) { eie = NULL; - ret = btrfs_next_leaf(root, path); + ret = btrfs_next_old_leaf(root, path, time_seq); if (ret < 0) return ret; if (ret) @@ -297,7 +298,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, if (level == 0) btrfs_item_key_to_cpu(eb, &key, path->slots[0]); - ret = add_all_parents(root, path, parents, level, &key, + ret = add_all_parents(root, path, parents, level, &key, time_seq, ref->wanted_disk_byte, extent_item_pos); out: btrfs_free_path(path); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 50d7c99ddce7..cb76b2a1b908 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -5016,6 +5016,12 @@ next: * returns < 0 on io errors. 
*/ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) +{ + return btrfs_next_old_leaf(root, path, 0); +} + +int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, + u64 time_seq) { int slot; int level; @@ -5041,7 +5047,10 @@ again: path->keep_locks = 1; path->leave_spinning = 1; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (time_seq) + ret = btrfs_search_old_slot(root, &key, path, time_seq); + else + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); path->keep_locks = 0; if (ret < 0) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0151ca1ac657..db15e9e7b91c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2753,6 +2753,8 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, } int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); +int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, + u64 time_seq); static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) { ++p->slots[0]; -- cgit v1.2.3 From 3310c36eef330766fd23d50796287ad764929e24 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Mon, 11 Jun 2012 10:52:38 +0200 Subject: Btrfs: fix race in tree mod log addition When adding to the tree modification log, we grab two locks at different stages. We must not drop the outer lock until we're done with section protected by the inner lock. This moves the unlock call for the outer lock to the appropriate position. Signed-off-by: Jan Schmidt --- fs/btrfs/ctree.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index cb76b2a1b908..04b06bcf2ca9 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -467,6 +467,15 @@ static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info, return 0; } +/* + * This allocates memory and gets a tree modification sequence number when + * needed. + * + * Returns 0 when no sequence number is needed, < 0 on error. + * Returns 1 when a sequence number was added. In this case, + * fs_info->tree_mod_seq_lock was acquired and must be released by the caller + * after inserting into the rb tree. 
+ */ static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags, struct tree_mod_elem **tm_ret) { @@ -491,11 +500,11 @@ static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags, */ kfree(tm); seq = 0; + spin_unlock(&fs_info->tree_mod_seq_lock); } else { __get_tree_mod_seq(fs_info, &tm->elem); seq = tm->elem.seq; } - spin_unlock(&fs_info->tree_mod_seq_lock); return seq; } @@ -521,7 +530,9 @@ tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info, tm->slot = slot; tm->generation = btrfs_node_ptr_generation(eb, slot); - return __tree_mod_log_insert(fs_info, tm); + ret = __tree_mod_log_insert(fs_info, tm); + spin_unlock(&fs_info->tree_mod_seq_lock); + return ret; } static noinline int @@ -559,7 +570,9 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, tm->move.nr_items = nr_items; tm->op = MOD_LOG_MOVE_KEYS; - return __tree_mod_log_insert(fs_info, tm); + ret = __tree_mod_log_insert(fs_info, tm); + spin_unlock(&fs_info->tree_mod_seq_lock); + return ret; } static noinline int @@ -580,7 +593,9 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, tm->generation = btrfs_header_generation(old_root); tm->op = MOD_LOG_ROOT_REPLACE; - return __tree_mod_log_insert(fs_info, tm); + ret = __tree_mod_log_insert(fs_info, tm); + spin_unlock(&fs_info->tree_mod_seq_lock); + return ret; } static struct tree_mod_elem * -- cgit v1.2.3 From 12918b10d59e975fd5241eef03ef9e6d5ea3dcfe Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Fri, 1 Jun 2012 13:55:47 +0400 Subject: NFS: hard-code init_net for NFS callback transports In case of destroying mount namespace on child reaper exit, nsproxy is zeroed to the point already. So, dereferencing of it is invalid. This patch hard-code "init_net" for all network namespace references for NFS callback services. This will be fixed with proper NFS callback containerization. Signed-off-by: Stanislav Kinsbursky Signed-off-by: J. Bruce Fields --- fs/nfs/callback.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 970659daa323..23ff18fe080a 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -17,7 +17,6 @@ #include #include #include -#include #include @@ -107,7 +106,7 @@ nfs4_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt) { int ret; - ret = svc_create_xprt(serv, "tcp", xprt->xprt_net, PF_INET, + ret = svc_create_xprt(serv, "tcp", &init_net, PF_INET, nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); if (ret <= 0) goto out_err; @@ -115,7 +114,7 @@ nfs4_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt) dprintk("NFS: Callback listener port = %u (af %u)\n", nfs_callback_tcpport, PF_INET); - ret = svc_create_xprt(serv, "tcp", xprt->xprt_net, PF_INET6, + ret = svc_create_xprt(serv, "tcp", &init_net, PF_INET6, nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); if (ret > 0) { nfs_callback_tcpport6 = ret; @@ -184,7 +183,7 @@ nfs41_callback_up(struct svc_serv *serv, struct rpc_xprt *xprt) * fore channel connection. 
* Returns the input port (0) and sets the svc_serv bc_xprt on success */ - ret = svc_create_xprt(serv, "tcp-bc", xprt->xprt_net, PF_INET, 0, + ret = svc_create_xprt(serv, "tcp-bc", &init_net, PF_INET, 0, SVC_SOCK_ANONYMOUS); if (ret < 0) { rqstp = ERR_PTR(ret); @@ -254,7 +253,7 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt) char svc_name[12]; int ret = 0; int minorversion_setup; - struct net *net = current->nsproxy->net_ns; + struct net *net = &init_net; mutex_lock(&nfs_callback_mutex); if (cb_info->users++ || cb_info->task != NULL) { @@ -330,7 +329,7 @@ void nfs_callback_down(int minorversion) cb_info->users--; if (cb_info->users == 0 && cb_info->task != NULL) { kthread_stop(cb_info->task); - svc_shutdown_net(cb_info->serv, current->nsproxy->net_ns); + svc_shutdown_net(cb_info->serv, &init_net); svc_exit_thread(cb_info->rqst); cb_info->serv = NULL; cb_info->rqst = NULL; -- cgit v1.2.3 From bc2df47a408f2d64cf81bcfd0f6e3e14c84cb0ab Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 12 Jun 2012 08:28:48 -0400 Subject: nfsd4: BUG_ON(!is_spin_locked()) no good on UP kernels The most frequent symptom was a BUG triggering in expire_client, with the server locking up shortly thereafter. Introduced by 508dc6e110c6dbdc0bbe84298ccfe22de7538486 "nfsd41: free_session/free_client must be called under the client_lock". Cc: stable@kernel.org Cc: Benny Halevy Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 8fdc9ec5c5d3..94effd5bc4a1 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -900,7 +900,7 @@ static void free_session(struct kref *kref) struct nfsd4_session *ses; int mem; - BUG_ON(!spin_is_locked(&client_lock)); + lockdep_assert_held(&client_lock); ses = container_of(kref, struct nfsd4_session, se_ref); nfsd4_del_conns(ses); spin_lock(&nfsd_drc_lock); @@ -1080,7 +1080,7 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name) static inline void free_client(struct nfs4_client *clp) { - BUG_ON(!spin_is_locked(&client_lock)); + lockdep_assert_held(&client_lock); while (!list_empty(&clp->cl_sessions)) { struct nfsd4_session *ses; ses = list_entry(clp->cl_sessions.next, struct nfsd4_session, -- cgit v1.2.3 From beb42dd793193a3d4e72970bfa73cd8810f63cea Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 30 May 2012 15:35:17 -0400 Subject: Btrfs: pass locked_page into extent_clear_unlock_delalloc if there's an error While doing my enospc work I got a transaction abortion that resulted in a panic when we tried to unlock_page() an already unlocked page. This is because we aren't calling extent_clear_unlock_delalloc with the locked page so it was unlocking all the pages in the range. This is wrong since __extent_writepage expects to have the page locked still unless we return *page_started as 1. This should keep us from panicking.
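A toy model (illustrative only, not btrfs code) of the difference between handing the helper the caller's locked page and handing it NULL:

	#include <stdio.h>
	#include <stdbool.h>

	#define NR_PAGES 4

	/* Unlock every page in the range except 'keep', the page the caller
	 * still owns. keep = -1 models the buggy NULL case: everything is
	 * unlocked, so the caller's later unlock trips on an unlocked page. */
	static void clear_unlock_range(bool locked[NR_PAGES], int keep)
	{
		for (int i = 0; i < NR_PAGES; i++)
			if (i != keep)
				locked[i] = false;
	}

	int main(void)
	{
		bool locked[NR_PAGES] = { true, true, true, true };

		clear_unlock_range(locked, 0);	/* page 0 stays locked for the caller */
		printf("page 0 locked: %d\n", locked[0]);	/* 1 */

		clear_unlock_range(locked, -1);	/* buggy case: everything unlocked */
		printf("page 0 locked: %d\n", locked[0]);	/* 0 */
		return 0;
	}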
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 92df0a5d1d94..ccd6355af73f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -830,7 +830,7 @@ static noinline int cow_file_range(struct inode *inode, if (IS_ERR(trans)) { extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, - start, end, NULL, + start, end, locked_page, EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC | @@ -963,7 +963,7 @@ out: out_unlock: extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, - start, end, NULL, + start, end, locked_page, EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC | -- cgit v1.2.3 From b939d1ab76b4aa816343cdbd8b44ce9929a3b9ef Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 31 May 2012 11:06:33 -0400 Subject: Btrfs: fix locking in btrfs_destroy_delayed_refs The transaction abort stuff was throwing warnings from the list debugging code because we do a list_del_init outside of the delayed_refs spin lock. The delayed refs locking makes baby Jesus cry so it's not hard to get wrong, but we need to take the ref head mutex to make sure it's not being processed currently, and so if it is we need to drop the spin lock and then take and drop the mutex and do the search again. If we can take the mutex then we can safely remove the head from the list and carry on. Now when the transaction aborts I don't get the list debugging warnings. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/disk-io.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b99d5127ba18..c79ddc756081 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3400,7 +3400,6 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, delayed_refs = &trans->delayed_refs; -again: spin_lock(&delayed_refs->lock); if (delayed_refs->num_entries == 0) { spin_unlock(&delayed_refs->lock); @@ -3408,31 +3407,36 @@ again: return ret; } - node = rb_first(&delayed_refs->root); - while (node) { + while ((node = rb_first(&delayed_refs->root)) != NULL) { ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - node = rb_next(node); - - ref->in_tree = 0; - rb_erase(&ref->rb_node, &delayed_refs->root); - delayed_refs->num_entries--; atomic_set(&ref->refs, 1); if (btrfs_delayed_ref_is_head(ref)) { struct btrfs_delayed_ref_head *head; head = btrfs_delayed_node_to_head(ref); - spin_unlock(&delayed_refs->lock); - mutex_lock(&head->mutex); + if (!mutex_trylock(&head->mutex)) { + atomic_inc(&ref->refs); + spin_unlock(&delayed_refs->lock); + + /* Need to wait for the delayed ref to run */ + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(ref); + + continue; + } + kfree(head->extent_op); delayed_refs->num_heads--; if (list_empty(&head->cluster)) delayed_refs->num_heads_ready--; list_del_init(&head->cluster); - mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(ref); - goto again; } + ref->in_tree = 0; + rb_erase(&ref->rb_node, &delayed_refs->root); + delayed_refs->num_entries--; + spin_unlock(&delayed_refs->lock); btrfs_put_delayed_ref(ref); -- cgit v1.2.3 From d7096fc3ef8360f30f228344bc564d4f97d8a158 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 31 May 2012 15:49:57 -0400 Subject: Btrfs: wake up transaction waiters when aborting a transaction I was getting lots of hung tasks and a NULL pointer 
dereference because we are not cleaning up the transaction properly when it aborts. First we need to reset the running_transaction to NULL so we don't get a bad dereference for any start_transaction callers after this. Also we cannot rely on waitqueue_active() since it's just a list_empty(), so just call wake_up() directly since that will do the barrier for us and such. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/disk-io.c | 9 +++------ fs/btrfs/transaction.c | 4 ++++ 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c79ddc756081..19b4db70dcb1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3589,16 +3589,13 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, /* FIXME: cleanup wait for commit */ cur_trans->in_commit = 1; cur_trans->blocked = 1; - if (waitqueue_active(&root->fs_info->transaction_blocked_wait)) - wake_up(&root->fs_info->transaction_blocked_wait); + wake_up(&root->fs_info->transaction_blocked_wait); cur_trans->blocked = 0; - if (waitqueue_active(&root->fs_info->transaction_wait)) - wake_up(&root->fs_info->transaction_wait); + wake_up(&root->fs_info->transaction_wait); cur_trans->commit_done = 1; - if (waitqueue_active(&cur_trans->commit_wait)) - wake_up(&cur_trans->commit_wait); + wake_up(&cur_trans->commit_wait); btrfs_destroy_pending_snapshots(cur_trans); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 1791c6e3d834..59e0203bfb95 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1221,6 +1221,10 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, spin_lock(&root->fs_info->trans_lock); list_del_init(&cur_trans->list); + if (cur_trans == root->fs_info->running_transaction) { + root->fs_info->running_transaction = NULL; + root->fs_info->trans_no_join = 0; + } spin_unlock(&root->fs_info->trans_lock); btrfs_cleanup_one_transaction(trans->transaction, root); -- cgit v1.2.3 From 7b8b92af58db347de64a237861fcf13374b34a9c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 31 May 2012 15:52:43 -0400 Subject: Btrfs: abort the transaction if the commit fails If a transaction commit fails we don't abort it so we don't set an error on the file system. This patch fixes that by actually calling the abort stuff and then adding a check for a fs error in the transaction start stuff to make sure it is caught properly. 
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/transaction.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 59e0203bfb95..b72b068183ec 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -100,6 +100,10 @@ loop: kmem_cache_free(btrfs_transaction_cachep, cur_trans); cur_trans = fs_info->running_transaction; goto loop; + } else if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + spin_unlock(&root->fs_info->trans_lock); + kmem_cache_free(btrfs_transaction_cachep, cur_trans); + return -EROFS; } atomic_set(&cur_trans->num_writers, 1); @@ -1213,12 +1217,14 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, static void cleanup_transaction(struct btrfs_trans_handle *trans, - struct btrfs_root *root) + struct btrfs_root *root, int err) { struct btrfs_transaction *cur_trans = trans->transaction; WARN_ON(trans->use_count > 1); + btrfs_abort_transaction(trans, root, err); + spin_lock(&root->fs_info->trans_lock); list_del_init(&cur_trans->list); if (cur_trans == root->fs_info->running_transaction) { @@ -1530,7 +1536,7 @@ cleanup_transaction: // WARN_ON(1); if (current->journal_info == trans) current->journal_info = NULL; - cleanup_transaction(trans, root); + cleanup_transaction(trans, root, ret); return ret; } -- cgit v1.2.3 From ee670f0af35871edb492db5ba406cef36d1b7c21 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 31 May 2012 15:54:30 -0400 Subject: Btrfs: fix btrfs_destroy_marked_extents So we're forcing the eb's to have their ref count set to 1 so invalidatepage works but this breaks lots of things, for example root nodes, and is just plain wrong, we don't need to just evict all of this stuff. Also drop the invalidatepage altogether and add a page_cache_release(). With this patch we no longer hang when trying to access the root nodes after an aborted transaction and we no longer leak memory. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/disk-io.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 19b4db70dcb1..5a3bf323e2bd 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3524,11 +3524,9 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root, &(&BTRFS_I(page->mapping->host)->io_tree)->buffer, offset >> PAGE_CACHE_SHIFT); spin_unlock(&dirty_pages->buffer_lock); - if (eb) { + if (eb) ret = test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); - atomic_set(&eb->refs, 1); - } if (PageWriteback(page)) end_page_writeback(page); @@ -3542,8 +3540,8 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root, spin_unlock_irq(&page->mapping->tree_lock); } - page->mapping->a_ops->invalidatepage(page, 0); unlock_page(page); + page_cache_release(page); } } -- cgit v1.2.3 From 17ca04aff7e6171df684b7b65804df8830eb8c15 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 31 May 2012 15:58:55 -0400 Subject: Btrfs: unlock everything properly in the error case for nocow I was getting hung on umount when a transaction was aborted because a range of one of the free space inodes was still locked. This is because the nocow stuff doesn't unlock anything on error. This fixed the problem and I verified that is what was happening. 
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ccd6355af73f..b7f398c36cb7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1136,8 +1136,18 @@ static noinline int run_delalloc_nocow(struct inode *inode, u64 ino = btrfs_ino(inode); path = btrfs_alloc_path(); - if (!path) + if (!path) { + extent_clear_unlock_delalloc(inode, + &BTRFS_I(inode)->io_tree, + start, end, locked_page, + EXTENT_CLEAR_UNLOCK_PAGE | + EXTENT_CLEAR_UNLOCK | + EXTENT_CLEAR_DELALLOC | + EXTENT_CLEAR_DIRTY | + EXTENT_SET_WRITEBACK | + EXTENT_END_WRITEBACK); return -ENOMEM; + } nolock = btrfs_is_free_space_inode(root, inode); @@ -1147,6 +1157,15 @@ static noinline int run_delalloc_nocow(struct inode *inode, trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { + extent_clear_unlock_delalloc(inode, + &BTRFS_I(inode)->io_tree, + start, end, locked_page, + EXTENT_CLEAR_UNLOCK_PAGE | + EXTENT_CLEAR_UNLOCK | + EXTENT_CLEAR_DELALLOC | + EXTENT_CLEAR_DIRTY | + EXTENT_SET_WRITEBACK | + EXTENT_END_WRITEBACK); btrfs_free_path(path); return PTR_ERR(trans); } @@ -1327,8 +1346,11 @@ out_check: } btrfs_release_path(path); - if (cur_offset <= end && cow_start == (u64)-1) + if (cur_offset <= end && cow_start == (u64)-1) { cow_start = cur_offset; + cur_offset = end; + } + if (cow_start != (u64)-1) { ret = cow_file_range(inode, locked_page, cow_start, end, page_started, nr_written, 1); @@ -1347,6 +1369,17 @@ error: if (!ret) ret = err; + if (ret && cur_offset < end) + extent_clear_unlock_delalloc(inode, + &BTRFS_I(inode)->io_tree, + cur_offset, end, locked_page, + EXTENT_CLEAR_UNLOCK_PAGE | + EXTENT_CLEAR_UNLOCK | + EXTENT_CLEAR_DELALLOC | + EXTENT_CLEAR_DIRTY | + EXTENT_SET_WRITEBACK | + EXTENT_END_WRITEBACK); + btrfs_free_path(path); return ret; } -- cgit v1.2.3 From 606686eeac4550d2212bf3d621a810407ef5e9bf Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 4 Jun 2012 14:03:51 -0400 Subject: Btrfs: use rcu to protect device->name Al pointed out that we can just toss out the old name on a device and add a new one arbitrarily, so anybody who uses device->name in printk could possibly use free'd memory. Instead of adding locking around all of this he suggested doing it with RCU, so I've introduced a struct rcu_string that does just that and have gone through and protected all accesses to device->name that aren't under the uuid_mutex with rcu_read_lock(). This protects us and I will use it for dealing with removing the device that we used to mount the file system in a later patch. 
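For readers new to the pattern, a condensed sketch of how such an RCU-protected name is meant to be used (generic RCU usage built on the helpers from the new rcu-string.h; the function names here are made up for illustration):

	#include <linux/rcupdate.h>
	#include <linux/slab.h>
	#include "rcu-string.h"

	/* Reader side: may run concurrently with a rename of the device. */
	static void print_device_name(struct btrfs_device *device)
	{
		rcu_read_lock();
		pr_info("device: %s\n", rcu_str_deref(device->name));
		rcu_read_unlock();
	}

	/* Update side (assumed to run under the usual device-list locking):
	 * publish the new name first, then let kfree_rcu() reclaim the old
	 * one after every in-flight reader has finished. */
	static int rename_device(struct btrfs_device *device, const char *path)
	{
		struct rcu_string *new_name, *old_name;

		new_name = rcu_string_strdup(path, GFP_NOFS);
		if (!new_name)
			return -ENOMEM;

		old_name = rcu_dereference_protected(device->name, 1);
		rcu_assign_pointer(device->name, new_name);
		rcu_string_free(old_name);	/* kfree_rcu() under the hood */
		return 0;
	}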
Thanks, Reviewed-by: David Sterba Signed-off-by: Josef Bacik --- fs/btrfs/check-integrity.c | 16 ++++---- fs/btrfs/disk-io.c | 10 +++-- fs/btrfs/extent_io.c | 7 ++-- fs/btrfs/ioctl.c | 13 +++++-- fs/btrfs/rcu-string.h | 56 ++++++++++++++++++++++++++++ fs/btrfs/scrub.c | 30 +++++++++------ fs/btrfs/volumes.c | 92 +++++++++++++++++++++++++++++----------------- fs/btrfs/volumes.h | 2 +- 8 files changed, 162 insertions(+), 64 deletions(-) create mode 100644 fs/btrfs/rcu-string.h (limited to 'fs') diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 9cebb1fd6a3c..da6e9364a5e3 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -93,6 +93,7 @@ #include "print-tree.h" #include "locking.h" #include "check-integrity.h" +#include "rcu-string.h" #define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000 #define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000 @@ -843,13 +844,14 @@ static int btrfsic_process_superblock_dev_mirror( superblock_tmp->never_written = 0; superblock_tmp->mirror_num = 1 + superblock_mirror_num; if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) - printk(KERN_INFO "New initial S-block (bdev %p, %s)" - " @%llu (%s/%llu/%d)\n", - superblock_bdev, device->name, - (unsigned long long)dev_bytenr, - dev_state->name, - (unsigned long long)dev_bytenr, - superblock_mirror_num); + printk_in_rcu(KERN_INFO "New initial S-block (bdev %p, %s)" + " @%llu (%s/%llu/%d)\n", + superblock_bdev, + rcu_str_deref(device->name), + (unsigned long long)dev_bytenr, + dev_state->name, + (unsigned long long)dev_bytenr, + superblock_mirror_num); list_add(&superblock_tmp->all_blocks_node, &state->all_blocks_list); btrfsic_block_hashtable_add(superblock_tmp, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 5a3bf323e2bd..1c9664b16cd1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -44,6 +44,7 @@ #include "free-space-cache.h" #include "inode-map.h" #include "check-integrity.h" +#include "rcu-string.h" static struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); @@ -2575,8 +2576,9 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) struct btrfs_device *device = (struct btrfs_device *) bh->b_private; - printk_ratelimited(KERN_WARNING "lost page write due to " - "I/O error on %s\n", device->name); + printk_ratelimited_in_rcu(KERN_WARNING "lost page write due to " + "I/O error on %s\n", + rcu_str_deref(device->name)); /* note, we dont' set_buffer_write_io_error because we have * our own ways of dealing with the IO errors */ @@ -2749,8 +2751,8 @@ static int write_dev_flush(struct btrfs_device *device, int wait) wait_for_completion(&device->flush_wait); if (bio_flagged(bio, BIO_EOPNOTSUPP)) { - printk("btrfs: disabling barriers on dev %s\n", - device->name); + printk_in_rcu("btrfs: disabling barriers on dev %s\n", + rcu_str_deref(device->name)); device->nobarriers = 1; } if (!bio_flagged(bio, BIO_UPTODATE)) { diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 2c8f7b204617..aaa12c1eb348 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -20,6 +20,7 @@ #include "volumes.h" #include "check-integrity.h" #include "locking.h" +#include "rcu-string.h" static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; @@ -1917,9 +1918,9 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, return -EIO; } - printk(KERN_INFO "btrfs read error corrected: ino %lu off %llu (dev %s " - "sector %llu)\n", page->mapping->host->i_ino, 
start, - dev->name, sector); + printk_in_rcu(KERN_INFO "btrfs read error corrected: ino %lu off %llu " + "(dev %s sector %llu)\n", page->mapping->host->i_ino, + start, rcu_str_deref(dev->name), sector); bio_put(bio); return 0; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 24b776c08d99..c5254dde1f0f 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -52,6 +52,7 @@ #include "locking.h" #include "inode-map.h" #include "backref.h" +#include "rcu-string.h" /* Mask out flags that are inappropriate for the given type of inode. */ static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) @@ -1345,8 +1346,9 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, do_div(new_size, root->sectorsize); new_size *= root->sectorsize; - printk(KERN_INFO "btrfs: new size for %s is %llu\n", - device->name, (unsigned long long)new_size); + printk_in_rcu(KERN_INFO "btrfs: new size for %s is %llu\n", + rcu_str_deref(device->name), + (unsigned long long)new_size); if (new_size > old_size) { trans = btrfs_start_transaction(root, 0); @@ -2264,7 +2266,12 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg) di_args->total_bytes = dev->total_bytes; memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid)); if (dev->name) { - strncpy(di_args->path, dev->name, sizeof(di_args->path)); + struct rcu_string *name; + + rcu_read_lock(); + name = rcu_dereference(dev->name); + strncpy(di_args->path, name->str, sizeof(di_args->path)); + rcu_read_unlock(); di_args->path[sizeof(di_args->path) - 1] = 0; } else { di_args->path[0] = '\0'; diff --git a/fs/btrfs/rcu-string.h b/fs/btrfs/rcu-string.h new file mode 100644 index 000000000000..9e111e4576d4 --- /dev/null +++ b/fs/btrfs/rcu-string.h @@ -0,0 +1,56 @@ +/* + * Copyright (C) 2012 Red Hat. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +struct rcu_string { + struct rcu_head rcu; + char str[0]; +}; + +static inline struct rcu_string *rcu_string_strdup(const char *src, gfp_t mask) +{ + size_t len = strlen(src) + 1; + struct rcu_string *ret = kzalloc(sizeof(struct rcu_string) + + (len * sizeof(char)), mask); + if (!ret) + return ret; + strncpy(ret->str, src, len); + return ret; +} + +static inline void rcu_string_free(struct rcu_string *str) +{ + if (str) + kfree_rcu(str, rcu); +} + +#define printk_in_rcu(fmt, ...) do { \ + rcu_read_lock(); \ + printk(fmt, __VA_ARGS__); \ + rcu_read_unlock(); \ +} while (0) + +#define printk_ratelimited_in_rcu(fmt, ...) 
do { \ + rcu_read_lock(); \ + printk_ratelimited(fmt, __VA_ARGS__); \ + rcu_read_unlock(); \ +} while (0) + +#define rcu_str_deref(rcu_str) ({ \ + struct rcu_string *__str = rcu_dereference(rcu_str); \ + __str->str; \ +}) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index a38cfa4f251e..b223620cd5a6 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -26,6 +26,7 @@ #include "backref.h" #include "extent_io.h" #include "check-integrity.h" +#include "rcu-string.h" /* * This is only the first step towards a full-features scrub. It reads all @@ -320,10 +321,10 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx) * hold all of the paths here */ for (i = 0; i < ipath->fspath->elem_cnt; ++i) - printk(KERN_WARNING "btrfs: %s at logical %llu on dev " + printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev " "%s, sector %llu, root %llu, inode %llu, offset %llu, " "length %llu, links %u (path: %s)\n", swarn->errstr, - swarn->logical, swarn->dev->name, + swarn->logical, rcu_str_deref(swarn->dev->name), (unsigned long long)swarn->sector, root, inum, offset, min(isize - offset, (u64)PAGE_SIZE), nlink, (char *)(unsigned long)ipath->fspath->val[i]); @@ -332,10 +333,10 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx) return 0; err: - printk(KERN_WARNING "btrfs: %s at logical %llu on dev " + printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev " "%s, sector %llu, root %llu, inode %llu, offset %llu: path " "resolving failed with ret=%d\n", swarn->errstr, - swarn->logical, swarn->dev->name, + swarn->logical, rcu_str_deref(swarn->dev->name), (unsigned long long)swarn->sector, root, inum, offset, ret); free_ipath(ipath); @@ -390,10 +391,11 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) do { ret = tree_backref_for_extent(&ptr, eb, ei, item_size, &ref_root, &ref_level); - printk(KERN_WARNING + printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev %s, " "sector %llu: metadata %s (level %d) in tree " - "%llu\n", errstr, swarn.logical, dev->name, + "%llu\n", errstr, swarn.logical, + rcu_str_deref(dev->name), (unsigned long long)swarn.sector, ref_level ? "node" : "leaf", ret < 0 ? 
-1 : ref_level, @@ -580,9 +582,11 @@ out: spin_lock(&sdev->stat_lock); ++sdev->stat.uncorrectable_errors; spin_unlock(&sdev->stat_lock); - printk_ratelimited(KERN_ERR + + printk_ratelimited_in_rcu(KERN_ERR "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n", - (unsigned long long)fixup->logical, sdev->dev->name); + (unsigned long long)fixup->logical, + rcu_str_deref(sdev->dev->name)); } btrfs_free_path(path); @@ -936,18 +940,20 @@ corrected_error: spin_lock(&sdev->stat_lock); sdev->stat.corrected_errors++; spin_unlock(&sdev->stat_lock); - printk_ratelimited(KERN_ERR + printk_ratelimited_in_rcu(KERN_ERR "btrfs: fixed up error at logical %llu on dev %s\n", - (unsigned long long)logical, sdev->dev->name); + (unsigned long long)logical, + rcu_str_deref(sdev->dev->name)); } } else { did_not_correct_error: spin_lock(&sdev->stat_lock); sdev->stat.uncorrectable_errors++; spin_unlock(&sdev->stat_lock); - printk_ratelimited(KERN_ERR + printk_ratelimited_in_rcu(KERN_ERR "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n", - (unsigned long long)logical, sdev->dev->name); + (unsigned long long)logical, + rcu_str_deref(sdev->dev->name)); } out: diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 7782020996fe..8a3d2594b807 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -35,6 +35,7 @@ #include "volumes.h" #include "async-thread.h" #include "check-integrity.h" +#include "rcu-string.h" static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -64,7 +65,7 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices) device = list_entry(fs_devices->devices.next, struct btrfs_device, dev_list); list_del(&device->dev_list); - kfree(device->name); + rcu_string_free(device->name); kfree(device); } kfree(fs_devices); @@ -334,8 +335,8 @@ static noinline int device_list_add(const char *path, { struct btrfs_device *device; struct btrfs_fs_devices *fs_devices; + struct rcu_string *name; u64 found_transid = btrfs_super_generation(disk_super); - char *name; fs_devices = find_fsid(disk_super->fsid); if (!fs_devices) { @@ -369,11 +370,13 @@ static noinline int device_list_add(const char *path, memcpy(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE); spin_lock_init(&device->io_lock); - device->name = kstrdup(path, GFP_NOFS); - if (!device->name) { + + name = rcu_string_strdup(path, GFP_NOFS); + if (!name) { kfree(device); return -ENOMEM; } + rcu_assign_pointer(device->name, name); INIT_LIST_HEAD(&device->dev_alloc_list); /* init readahead state */ @@ -390,12 +393,12 @@ static noinline int device_list_add(const char *path, device->fs_devices = fs_devices; fs_devices->num_devices++; - } else if (!device->name || strcmp(device->name, path)) { - name = kstrdup(path, GFP_NOFS); + } else if (!device->name || strcmp(device->name->str, path)) { + name = rcu_string_strdup(path, GFP_NOFS); if (!name) return -ENOMEM; - kfree(device->name); - device->name = name; + rcu_string_free(device->name); + rcu_assign_pointer(device->name, name); if (device->missing) { fs_devices->missing_devices--; device->missing = 0; @@ -430,15 +433,22 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) /* We have held the volume lock, it is safe to get the devices. 
*/ list_for_each_entry(orig_dev, &orig->devices, dev_list) { + struct rcu_string *name; + device = kzalloc(sizeof(*device), GFP_NOFS); if (!device) goto error; - device->name = kstrdup(orig_dev->name, GFP_NOFS); - if (!device->name) { + /* + * This is ok to do without rcu read locked because we hold the + * uuid mutex so nothing we touch in here is going to disappear. + */ + name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS); + if (!name) { kfree(device); goto error; } + rcu_assign_pointer(device->name, name); device->devid = orig_dev->devid; device->work.func = pending_bios_fn; @@ -491,7 +501,7 @@ again: } list_del_init(&device->dev_list); fs_devices->num_devices--; - kfree(device->name); + rcu_string_free(device->name); kfree(device); } @@ -516,7 +526,7 @@ static void __free_device(struct work_struct *work) if (device->bdev) blkdev_put(device->bdev, device->mode); - kfree(device->name); + rcu_string_free(device->name); kfree(device); } @@ -540,6 +550,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) { struct btrfs_device *new_device; + struct rcu_string *name; if (device->bdev) fs_devices->open_devices--; @@ -555,8 +566,11 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) new_device = kmalloc(sizeof(*new_device), GFP_NOFS); BUG_ON(!new_device); /* -ENOMEM */ memcpy(new_device, device, sizeof(*new_device)); - new_device->name = kstrdup(device->name, GFP_NOFS); - BUG_ON(device->name && !new_device->name); /* -ENOMEM */ + + /* Safe because we are under uuid_mutex */ + name = rcu_string_strdup(device->name->str, GFP_NOFS); + BUG_ON(device->name && !name); /* -ENOMEM */ + rcu_assign_pointer(new_device->name, name); new_device->bdev = NULL; new_device->writeable = 0; new_device->in_fs_metadata = 0; @@ -621,9 +635,9 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, if (!device->name) continue; - bdev = blkdev_get_by_path(device->name, flags, holder); + bdev = blkdev_get_by_path(device->name->str, flags, holder); if (IS_ERR(bdev)) { - printk(KERN_INFO "open %s failed\n", device->name); + printk(KERN_INFO "open %s failed\n", device->name->str); goto error; } filemap_write_and_wait(bdev->bd_inode->i_mapping); @@ -1632,6 +1646,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) struct block_device *bdev; struct list_head *devices; struct super_block *sb = root->fs_info->sb; + struct rcu_string *name; u64 total_bytes; int seeding_dev = 0; int ret = 0; @@ -1671,23 +1686,24 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) goto error; } - device->name = kstrdup(device_path, GFP_NOFS); - if (!device->name) { + name = rcu_string_strdup(device_path, GFP_NOFS); + if (!name) { kfree(device); ret = -ENOMEM; goto error; } + rcu_assign_pointer(device->name, name); ret = find_next_devid(root, &device->devid); if (ret) { - kfree(device->name); + rcu_string_free(device->name); kfree(device); goto error; } trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { - kfree(device->name); + rcu_string_free(device->name); kfree(device); ret = PTR_ERR(trans); goto error; @@ -1796,7 +1812,7 @@ error_trans: unlock_chunks(root); btrfs_abort_transaction(trans, root, ret); btrfs_end_transaction(trans, root); - kfree(device->name); + rcu_string_free(device->name); kfree(device); error: blkdev_put(bdev, FMODE_EXCL); @@ -4204,10 +4220,17 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct 
bio *bio, bio->bi_sector = bbio->stripes[dev_nr].physical >> 9; dev = bbio->stripes[dev_nr].dev; if (dev && dev->bdev && (rw != WRITE || dev->writeable)) { +#ifdef DEBUG + struct rcu_string *name; + + rcu_read_lock(); + name = rcu_dereference(dev->name); pr_debug("btrfs_map_bio: rw %d, secor=%llu, dev=%lu " "(%s id %llu), size=%u\n", rw, (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev, - dev->name, dev->devid, bio->bi_size); + name->str, dev->devid, bio->bi_size); + rcu_read_unlock(); +#endif bio->bi_bdev = dev->bdev; if (async_submit) schedule_bio(root, dev, rw, bio); @@ -4694,8 +4717,9 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) key.offset = device->devid; ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); if (ret) { - printk(KERN_WARNING "btrfs: no dev_stats entry found for device %s (devid %llu) (OK on first mount after mkfs)\n", - device->name, (unsigned long long)device->devid); + printk_in_rcu(KERN_WARNING "btrfs: no dev_stats entry found for device %s (devid %llu) (OK on first mount after mkfs)\n", + rcu_str_deref(device->name), + (unsigned long long)device->devid); __btrfs_reset_dev_stats(device); device->dev_stats_valid = 1; btrfs_release_path(path); @@ -4747,8 +4771,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, BUG_ON(!path); ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); if (ret < 0) { - printk(KERN_WARNING "btrfs: error %d while searching for dev_stats item for device %s!\n", - ret, device->name); + printk_in_rcu(KERN_WARNING "btrfs: error %d while searching for dev_stats item for device %s!\n", + ret, rcu_str_deref(device->name)); goto out; } @@ -4757,8 +4781,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, /* need to delete old one and insert a new one */ ret = btrfs_del_item(trans, dev_root, path); if (ret != 0) { - printk(KERN_WARNING "btrfs: delete too small dev_stats item for device %s failed %d!\n", - device->name, ret); + printk_in_rcu(KERN_WARNING "btrfs: delete too small dev_stats item for device %s failed %d!\n", + rcu_str_deref(device->name), ret); goto out; } ret = 1; @@ -4770,8 +4794,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, dev_root, path, &key, sizeof(*ptr)); if (ret < 0) { - printk(KERN_WARNING "btrfs: insert dev_stats item for device %s failed %d!\n", - device->name, ret); + printk_in_rcu(KERN_WARNING "btrfs: insert dev_stats item for device %s failed %d!\n", + rcu_str_deref(device->name), ret); goto out; } } @@ -4823,9 +4847,9 @@ void btrfs_dev_stat_print_on_error(struct btrfs_device *dev) { if (!dev->dev_stats_valid) return; - printk_ratelimited(KERN_ERR + printk_ratelimited_in_rcu(KERN_ERR "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", - dev->name, + rcu_str_deref(dev->name), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), @@ -4837,8 +4861,8 @@ void btrfs_dev_stat_print_on_error(struct btrfs_device *dev) static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) { - printk(KERN_INFO "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", - dev->name, + printk_in_rcu(KERN_INFO "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", + rcu_str_deref(dev->name), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), diff --git a/fs/btrfs/volumes.h 
b/fs/btrfs/volumes.h index 3406a88ca83e..74366f27a76b 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -58,7 +58,7 @@ struct btrfs_device { /* the mode sent to blkdev_get */ fmode_t mode; - char *name; + struct rcu_string *name; /* the internal btrfs device id */ u64 devid; -- cgit v1.2.3 From 9c5085c147989d48dfe74194b48affc23f376650 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 5 Jun 2012 14:13:12 -0400 Subject: Btrfs: implement ->show_devname Because btrfs can remove the device that was mounted we need to have a ->show_devname so that in this case we can print out some other device in the file system to /proc/mount. So if there are multiple devices in a btrfs file system we will just print the device with the lowest devid that we can find. This will make everything consistent and deal with device removal properly. The drawback is if you mount with a device that is higher than the lowest devicd it won't show up as the mounted device in /proc/mounts, but this is a small price to pay. This was inspired by Miao Xie's patch. Thanks, Reviewed-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/super.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 96eb9fef7bd2..0eb9a4da069e 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -54,6 +54,7 @@ #include "version.h" #include "export.h" #include "compression.h" +#include "rcu-string.h" #define CREATE_TRACE_POINTS #include @@ -1482,12 +1483,44 @@ static void btrfs_fs_dirty_inode(struct inode *inode, int flags) "error %d\n", btrfs_ino(inode), ret); } +static int btrfs_show_devname(struct seq_file *m, struct dentry *root) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb); + struct btrfs_fs_devices *cur_devices; + struct btrfs_device *dev, *first_dev = NULL; + struct list_head *head; + struct rcu_string *name; + + mutex_lock(&fs_info->fs_devices->device_list_mutex); + cur_devices = fs_info->fs_devices; + while (cur_devices) { + head = &cur_devices->devices; + list_for_each_entry(dev, head, dev_list) { + if (!first_dev || dev->devid < first_dev->devid) + first_dev = dev; + } + cur_devices = cur_devices->seed; + } + + if (first_dev) { + rcu_read_lock(); + name = rcu_dereference(first_dev->name); + seq_escape(m, name->str, " \t\n\\"); + rcu_read_unlock(); + } else { + WARN_ON(1); + } + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + return 0; +} + static const struct super_operations btrfs_super_ops = { .drop_inode = btrfs_drop_inode, .evict_inode = btrfs_evict_inode, .put_super = btrfs_put_super, .sync_fs = btrfs_sync_fs, .show_options = btrfs_show_options, + .show_devname = btrfs_show_devname, .write_inode = btrfs_write_inode, .dirty_inode = btrfs_fs_dirty_inode, .alloc_inode = btrfs_alloc_inode, -- cgit v1.2.3 From 8180ef8894fa402443205cff1e23417e8d3434df Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 8 Jun 2012 15:16:12 -0400 Subject: Btrfs: keep inode pinned when compressing writes A user reported lots of problems using compression on the new code and it turns out part of the problem was that igrab() was failing when we added a new ordered extent. This is because when writing out an inode under compression we immediately return without actually doing anything to the pages, and then in another thread at some point down the line actually do the ordered dance. 
The problem is between the point that we start writeback and we actually add the ordered extent we could be trying to reclaim the inode, which makes igrab() return NULL. So we need to do an igrab() when we create the async extent and then drop it when we are done with it. This makes sure we stay pinned in memory until the ordered extent can get a reference on it and we are good to go. With this patch we no longer panic in btrfs_finish_ordered_io(). Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b7f398c36cb7..06075043da5d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -986,8 +986,10 @@ static noinline void async_cow_start(struct btrfs_work *work) compress_file_range(async_cow->inode, async_cow->locked_page, async_cow->start, async_cow->end, async_cow, &num_added); - if (num_added == 0) + if (num_added == 0) { + iput(async_cow->inode); async_cow->inode = NULL; + } } /* @@ -1020,6 +1022,8 @@ static noinline void async_cow_free(struct btrfs_work *work) { struct async_cow *async_cow; async_cow = container_of(work, struct async_cow, work); + if (async_cow->inode) + iput(async_cow->inode); kfree(async_cow); } @@ -1038,7 +1042,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, while (start < end) { async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); BUG_ON(!async_cow); /* -ENOMEM */ - async_cow->inode = inode; + async_cow->inode = igrab(inode); async_cow->root = root; async_cow->locked_page = locked_page; async_cow->start = start; -- cgit v1.2.3 From 7ddf5a42d311d74fd9f7373cb56def0843c219f8 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 8 Jun 2012 15:26:47 -0400 Subject: Btrfs: call filemap_fdatawrite twice for compression I removed this in an earlier commit and I was wrong. Because compression can return from filemap_fdatawrite() without having actually set any of it's pages as writeback() it can make filemap_fdatawait() do essentially nothing, and then we won't find any ordered extents because they may not have been created yet. So not only does this make fsync() completely useless, but it will also screw up if you truncate on a non-page aligned offset since we zero out the end and then wait on ordered extents and then call drop caches. We can drop the cache before the io completes and then we try to unpin the extent we just wrote we won't find it and everything goes sideways. So fix this by putting it back and put a giant comment there to keep me from trying to remove it in the future. 
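Condensed, the wait path in btrfs_wait_ordered_range() ends up looking like this (the full hunk, including the comment, is below):

	/* kick off writeback; with compression this can return before any
	 * page is actually marked writeback */
	filemap_fdatawrite_range(inode->i_mapping, start, orig_end);

	/* the second pass blocks on the page locks held by the async
	 * compression worker, so once it returns the pages really are
	 * under writeback */
	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
		     &BTRFS_I(inode)->runtime_flags))
		filemap_fdatawrite_range(inode->i_mapping, start, orig_end);

	filemap_fdatawait_range(inode->i_mapping, start, orig_end);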
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/btrfs_inode.h | 1 + fs/btrfs/inode.c | 15 +++++++++------ fs/btrfs/ordered-data.c | 22 +++++++++++++++++++++- 3 files changed, 31 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index e616f8872e69..12394a90d60f 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -37,6 +37,7 @@ #define BTRFS_INODE_IN_DEFRAG 3 #define BTRFS_INODE_DELALLOC_META_RESERVED 4 #define BTRFS_INODE_HAS_ORPHAN_ITEM 5 +#define BTRFS_INODE_HAS_ASYNC_EXTENT 6 /* in memory btrfs inode */ struct btrfs_inode { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 06075043da5d..7a090fb4eb98 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1398,20 +1398,23 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, int ret; struct btrfs_root *root = BTRFS_I(inode)->root; - if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) { ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 1, nr_written); - else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) + } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) { ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 0, nr_written); - else if (!btrfs_test_opt(root, COMPRESS) && - !(BTRFS_I(inode)->force_compress) && - !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS)) + } else if (!btrfs_test_opt(root, COMPRESS) && + !(BTRFS_I(inode)->force_compress) && + !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS)) { ret = cow_file_range(inode, locked_page, start, end, page_started, nr_written, 1); - else + } else { + set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags); ret = cow_file_range_async(inode, locked_page, start, end, page_started, nr_written); + } return ret; } diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 9e138cdc36c5..643335a4fe3c 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -627,7 +627,27 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) /* start IO across the range first to instantiate any delalloc * extents */ - filemap_write_and_wait_range(inode->i_mapping, start, orig_end); + filemap_fdatawrite_range(inode->i_mapping, start, orig_end); + + /* + * So with compression we will find and lock a dirty page and clear the + * first one as dirty, setup an async extent, and immediately return + * with the entire range locked but with nobody actually marked with + * writeback. So we can't just filemap_write_and_wait_range() and + * expect it to work since it will just kick off a thread to do the + * actual work. So we need to call filemap_fdatawrite_range _again_ + * since it will wait on the page lock, which won't be unlocked until + * after the pages have been marked as writeback and so we're good to go + * from there. We have to do this otherwise we'll miss the ordered + * extents and that results in badness. Please Josef, do not think you + * know better and pull this out at some point in the future, it is + * right and you are wrong. 
+ */ + if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) + filemap_fdatawrite_range(inode->i_mapping, start, orig_end); + + filemap_fdatawait_range(inode->i_mapping, start, orig_end); end = orig_end; found = 0; -- cgit v1.2.3 From 6c282eb40ed6f64a51ff447707714df551d85b8e Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 11 Jun 2012 16:03:35 +0800 Subject: Btrfs: fix defrag regression If a file has 3 small extents: | ext1 | ext2 | ext3 | Running "btrfs fi defrag" will only defrag the last two extents, if those extent mappings hasn't been read into memory from disk. This bug was introduced by commit 17ce6ef8d731af5edac8c39e806db4c7e1f6956f ("Btrfs: add a check to decide if we should defrag the range") The cause is, that commit looked into previous and next extents using lookup_extent_mapping() only. While at it, remove the code that checks the previous extent, since it's sufficient to check the next extent. Signed-off-by: Li Zefan --- fs/btrfs/ioctl.c | 97 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 49 insertions(+), 48 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index c5254dde1f0f..a98f7d252829 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -786,39 +786,57 @@ none: return -ENOENT; } -/* - * Validaty check of prev em and next em: - * 1) no prev/next em - * 2) prev/next em is an hole/inline extent - */ -static int check_adjacent_extents(struct inode *inode, struct extent_map *em) +static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start) { struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; - struct extent_map *prev = NULL, *next = NULL; - int ret = 0; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_map *em; + u64 len = PAGE_CACHE_SIZE; + /* + * hopefully we have this extent in the tree already, try without + * the full extent lock + */ read_lock(&em_tree->lock); - prev = lookup_extent_mapping(em_tree, em->start - 1, (u64)-1); - next = lookup_extent_mapping(em_tree, em->start + em->len, (u64)-1); + em = lookup_extent_mapping(em_tree, start, len); read_unlock(&em_tree->lock); - if ((!prev || prev->block_start >= EXTENT_MAP_LAST_BYTE) && - (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)) - ret = 1; - free_extent_map(prev); - free_extent_map(next); + if (!em) { + /* get the big lock and read metadata off disk */ + lock_extent(io_tree, start, start + len - 1); + em = btrfs_get_extent(inode, NULL, 0, start, len, 0); + unlock_extent(io_tree, start, start + len - 1); + if (IS_ERR(em)) + return NULL; + } + + return em; +} + +static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em) +{ + struct extent_map *next; + bool ret = true; + + /* this is the last extent */ + if (em->start + em->len >= i_size_read(inode)) + return false; + + next = defrag_lookup_extent(inode, em->start + em->len); + if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) + ret = false; + + free_extent_map(next); return ret; } -static int should_defrag_range(struct inode *inode, u64 start, u64 len, - int thresh, u64 *last_len, u64 *skip, - u64 *defrag_end) +static int should_defrag_range(struct inode *inode, u64 start, int thresh, + u64 *last_len, u64 *skip, u64 *defrag_end) { - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct extent_map *em = NULL; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map *em; int ret = 1; + bool next_mergeable = true; /* * make sure that once we start 
defragging an extent, we keep on @@ -829,23 +847,9 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len, *skip = 0; - /* - * hopefully we have this extent in the tree already, try without - * the full extent lock - */ - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, len); - read_unlock(&em_tree->lock); - - if (!em) { - /* get the big lock and read metadata off disk */ - lock_extent(io_tree, start, start + len - 1); - em = btrfs_get_extent(inode, NULL, 0, start, len, 0); - unlock_extent(io_tree, start, start + len - 1); - - if (IS_ERR(em)) - return 0; - } + em = defrag_lookup_extent(inode, start); + if (!em) + return 0; /* this will cover holes, and inline extents */ if (em->block_start >= EXTENT_MAP_LAST_BYTE) { @@ -853,18 +857,15 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len, goto out; } - /* If we have nothing to merge with us, just skip. */ - if (check_adjacent_extents(inode, em)) { - ret = 0; - goto out; - } + next_mergeable = defrag_check_next_extent(inode, em); /* - * we hit a real extent, if it is big don't bother defragging it again + * we hit a real extent, if it is big or the next extent is not a + * real extent, don't bother defragging it */ - if ((*last_len == 0 || *last_len >= thresh) && em->len >= thresh) + if ((*last_len == 0 || *last_len >= thresh) && + (em->len >= thresh || !next_mergeable)) ret = 0; - out: /* * last_len ends up being a counter of how many bytes we've defragged. @@ -1143,8 +1144,8 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, break; if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT, - PAGE_CACHE_SIZE, extent_thresh, - &last_len, &skip, &defrag_end)) { + extent_thresh, &last_len, &skip, + &defrag_end)) { unsigned long next; /* * the should_defrag function tells us how much to skip -- cgit v1.2.3 From 69e380d176e73653e09ce2cf3a5dc09682a69229 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 11 Jun 2012 17:10:51 +0800 Subject: Btrfs: fix incompat flags setting It's a bug, but it happens to work, as BTRFS_COMPRESS_LZO == 2, which has only one bit set. Signed-off-by: Li Zefan --- fs/btrfs/disk-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1c9664b16cd1..9a569aef72ea 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2119,7 +2119,7 @@ int open_ctree(struct super_block *sb, features = btrfs_super_incompat_flags(disk_super); features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; - if (tree_root->fs_info->compress_type & BTRFS_COMPRESS_LZO) + if (tree_root->fs_info->compress_type == BTRFS_COMPRESS_LZO) features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; /* -- cgit v1.2.3 From bc1782374b128103ae9689e0753e0610f35b6bfd Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 14 Jun 2012 02:23:18 -0600 Subject: Btrfs: fix missing inherited flag in rename When we move a file into a directory with compression flag, we need to inherite BTRFS_INODE_COMPRESS and clear BTRFS_INODE_NOCOMPRESS as well. But if we move a file into a directory without compression flag, we need to clear both of them. It is the way how our setflags deals with compression flag, so keep the same behaviour here. 
Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7a090fb4eb98..3f2c8cbe5ba6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7122,10 +7122,13 @@ static void fixup_inode_flags(struct inode *dir, struct inode *inode) else b_inode->flags &= ~BTRFS_INODE_NODATACOW; - if (b_dir->flags & BTRFS_INODE_COMPRESS) + if (b_dir->flags & BTRFS_INODE_COMPRESS) { b_inode->flags |= BTRFS_INODE_COMPRESS; - else - b_inode->flags &= ~BTRFS_INODE_COMPRESS; + b_inode->flags &= ~BTRFS_INODE_NOCOMPRESS; + } else { + b_inode->flags &= ~(BTRFS_INODE_COMPRESS | + BTRFS_INODE_NOCOMPRESS); + } } static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, -- cgit v1.2.3 From 4e42ae1bdcda77fc958a17d7ff4ba5a9c9c207da Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 14 Jun 2012 02:23:19 -0600 Subject: Btrfs: do not resize a seeding device Seeding devices are not supposed to change any more. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a98f7d252829..58adbd0356d6 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1306,6 +1306,13 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, ret = -EINVAL; goto out_free; } + if (device->fs_devices && device->fs_devices->seeding) { + printk(KERN_INFO "btrfs: resizer unable to apply on " + "seeding device %llu\n", devid); + ret = -EINVAL; + goto out_free; + } + if (!strcmp(sizestr, "max")) new_size = device->bdev->bd_inode->i_size; else { -- cgit v1.2.3 From 6e841e32b159e298debbb2cb0e9158e894fcaf05 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 14 Jun 2012 02:23:20 -0600 Subject: Btrfs: avoid memory leak of extent state in error handling routine We've forgotten to clear extent states in pinned tree, which will results in space counter mismatch and memory leak: WARNING: at fs/btrfs/extent-tree.c:7537 btrfs_free_block_groups+0x1f3/0x2e0 [btrfs]() ... space_info 2 has 8380416 free, is not full space_info total=12582912, used=4096, pinned=4096, reserved=0, may_use=0, readonly=4194304 btrfs state leak: start 29364224 end 29376511 state 1 in tree ffff880075f20090 refs 1 ... Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9a569aef72ea..63a36232788b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3601,6 +3601,8 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, btrfs_destroy_marked_extents(root, &cur_trans->dirty_pages, EXTENT_DIRTY); + btrfs_destroy_pinned_extent(root, + root->fs_info->pinned_extents); /* memset(cur_trans, 0, sizeof(*cur_trans)); -- cgit v1.2.3 From ed0eaa14981e87a1e185b61e4ef621c440e3930c Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 14 Jun 2012 02:23:21 -0600 Subject: Btrfs: make sure that we've made everything in pinned tree clean Since we have two trees for recording pinned extents, we need to go through both of them to make sure that we've done everything clean. 
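Condensed, btrfs_destroy_pinned_extent() now makes a second pass over the other tree (illustrative sketch of the hunk below; the existing body that finds and clears each EXTENT_DIRTY range is unchanged and elided here):

	struct extent_io_tree *unpin = root->fs_info->pinned_extents;
	bool loop = true;
again:
	/* existing loop: clear and unpin every EXTENT_DIRTY range in *unpin */

	if (loop) {
		/* pinned_extents only points at one of the two freed_extents
		 * trees, so switch to the other one and go around once more */
		if (unpin == &root->fs_info->freed_extents[0])
			unpin = &root->fs_info->freed_extents[1];
		else
			unpin = &root->fs_info->freed_extents[0];
		loop = false;
		goto again;
	}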
Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 63a36232788b..ffdd76bf05d4 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3557,8 +3557,10 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root, u64 start; u64 end; int ret; + bool loop = true; unpin = pinned_extents; +again: while (1) { ret = find_first_extent_bit(unpin, 0, &start, &end, EXTENT_DIRTY); @@ -3576,6 +3578,15 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root, cond_resched(); } + if (loop) { + if (unpin == &root->fs_info->freed_extents[0]) + unpin = &root->fs_info->freed_extents[1]; + else + unpin = &root->fs_info->freed_extents[0]; + loop = false; + goto again; + } + return 0; } -- cgit v1.2.3 From 67cde3448d951b55088a6ea3bb1aee0160068fb9 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 14 Jun 2012 02:23:22 -0600 Subject: Btrfs: destroy the items of the delayed inodes in error handling routine the items of the delayed inodes were forgotten to be freed, this patch fixes it. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/delayed-inode.c | 18 ++++++++++++++++++ fs/btrfs/delayed-inode.h | 3 +++ fs/btrfs/disk-io.c | 6 ++++++ 3 files changed, 27 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index c18d0442ae6d..2399f4086915 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1879,3 +1879,21 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root) } } } + +void btrfs_destroy_delayed_inodes(struct btrfs_root *root) +{ + struct btrfs_delayed_root *delayed_root; + struct btrfs_delayed_node *curr_node, *prev_node; + + delayed_root = btrfs_get_delayed_root(root); + + curr_node = btrfs_first_delayed_node(delayed_root); + while (curr_node) { + __btrfs_kill_delayed_node(curr_node); + + prev_node = curr_node; + curr_node = btrfs_next_delayed_node(curr_node); + btrfs_release_delayed_node(prev_node); + } +} + diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 7083d08b2a21..f5aa4023d3e1 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -124,6 +124,9 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev); /* Used for drop dead root */ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root); +/* Used for clean the transaction */ +void btrfs_destroy_delayed_inodes(struct btrfs_root *root); + /* Used for readdir() */ void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list, struct list_head *del_list); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ffdd76bf05d4..e22c5bbf0223 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3608,6 +3608,9 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, cur_trans->commit_done = 1; wake_up(&cur_trans->commit_wait); + btrfs_destroy_delayed_inodes(root); + btrfs_assert_delayed_root_empty(root); + btrfs_destroy_pending_snapshots(cur_trans); btrfs_destroy_marked_extents(root, &cur_trans->dirty_pages, @@ -3662,6 +3665,9 @@ int btrfs_cleanup_transaction(struct btrfs_root *root) if (waitqueue_active(&t->commit_wait)) wake_up(&t->commit_wait); + btrfs_destroy_delayed_inodes(root); + btrfs_assert_delayed_root_empty(root); + btrfs_destroy_pending_snapshots(t); btrfs_destroy_delalloc_inodes(root); -- cgit v1.2.3 From e2ae715d66bf4becfb85eb84b7150e23cf27df30 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Fri, 15 Jun 2012 14:07:51 
+0200 Subject: kmsg - kmsg_dump() use iterator to receive log buffer content Provide an iterator to receive the log buffer content, and convert all kmsg_dump() users to it. The structured data in the kmsg buffer now contains binary data, which should no longer be copied verbatim to the kmsg_dump() users. The iterator should provide reliable access to the buffer data, and also supports proper log line-aware chunking of data while iterating. Signed-off-by: Kay Sievers Tested-by: Tony Luck Reported-by: Anton Vorontsov Tested-by: Anton Vorontsov Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/platforms/pseries/nvram.c | 61 +------- arch/x86/platform/mrst/early_printk_mrst.c | 13 +- drivers/mtd/mtdoops.c | 22 +-- fs/pstore/platform.c | 34 ++--- include/linux/kmsg_dump.h | 45 +++++- kernel/printk.c | 220 +++++++++++++++++++++++++---- 6 files changed, 258 insertions(+), 137 deletions(-) (limited to 'fs') diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c index 36f957f31842..8733a86ad52e 100644 --- a/arch/powerpc/platforms/pseries/nvram.c +++ b/arch/powerpc/platforms/pseries/nvram.c @@ -68,9 +68,7 @@ static const char *pseries_nvram_os_partitions[] = { }; static void oops_to_nvram(struct kmsg_dumper *dumper, - enum kmsg_dump_reason reason, - const char *old_msgs, unsigned long old_len, - const char *new_msgs, unsigned long new_len); + enum kmsg_dump_reason reason); static struct kmsg_dumper nvram_kmsg_dumper = { .dump = oops_to_nvram @@ -503,28 +501,6 @@ int __init pSeries_nvram_init(void) return 0; } -/* - * Try to capture the last capture_len bytes of the printk buffer. Return - * the amount actually captured. - */ -static size_t capture_last_msgs(const char *old_msgs, size_t old_len, - const char *new_msgs, size_t new_len, - char *captured, size_t capture_len) -{ - if (new_len >= capture_len) { - memcpy(captured, new_msgs + (new_len - capture_len), - capture_len); - return capture_len; - } else { - /* Grab the end of old_msgs. */ - size_t old_tail_len = min(old_len, capture_len - new_len); - memcpy(captured, old_msgs + (old_len - old_tail_len), - old_tail_len); - memcpy(captured + old_tail_len, new_msgs, new_len); - return old_tail_len + new_len; - } -} - /* * Are we using the ibm,rtas-log for oops/panic reports? And if so, * would logging this oops/panic overwrite an RTAS event that rtas_errd @@ -541,27 +517,6 @@ static int clobbering_unread_rtas_event(void) NVRAM_RTAS_READ_TIMEOUT); } -/* Squeeze out each line's severity prefix. */ -static size_t elide_severities(char *buf, size_t len) -{ - char *in, *out, *buf_end = buf + len; - /* Assume a at the very beginning marks the start of a line. */ - int newline = 1; - - in = out = buf; - while (in < buf_end) { - if (newline && in+3 <= buf_end && - *in == '<' && isdigit(in[1]) && in[2] == '>') { - in += 3; - newline = 0; - } else { - newline = (*in == '\n'); - *out++ = *in++; - } - } - return out - buf; -} - /* Derived from logfs_compress() */ static int nvram_compress(const void *in, void *out, size_t inlen, size_t outlen) @@ -619,9 +574,7 @@ static int zip_oops(size_t text_len) * partition. If that's too much, go back and capture uncompressed text. 
*/ static void oops_to_nvram(struct kmsg_dumper *dumper, - enum kmsg_dump_reason reason, - const char *old_msgs, unsigned long old_len, - const char *new_msgs, unsigned long new_len) + enum kmsg_dump_reason reason) { static unsigned int oops_count = 0; static bool panicking = false; @@ -660,14 +613,14 @@ static void oops_to_nvram(struct kmsg_dumper *dumper, return; if (big_oops_buf) { - text_len = capture_last_msgs(old_msgs, old_len, - new_msgs, new_len, big_oops_buf, big_oops_buf_sz); - text_len = elide_severities(big_oops_buf, text_len); + kmsg_dump_get_buffer(dumper, false, + big_oops_buf, big_oops_buf_sz, &text_len); rc = zip_oops(text_len); } if (rc != 0) { - text_len = capture_last_msgs(old_msgs, old_len, - new_msgs, new_len, oops_data, oops_data_sz); + kmsg_dump_rewind(dumper); + kmsg_dump_get_buffer(dumper, true, + oops_data, oops_data_sz, &text_len); err_type = ERR_TYPE_KERNEL_PANIC; *oops_len = (u16) text_len; } diff --git a/arch/x86/platform/mrst/early_printk_mrst.c b/arch/x86/platform/mrst/early_printk_mrst.c index 3c6e328483c7..028454f0c3a5 100644 --- a/arch/x86/platform/mrst/early_printk_mrst.c +++ b/arch/x86/platform/mrst/early_printk_mrst.c @@ -110,19 +110,16 @@ static struct kmsg_dumper dw_dumper; static int dumper_registered; static void dw_kmsg_dump(struct kmsg_dumper *dumper, - enum kmsg_dump_reason reason, - const char *s1, unsigned long l1, - const char *s2, unsigned long l2) + enum kmsg_dump_reason reason) { - int i; + static char line[1024]; + size_t len; /* When run to this, we'd better re-init the HW */ mrst_early_console_init(); - for (i = 0; i < l1; i++) - early_mrst_console.write(&early_mrst_console, s1 + i, 1); - for (i = 0; i < l2; i++) - early_mrst_console.write(&early_mrst_console, s2 + i, 1); + while (kmsg_dump_get_line(dumper, true, line, sizeof(line), &len)) + early_mrst_console.write(&early_mrst_console, line, len); } /* Set the ratio rate to 115200, 8n1, IRQ disabled */ diff --git a/drivers/mtd/mtdoops.c b/drivers/mtd/mtdoops.c index ae36d7e1e913..551e316e4454 100644 --- a/drivers/mtd/mtdoops.c +++ b/drivers/mtd/mtdoops.c @@ -304,32 +304,17 @@ static void find_next_position(struct mtdoops_context *cxt) } static void mtdoops_do_dump(struct kmsg_dumper *dumper, - enum kmsg_dump_reason reason, const char *s1, unsigned long l1, - const char *s2, unsigned long l2) + enum kmsg_dump_reason reason) { struct mtdoops_context *cxt = container_of(dumper, struct mtdoops_context, dump); - unsigned long s1_start, s2_start; - unsigned long l1_cpy, l2_cpy; - char *dst; - - if (reason != KMSG_DUMP_OOPS && - reason != KMSG_DUMP_PANIC) - return; /* Only dump oopses if dump_oops is set */ if (reason == KMSG_DUMP_OOPS && !dump_oops) return; - dst = cxt->oops_buf + MTDOOPS_HEADER_SIZE; /* Skip the header */ - l2_cpy = min(l2, record_size - MTDOOPS_HEADER_SIZE); - l1_cpy = min(l1, record_size - MTDOOPS_HEADER_SIZE - l2_cpy); - - s2_start = l2 - l2_cpy; - s1_start = l1 - l1_cpy; - - memcpy(dst, s1 + s1_start, l1_cpy); - memcpy(dst + l1_cpy, s2 + s2_start, l2_cpy); + kmsg_dump_get_buffer(dumper, true, cxt->oops_buf + MTDOOPS_HEADER_SIZE, + record_size - MTDOOPS_HEADER_SIZE, NULL); /* Panics must be written immediately */ if (reason != KMSG_DUMP_OOPS) @@ -375,6 +360,7 @@ static void mtdoops_notify_add(struct mtd_info *mtd) return; } + cxt->dump.max_reason = KMSG_DUMP_OOPS; cxt->dump.dump = mtdoops_do_dump; err = kmsg_dump_register(&cxt->dump); if (err) { diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 82c585f715e3..03ce7a9b81cc 100644 --- a/fs/pstore/platform.c 
+++ b/fs/pstore/platform.c @@ -94,20 +94,15 @@ static const char *get_reason_str(enum kmsg_dump_reason reason) * as we can from the end of the buffer. */ static void pstore_dump(struct kmsg_dumper *dumper, - enum kmsg_dump_reason reason, - const char *s1, unsigned long l1, - const char *s2, unsigned long l2) + enum kmsg_dump_reason reason) { - unsigned long s1_start, s2_start; - unsigned long l1_cpy, l2_cpy; - unsigned long size, total = 0; - char *dst; + unsigned long total = 0; const char *why; u64 id; - int hsize, ret; unsigned int part = 1; unsigned long flags = 0; int is_locked = 0; + int ret; why = get_reason_str(reason); @@ -119,30 +114,25 @@ static void pstore_dump(struct kmsg_dumper *dumper, spin_lock_irqsave(&psinfo->buf_lock, flags); oopscount++; while (total < kmsg_bytes) { + char *dst; + unsigned long size; + int hsize; + size_t len; + dst = psinfo->buf; hsize = sprintf(dst, "%s#%d Part%d\n", why, oopscount, part); size = psinfo->bufsize - hsize; dst += hsize; - l2_cpy = min(l2, size); - l1_cpy = min(l1, size - l2_cpy); - - if (l1_cpy + l2_cpy == 0) + if (!kmsg_dump_get_buffer(dumper, true, dst, size, &len)) break; - s2_start = l2 - l2_cpy; - s1_start = l1 - l1_cpy; - - memcpy(dst, s1 + s1_start, l1_cpy); - memcpy(dst + l1_cpy, s2 + s2_start, l2_cpy); - ret = psinfo->write(PSTORE_TYPE_DMESG, reason, &id, part, - hsize + l1_cpy + l2_cpy, psinfo); + hsize + len, psinfo); if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted()) pstore_new_entry = 1; - l1 -= l1_cpy; - l2 -= l2_cpy; - total += l1_cpy + l2_cpy; + + total += hsize + len; part++; } if (in_nmi()) { diff --git a/include/linux/kmsg_dump.h b/include/linux/kmsg_dump.h index 35f7237ec972..af4eb5a39d9a 100644 --- a/include/linux/kmsg_dump.h +++ b/include/linux/kmsg_dump.h @@ -21,6 +21,7 @@ * is passed to the kernel. */ enum kmsg_dump_reason { + KMSG_DUMP_UNDEF, KMSG_DUMP_PANIC, KMSG_DUMP_OOPS, KMSG_DUMP_EMERG, @@ -31,23 +32,37 @@ enum kmsg_dump_reason { /** * struct kmsg_dumper - kernel crash message dumper structure - * @dump: The callback which gets called on crashes. The buffer is passed - * as two sections, where s1 (length l1) contains the older - * messages and s2 (length l2) contains the newer. 
* @list: Entry in the dumper list (private) + * @dump: Call into dumping code which will retrieve the data with + * through the record iterator + * @max_reason: filter for highest reason number that should be dumped * @registered: Flag that specifies if this is already registered */ struct kmsg_dumper { - void (*dump)(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason, - const char *s1, unsigned long l1, - const char *s2, unsigned long l2); struct list_head list; - int registered; + void (*dump)(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason); + enum kmsg_dump_reason max_reason; + bool active; + bool registered; + + /* private state of the kmsg iterator */ + u32 cur_idx; + u32 next_idx; + u64 cur_seq; + u64 next_seq; }; #ifdef CONFIG_PRINTK void kmsg_dump(enum kmsg_dump_reason reason); +bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, + char *line, size_t size, size_t *len); + +bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, + char *buf, size_t size, size_t *len); + +void kmsg_dump_rewind(struct kmsg_dumper *dumper); + int kmsg_dump_register(struct kmsg_dumper *dumper); int kmsg_dump_unregister(struct kmsg_dumper *dumper); @@ -56,6 +71,22 @@ static inline void kmsg_dump(enum kmsg_dump_reason reason) { } +bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, + const char *line, size_t size, size_t *len) +{ + return false; +} + +bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, + char *buf, size_t size, size_t *len) +{ + return false; +} + +void kmsg_dump_rewind(struct kmsg_dumper *dumper) +{ +} + static inline int kmsg_dump_register(struct kmsg_dumper *dumper) { return -EINVAL; diff --git a/kernel/printk.c b/kernel/printk.c index f205c25c37e2..ceb4a2f775a1 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -909,7 +909,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) /* * Find first record that fits, including all following records, * into the user-provided buffer for this dump. - */ + */ seq = clear_seq; idx = clear_idx; while (seq < log_next_seq) { @@ -919,6 +919,8 @@ static int syslog_print_all(char __user *buf, int size, bool clear) idx = log_next(idx); seq++; } + + /* move first record forward until length fits into the buffer */ seq = clear_seq; idx = clear_idx; while (len > size && seq < log_next_seq) { @@ -929,7 +931,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) seq++; } - /* last message in this dump */ + /* last message fitting into this dump */ next_seq = log_next_seq; len = 0; @@ -2300,48 +2302,210 @@ module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR); * kmsg_dump - dump kernel log to kernel message dumpers. * @reason: the reason (oops, panic etc) for dumping * - * Iterate through each of the dump devices and call the oops/panic - * callbacks with the log buffer. + * Call each of the registered dumper's dump() callback, which can + * retrieve the kmsg records with kmsg_dump_get_line() or + * kmsg_dump_get_buffer(). */ void kmsg_dump(enum kmsg_dump_reason reason) { - u64 idx; struct kmsg_dumper *dumper; - const char *s1, *s2; - unsigned long l1, l2; unsigned long flags; if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump) return; - /* Theoretically, the log could move on after we do this, but - there's not a lot we can do about that. The new messages - will overwrite the start of what we dump. 
*/ + rcu_read_lock(); + list_for_each_entry_rcu(dumper, &dump_list, list) { + if (dumper->max_reason && reason > dumper->max_reason) + continue; + + /* initialize iterator with data about the stored records */ + dumper->active = true; + + raw_spin_lock_irqsave(&logbuf_lock, flags); + dumper->cur_seq = clear_seq; + dumper->cur_idx = clear_idx; + dumper->next_seq = log_next_seq; + dumper->next_idx = log_next_idx; + raw_spin_unlock_irqrestore(&logbuf_lock, flags); + + /* invoke dumper which will iterate over records */ + dumper->dump(dumper, reason); + + /* reset iterator */ + dumper->active = false; + } + rcu_read_unlock(); +} + +/** + * kmsg_dump_get_line - retrieve one kmsg log line + * @dumper: registered kmsg dumper + * @syslog: include the "<4>" prefixes + * @line: buffer to copy the line to + * @size: maximum size of the buffer + * @len: length of line placed into buffer + * + * Start at the beginning of the kmsg buffer, with the oldest kmsg + * record, and copy one record into the provided buffer. + * + * Consecutive calls will return the next available record moving + * towards the end of the buffer with the youngest messages. + * + * A return value of FALSE indicates that there are no more records to + * read. + */ +bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, + char *line, size_t size, size_t *len) +{ + unsigned long flags; + struct log *msg; + size_t l = 0; + bool ret = false; + + if (!dumper->active) + goto out; raw_spin_lock_irqsave(&logbuf_lock, flags); - if (syslog_seq < log_first_seq) - idx = syslog_idx; - else - idx = log_first_idx; + if (dumper->cur_seq < log_first_seq) { + /* messages are gone, move to first available one */ + dumper->cur_seq = log_first_seq; + dumper->cur_idx = log_first_idx; + } - if (idx > log_next_idx) { - s1 = log_buf; - l1 = log_next_idx; + /* last entry */ + if (dumper->cur_seq >= log_next_seq) { + raw_spin_unlock_irqrestore(&logbuf_lock, flags); + goto out; + } - s2 = log_buf + idx; - l2 = log_buf_len - idx; - } else { - s1 = ""; - l1 = 0; + msg = log_from_idx(dumper->cur_idx); + l = msg_print_text(msg, syslog, + line, size); + + dumper->cur_idx = log_next(dumper->cur_idx); + dumper->cur_seq++; + ret = true; + raw_spin_unlock_irqrestore(&logbuf_lock, flags); +out: + if (len) + *len = l; + return ret; +} +EXPORT_SYMBOL_GPL(kmsg_dump_get_line); + +/** + * kmsg_dump_get_buffer - copy kmsg log lines + * @dumper: registered kmsg dumper + * @syslog: include the "<4>" prefixes + * @line: buffer to copy the line to + * @size: maximum size of the buffer + * @len: length of line placed into buffer + * + * Start at the end of the kmsg buffer and fill the provided buffer + * with as many of the the *youngest* kmsg records that fit into it. + * If the buffer is large enough, all available kmsg records will be + * copied with a single call. + * + * Consecutive calls will fill the buffer with the next block of + * available older records, not including the earlier retrieved ones. + * + * A return value of FALSE indicates that there are no more records to + * read. 
+ */ +bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, + char *buf, size_t size, size_t *len) +{ + unsigned long flags; + u64 seq; + u32 idx; + u64 next_seq; + u32 next_idx; + size_t l = 0; + bool ret = false; + + if (!dumper->active) + goto out; + + raw_spin_lock_irqsave(&logbuf_lock, flags); + if (dumper->cur_seq < log_first_seq) { + /* messages are gone, move to first available one */ + dumper->cur_seq = log_first_seq; + dumper->cur_idx = log_first_idx; + } + + /* last entry */ + if (dumper->cur_seq >= dumper->next_seq) { + raw_spin_unlock_irqrestore(&logbuf_lock, flags); + goto out; + } + + /* calculate length of entire buffer */ + seq = dumper->cur_seq; + idx = dumper->cur_idx; + while (seq < dumper->next_seq) { + struct log *msg = log_from_idx(idx); + + l += msg_print_text(msg, true, NULL, 0); + idx = log_next(idx); + seq++; + } + + /* move first record forward until length fits into the buffer */ + seq = dumper->cur_seq; + idx = dumper->cur_idx; + while (l > size && seq < dumper->next_seq) { + struct log *msg = log_from_idx(idx); - s2 = log_buf + idx; - l2 = log_next_idx - idx; + l -= msg_print_text(msg, true, NULL, 0); + idx = log_next(idx); + seq++; } + + /* last message in next interation */ + next_seq = seq; + next_idx = idx; + + l = 0; + while (seq < dumper->next_seq) { + struct log *msg = log_from_idx(idx); + + l += msg_print_text(msg, syslog, + buf + l, size - l); + + idx = log_next(idx); + seq++; + } + + dumper->next_seq = next_seq; + dumper->next_idx = next_idx; + ret = true; raw_spin_unlock_irqrestore(&logbuf_lock, flags); +out: + if (len) + *len = l; + return ret; +} +EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); - rcu_read_lock(); - list_for_each_entry_rcu(dumper, &dump_list, list) - dumper->dump(dumper, reason, s1, l1, s2, l2); - rcu_read_unlock(); +/** + * kmsg_dump_rewind - reset the interator + * @dumper: registered kmsg dumper + * + * Reset the dumper's iterator so that kmsg_dump_get_line() and + * kmsg_dump_get_buffer() can be called again and used multiple + * times within the same dumper.dump() callback. + */ +void kmsg_dump_rewind(struct kmsg_dumper *dumper) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&logbuf_lock, flags); + dumper->cur_seq = clear_seq; + dumper->cur_idx = clear_idx; + dumper->next_seq = log_next_seq; + dumper->next_idx = log_next_idx; + raw_spin_unlock_irqrestore(&logbuf_lock, flags); } +EXPORT_SYMBOL_GPL(kmsg_dump_rewind); #endif -- cgit v1.2.3 From 4325edd0786fdd3909f9550e52a963b2fe54f78b Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 15 Jun 2012 20:02:02 -0400 Subject: Btrfs: init old_generation in get_old_root gcc was giving an uninit variable warning here. Strictly speaking we don't need to init it, but this will make things much less error prone. 
Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 04b06bcf2ca9..15cbc2bf4ff0 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1175,7 +1175,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq) struct tree_mod_elem *tm; struct extent_buffer *eb; struct tree_mod_root *old_root = NULL; - u64 old_generation; + u64 old_generation = 0; u64 logical; eb = btrfs_read_lock_root_node(root); -- cgit v1.2.3 From a8c4a33b98b097e69cd1a10672c43d17ad0ffb25 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 15 Jun 2012 20:07:17 -0400 Subject: Btrfs: cast devid to unsigned long long for printk %llu Avoid warning in 32 bit machines Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 58adbd0356d6..0e92e5763005 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1308,7 +1308,8 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, } if (device->fs_devices && device->fs_devices->seeding) { printk(KERN_INFO "btrfs: resizer unable to apply on " - "seeding device %llu\n", devid); + "seeding device %llu\n", + (unsigned long long)devid); ret = -EINVAL; goto out_free; } -- cgit v1.2.3 From a6dc8c04218eb752ff79cdc24a995cf51866caed Mon Sep 17 00:00:00 2001 From: Janne Kalliomäki Date: Sun, 17 Jun 2012 17:05:24 -0400 Subject: hfsplus: fix overflow in sector calculations in hfsplus_submit_bio MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The variable io_size was unsigned int, which caused the wrong sector number to be calculated after aligning it. This then caused mount to fail with big volumes, as backup volume header information was searched from a wrong sector. Signed-off-by: Janne Kalliomäki Signed-off-by: Christoph Hellwig Signed-off-by: Linus Torvalds --- fs/hfsplus/wrapper.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index 7daf4b852d1c..90effcccca9a 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -56,7 +56,7 @@ int hfsplus_submit_bio(struct super_block *sb, sector_t sector, DECLARE_COMPLETION_ONSTACK(wait); struct bio *bio; int ret = 0; - unsigned int io_size; + u64 io_size; loff_t start; int offset; -- cgit v1.2.3 From 7dea9665fee828fb56db3bae5b9685d9fa006d33 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Sun, 17 Jun 2012 17:05:25 -0400 Subject: hfsplus: fix bless ioctl when used with hardlinks HFS+ doesn't really implement hard links - instead, hardlinks are indicated by a magic file type which refers to an indirect node in a hidden directory. The spec indicates that stat() should return the inode number of the indirect node, but it turns out that this doesn't satisfy the firmware when it's looking for a bootloader - it wants the catalog ID of the hardlink file instead. Fix up this case. 
Signed-off-by: Matthew Garrett Signed-off-by: Christoph Hellwig Signed-off-by: Linus Torvalds --- fs/hfsplus/ioctl.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c index c640ba57074b..09addc8615fa 100644 --- a/fs/hfsplus/ioctl.c +++ b/fs/hfsplus/ioctl.c @@ -31,6 +31,7 @@ static int hfsplus_ioctl_bless(struct file *file, int __user *user_flags) struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb); struct hfsplus_vh *vh = sbi->s_vhdr; struct hfsplus_vh *bvh = sbi->s_backup_vhdr; + u32 cnid = (unsigned long)dentry->d_fsdata; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -41,8 +42,12 @@ static int hfsplus_ioctl_bless(struct file *file, int __user *user_flags) vh->finder_info[0] = bvh->finder_info[0] = cpu_to_be32(parent_ino(dentry)); - /* Bootloader */ - vh->finder_info[1] = bvh->finder_info[1] = cpu_to_be32(inode->i_ino); + /* + * Bootloader. Just using the inode here breaks in the case of + * hard links - the firmware wants the ID of the hard link file, + * but the inode points at the indirect inode + */ + vh->finder_info[1] = bvh->finder_info[1] = cpu_to_be32(cnid); /* Per spec, the OS X system folder - same as finder_info[0] here */ vh->finder_info[5] = bvh->finder_info[5] = -- cgit v1.2.3 From 1cfb7271076a265b7c2cbbf9cf5c2ae060a24385 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 9 Jun 2012 12:08:23 +0300 Subject: UBIFS: fix assertion The asserts here never check anything because it uses '|' instead of '&'. Now if the flags are not set it prints a warning a a stack trace. Signed-off-by: Dan Carpenter Signed-off-by: Artem Bityutskiy --- fs/ubifs/find.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c index 2559d174e004..28ec13af28d9 100644 --- a/fs/ubifs/find.c +++ b/fs/ubifs/find.c @@ -939,8 +939,8 @@ static int find_dirtiest_idx_leb(struct ubifs_info *c) } dbg_find("LEB %d, dirty %d and free %d flags %#x", lp->lnum, lp->dirty, lp->free, lp->flags); - ubifs_assert(lp->flags | LPROPS_TAKEN); - ubifs_assert(lp->flags | LPROPS_INDEX); + ubifs_assert(lp->flags & LPROPS_TAKEN); + ubifs_assert(lp->flags & LPROPS_INDEX); return lnum; } -- cgit v1.2.3 From 2a4c8994eeef50796015f8a2005e4a75c1929166 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 14 Jun 2012 13:08:38 -0400 Subject: NFSv4.1: Fix umount when filelayout DS is also the MDS Currently there is a 'chicken and egg' issue when the DS is also the mounted MDS. The nfs_match_client() reference from nfs4_set_ds_client bumps the cl_count, the nfs_client is not freed at umount, and nfs4_deviceid_purge_client is not called to dereference the MDS usage of a deviceid which holds a reference to the DS nfs_client. The result is the umount program returns, but the nfs_client is not freed, and the cl_session hearbeat continues. The MDS (and all other nfs mounts) lose their last nfs_client reference in nfs_free_server when the last nfs_server (fsid) is umounted. The file layout DS lose their last nfs_client reference in destroy_ds when the last deviceid referencing the data server is put and destroy_ds is called. This is triggered by a call to nfs4_deviceid_purge_client which removes references to a pNFS deviceid used by an MDS mount. The fix is to track how many pnfs enabled filesystems are mounted from this server, and then to purge the device id cache once that count reaches zero. 
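The shape of the fix is the familiar "count the users, purge on the last put" idiom, roughly as sketched here (the struct and helper names are simplified placeholders; the real change is in the diff that follows):

#include <linux/atomic.h>

struct example_client {
	atomic_t	cl_mds_count;	/* pNFS-enabled mounts using this nfs_client */
};

/* set_pnfs_layoutdriver(): one more pNFS mount depends on the deviceid cache */
static void example_mds_get(struct example_client *clp)
{
	atomic_inc(&clp->cl_mds_count);
}

/* unset_pnfs_layoutdriver(): when the last pNFS mount goes away, so can the cache */
static void example_mds_put(struct example_client *clp,
			    void (*purge)(struct example_client *))
{
	if (atomic_dec_and_test(&clp->cl_mds_count))
		purge(clp);
}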
Reported-by: Jorge Mora Reported-by: Andy Adamson Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 1 - fs/nfs/pnfs.c | 5 +++++ include/linux/nfs_fs_sb.h | 1 + 3 files changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 17ba6b995659..f005b5bebdc7 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -207,7 +207,6 @@ error_0: static void nfs4_shutdown_session(struct nfs_client *clp) { if (nfs4_has_session(clp)) { - nfs4_deviceid_purge_client(clp); nfs4_destroy_session(clp->cl_session); nfs4_destroy_clientid(clp); } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index b8323aa7b543..bdf7e52943c8 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -80,6 +80,9 @@ unset_pnfs_layoutdriver(struct nfs_server *nfss) if (nfss->pnfs_curr_ld) { if (nfss->pnfs_curr_ld->clear_layoutdriver) nfss->pnfs_curr_ld->clear_layoutdriver(nfss); + /* Decrement the MDS count. Purge the deviceid cache if zero */ + if (atomic_dec_and_test(&nfss->nfs_client->cl_mds_count)) + nfs4_deviceid_purge_client(nfss->nfs_client); module_put(nfss->pnfs_curr_ld->owner); } nfss->pnfs_curr_ld = NULL; @@ -127,6 +130,8 @@ set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh, module_put(ld_type->owner); goto out_no_driver; } + /* Bump the MDS count */ + atomic_inc(&server->nfs_client->cl_mds_count); dprintk("%s: pNFS module for %u set\n", __func__, id); return; diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index fbb78fb09bd2..f58325a1d8fb 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -25,6 +25,7 @@ struct nfs41_impl_id; */ struct nfs_client { atomic_t cl_count; + atomic_t cl_mds_count; int cl_cons_state; /* current construction state (-ve: init error) */ #define NFS_CS_READY 0 /* ready to be used */ #define NFS_CS_INITING 1 /* busy initialising */ -- cgit v1.2.3 From 0a9c63fae7df086ff5e107273c3cce8642430974 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 15 Jun 2012 13:02:58 -0400 Subject: NFSv4.1: Fix a race in set_pnfs_layoutdriver The call to try_module_get() dereferences ld_type outside the spin locks, which means that it may be pointing to garbage if a module unload was in progress. Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index bdf7e52943c8..bbc49caa7a82 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -70,6 +70,10 @@ find_pnfs_driver(u32 id) spin_lock(&pnfs_spinlock); local = find_pnfs_driver_locked(id); + if (local != NULL && !try_module_get(local->owner)) { + dprintk("%s: Could not grab reference on module\n", __func__); + local = NULL; + } spin_unlock(&pnfs_spinlock); return local; } @@ -118,10 +122,6 @@ set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh, goto out_no_driver; } } - if (!try_module_get(ld_type->owner)) { - dprintk("%s: Could not grab reference on module\n", __func__); - goto out_no_driver; - } server->pnfs_curr_ld = ld_type; if (ld_type->set_layoutdriver && ld_type->set_layoutdriver(server, mntfh)) { -- cgit v1.2.3 From 5a695da26367f53a368c90435e0b883c12f02791 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 19 Jun 2012 13:39:14 -0400 Subject: NFS: Fix a refcounting issue in O_DIRECT In nfs_direct_write_reschedule(), the requests from nfs_scan_commit_list have a refcount of 2, whereas the operations in nfs_direct_write_completion_ops expect them to have a refcount of 1. 
This patch adds a call to release the extra references. Signed-off-by: Trond Myklebust Cc: Fred Isaman --- fs/nfs/direct.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 3168f6e3d4d4..9a4cbfc85d81 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -490,6 +490,7 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) dreq->error = -EIO; spin_unlock(cinfo.lock); } + nfs_release_request(req); } nfs_pageio_complete(&desc); -- cgit v1.2.3 From 1a0de48ae56b5cdb9a46b3d3a0b578dd7f787f22 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 19 Jun 2012 18:38:56 -0400 Subject: NFS: Initialise commit_info.rpc_out when !defined(CONFIG_NFS_V4) Signed-off-by: Trond Myklebust Cc: Fred Isaman --- fs/nfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index e605d695dbcb..f7296983eba6 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1530,7 +1530,6 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi) nfsi->delegation_state = 0; init_rwsem(&nfsi->rwsem); nfsi->layout = NULL; - atomic_set(&nfsi->commit_info.rpcs_out, 0); #endif } @@ -1545,6 +1544,7 @@ static void init_once(void *foo) INIT_LIST_HEAD(&nfsi->commit_info.list); nfsi->npages = 0; nfsi->commit_info.ncommit = 0; + atomic_set(&nfsi->commit_info.rpcs_out, 0); atomic_set(&nfsi->silly_count, 1); INIT_HLIST_HEAD(&nfsi->silly_list); init_waitqueue_head(&nfsi->waitqueue); -- cgit v1.2.3 From 61600ef8483924039dcdec8b4717ca32bd3353c6 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 28 May 2012 14:44:30 +0800 Subject: ceph: check PG_Private flag before accessing page->private I got lots of NULL pointer dereference Oops when compiling kernel on ceph. The bug is because the kernel page migration routine replaces some pages in the page cache with new pages, these new pages' private can be non-zero. Signed-off-by: Zheng Yan Signed-off-by: Sage Weil (cherry picked from commit 28c0254ede13ab575d2df5c6585ed3d4817c3e6b) --- fs/ceph/addr.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 173b1d22e59b..8b67304e4b80 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -54,7 +54,12 @@ (CONGESTION_ON_THRESH(congestion_kb) - \ (CONGESTION_ON_THRESH(congestion_kb) >> 2)) - +static inline struct ceph_snap_context *page_snap_context(struct page *page) +{ + if (PagePrivate(page)) + return (void *)page->private; + return NULL; +} /* * Dirty a page. Optimistically adjust accounting, on the assumption @@ -142,10 +147,9 @@ static void ceph_invalidatepage(struct page *page, unsigned long offset) { struct inode *inode; struct ceph_inode_info *ci; - struct ceph_snap_context *snapc = (void *)page->private; + struct ceph_snap_context *snapc = page_snap_context(page); BUG_ON(!PageLocked(page)); - BUG_ON(!page->private); BUG_ON(!PagePrivate(page)); BUG_ON(!page->mapping); @@ -182,7 +186,6 @@ static int ceph_releasepage(struct page *page, gfp_t g) struct inode *inode = page->mapping ? 
page->mapping->host : NULL; dout("%p releasepage %p idx %lu\n", inode, page, page->index); WARN_ON(PageDirty(page)); - WARN_ON(page->private); WARN_ON(PagePrivate(page)); return 0; } @@ -443,7 +446,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) osdc = &fsc->client->osdc; /* verify this is a writeable snap context */ - snapc = (void *)page->private; + snapc = page_snap_context(page); if (snapc == NULL) { dout("writepage %p page %p not dirty?\n", inode, page); goto out; @@ -451,7 +454,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) oldest = get_oldest_context(inode, &snap_size); if (snapc->seq > oldest->seq) { dout("writepage %p page %p snapc %p not writeable - noop\n", - inode, page, (void *)page->private); + inode, page, snapc); /* we should only noop if called by kswapd */ WARN_ON((current->flags & PF_MEMALLOC) == 0); ceph_put_snap_context(oldest); @@ -591,7 +594,7 @@ static void writepages_finish(struct ceph_osd_request *req, clear_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC); - ceph_put_snap_context((void *)page->private); + ceph_put_snap_context(page_snap_context(page)); page->private = 0; ClearPagePrivate(page); dout("unlocking %d %p\n", i, page); @@ -795,7 +798,7 @@ get_more_pages: } /* only if matching snap context */ - pgsnapc = (void *)page->private; + pgsnapc = page_snap_context(page); if (pgsnapc->seq > snapc->seq) { dout("page snapc %p %lld > oldest %p %lld\n", pgsnapc, pgsnapc->seq, snapc, snapc->seq); @@ -984,7 +987,7 @@ retry_locked: BUG_ON(!ci->i_snap_realm); down_read(&mdsc->snap_rwsem); BUG_ON(!ci->i_snap_realm->cached_context); - snapc = (void *)page->private; + snapc = page_snap_context(page); if (snapc && snapc != ci->i_head_snapc) { /* * this page is already dirty in another (older) snap -- cgit v1.2.3 From b1027439dff844675f6c0df97a1b1d190791a699 Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Wed, 20 Jun 2012 14:35:28 -0400 Subject: NFS: Force the legacy idmapper to be single threaded It was initially coded under the assumption that there would only be one request at a time, so use a lock to enforce this requirement.. 
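The enforcement itself is nothing more than a mutex held across the legacy upcall; schematically (a simplified sketch with made-up names, the actual hunks follow below):

#include <linux/mutex.h>

struct example_idmap {
	struct mutex	upcall_mutex;	/* only one legacy request in flight */
};

static int example_legacy_lookup(struct example_idmap *idmap,
				 int (*do_upcall)(void *), void *arg)
{
	int ret;

	mutex_lock(&idmap->upcall_mutex);
	ret = do_upcall(arg);		/* per-request state is safe to share here */
	mutex_unlock(&idmap->upcall_mutex);
	return ret;
}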
Signed-off-by: Bryan Schumaker CC: stable@vger.kernel.org [3.4+] Signed-off-by: Trond Myklebust --- fs/nfs/idmap.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index b5b86a05059c..864c51e4b400 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -57,6 +57,11 @@ unsigned int nfs_idmap_cache_timeout = 600; static const struct cred *id_resolver_cache; static struct key_type key_type_id_resolver_legacy; +struct idmap { + struct rpc_pipe *idmap_pipe; + struct key_construction *idmap_key_cons; + struct mutex idmap_mutex; +}; /** * nfs_fattr_init_names - initialise the nfs_fattr owner_name/group_name fields @@ -310,9 +315,11 @@ static ssize_t nfs_idmap_get_key(const char *name, size_t namelen, name, namelen, type, data, data_size, NULL); if (ret < 0) { + mutex_lock(&idmap->idmap_mutex); ret = nfs_idmap_request_key(&key_type_id_resolver_legacy, name, namelen, type, data, data_size, idmap); + mutex_unlock(&idmap->idmap_mutex); } return ret; } @@ -354,11 +361,6 @@ static int nfs_idmap_lookup_id(const char *name, size_t namelen, const char *typ /* idmap classic begins here */ module_param(nfs_idmap_cache_timeout, int, 0644); -struct idmap { - struct rpc_pipe *idmap_pipe; - struct key_construction *idmap_key_cons; -}; - enum { Opt_find_uid, Opt_find_gid, Opt_find_user, Opt_find_group, Opt_find_err }; @@ -469,6 +471,7 @@ nfs_idmap_new(struct nfs_client *clp) return error; } idmap->idmap_pipe = pipe; + mutex_init(&idmap->idmap_mutex); clp->cl_idmap = idmap; return 0; -- cgit v1.2.3 From 66f9311381b4772003d595fb6c518f1647450db0 Mon Sep 17 00:00:00 2001 From: Alain Renaud Date: Fri, 8 Jun 2012 15:34:46 -0400 Subject: xfs: xfs_vm_writepage clear iomap_valid when !buffer_uptodate (REV2) On filesytems with a block size smaller than PAGE_SIZE we currently have a problem with unwritten extents. If a we have multi-block page for which an unwritten extent has been allocated, and only some of the buffers have been written to, and they are not contiguous, we can expose stale data from disk in the blocks between the writes after extent conversion. Example of a page with unwritten and real data. buffer content 0 empty b_state = 0 1 DATA b_state = 0x1023 Uptodate,Dirty,Mapped,Unwritten 2 DATA b_state = 0x1023 Uptodate,Dirty,Mapped,Unwritten 3 empty b_state = 0 4 empty b_state = 0 5 DATA b_state = 0x1023 Uptodate,Dirty,Mapped,Unwritten 6 DATA b_state = 0x1023 Uptodate,Dirty,Mapped,Unwritten 7 empty b_state = 0 Buffers 1, 2, 5, and 6 have been written to, leaving 0, 3, 4, and 7 empty. Currently buffers 1, 2, 5, and 6 are added to a single ioend, and when IO has completed, extent conversion creates a real extent from block 1 through block 6, leaving 0 and 7 unwritten. However buffers 3 and 4 were not written to disk, so stale data is exposed from those blocks on a subsequent read. Fix this by setting iomap_valid = 0 when we find a buffer that is not Uptodate. This ensures that buffers 5 and 6 are not added to the same ioend as buffers 1 and 2. Later these blocks will be converted into two separate real extents, leaving the blocks in between unwritten. 
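Reduced to its essence, the new rule is: a buffer that is not uptodate will never be written, so it has to break the current run of buffers and push everything after it into a new ioend. A loose paraphrase of that rule (not the actual xfs_vm_writepage() logic, which follows in the diff):

#include <linux/buffer_head.h>

/*
 * Loose paraphrase: must this buffer end the current ioend?  An unwritten
 * gap in the page means the written blocks on either side of it may not
 * be converted as one contiguous real extent.
 */
static bool example_buffer_breaks_ioend(struct buffer_head *bh)
{
	return !buffer_uptodate(bh);
}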
Signed-off-by: Alain Renaud Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_aops.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index ae31c313a79e..8dad722c0041 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -981,10 +981,15 @@ xfs_vm_writepage( imap_valid = 0; } } else { - if (PageUptodate(page)) { + if (PageUptodate(page)) ASSERT(buffer_mapped(bh)); - imap_valid = 0; - } + /* + * This buffer is not uptodate and will not be + * written to disk. Ensure that we will put any + * subsequent writeable buffers into a new + * ioend. + */ + imap_valid = 0; continue; } -- cgit v1.2.3 From 3b876c8f2a361ceeed3fed894980c69066f903a0 Mon Sep 17 00:00:00 2001 From: Jeff Liu Date: Thu, 7 Jun 2012 15:44:32 +0800 Subject: xfs: fix debug_object WARN at xfs_alloc_vextent() Fengguang reports: [ 780.529603] XFS (vdd): Ending clean mount [ 781.454590] ODEBUG: object is on stack, but not annotated [ 781.455433] ------------[ cut here ]------------ [ 781.455433] WARNING: at /c/kernel-tests/sound/lib/debugobjects.c:301 __debug_object_init+0x173/0x1f1() [ 781.455433] Hardware name: Bochs [ 781.455433] Modules linked in: [ 781.455433] Pid: 26910, comm: kworker/0:2 Not tainted 3.4.0+ #51 [ 781.455433] Call Trace: [ 781.455433] [] warn_slowpath_common+0x83/0x9b [ 781.455433] [] warn_slowpath_null+0x1a/0x1c [ 781.455433] [] __debug_object_init+0x173/0x1f1 [ 781.455433] [] debug_object_init+0x14/0x16 [ 781.455433] [] __init_work+0x20/0x22 [ 781.455433] [] xfs_alloc_vextent+0x6c/0xd5 Use INIT_WORK_ONSTACK in xfs_alloc_vextent instead of INIT_WORK. Reported-by: Wu Fengguang Signed-off-by: Jie Liu Signed-off-by: Ben Myers --- fs/xfs/xfs_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 229641fb8e67..a996e398692b 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -2441,7 +2441,7 @@ xfs_alloc_vextent( DECLARE_COMPLETION_ONSTACK(done); args->done = &done; - INIT_WORK(&args->work, xfs_alloc_vextent_worker); + INIT_WORK_ONSTACK(&args->work, xfs_alloc_vextent_worker); queue_work(xfs_alloc_wq, &args->work); wait_for_completion(&done); return args->result; -- cgit v1.2.3 From fbb24a3a915f105016f1c828476be11aceac8504 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Wed, 20 Jun 2012 12:52:57 -0700 Subject: nilfs2: ensure proper cache clearing for gc-inodes A gc-inode is a pseudo inode used to buffer the blocks to be moved by garbage collection. Block caches of gc-inodes must be cleared every time a garbage collection function (nilfs_clean_segments) completes. Otherwise, stale blocks buffered in the caches may be wrongly reused in successive calls of the GC function. For user files, this is not a problem because their gc-inodes are distinguished by a checkpoint number as well as an inode number. They never buffer different blocks if either an inode number, a checkpoint number, or a block offset differs. However, gc-inodes of sufile, cpfile and DAT file can store different data for the same block offset. Thus, the nilfs_clean_segments function can move incorrect block for these meta-data files if an old block is cached. I found this is really causing meta-data corruption in nilfs. This fixes the issue by ensuring cache clear of gc-inodes and resolves reported GC problems including checkpoint file corruption, b-tree corruption, and the following warning during GC. nilfs_palloc_freev: entry number 307234 already freed. ... 
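The fix boils down to throwing away everything a gc-inode buffered once the GC pass is finished, before the inode is released; schematically (in the real hunks below the second cache is cleared with nilfs_btnode_cache_clear() rather than a plain truncate):

#include <linux/fs.h>
#include <linux/mm.h>

/* Schematic only: forget all blocks this gc-inode buffered during the GC pass. */
static void example_drop_gc_inode_caches(struct inode *inode,
					 struct address_space *btnode_cache)
{
	truncate_inode_pages(&inode->i_data, 0);	/* staged data blocks */
	truncate_inode_pages(btnode_cache, 0);		/* staged b-tree node blocks */
}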
Signed-off-by: Ryusuke Konishi Tested-by: Ryusuke Konishi Cc: [2.6.37+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/gcinode.c | 2 ++ fs/nilfs2/segment.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index 08a07a218d26..57ceaf33d177 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -191,6 +191,8 @@ void nilfs_remove_all_gcinodes(struct the_nilfs *nilfs) while (!list_empty(head)) { ii = list_first_entry(head, struct nilfs_inode_info, i_dirty); list_del_init(&ii->i_dirty); + truncate_inode_pages(&ii->vfs_inode.i_data, 0); + nilfs_btnode_cache_clear(&ii->i_btnode_cache); iput(&ii->vfs_inode); } } diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 0e72ad6f22aa..88e11fb346b6 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2309,6 +2309,8 @@ nilfs_remove_written_gcinodes(struct the_nilfs *nilfs, struct list_head *head) if (!test_bit(NILFS_I_UPDATED, &ii->i_state)) continue; list_del_init(&ii->i_dirty); + truncate_inode_pages(&ii->vfs_inode.i_data, 0); + nilfs_btnode_cache_clear(&ii->i_btnode_cache); iput(&ii->vfs_inode); } } -- cgit v1.2.3 From 4fe7efdbdfb1c7e7a7f31decfd831c0f31d37091 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 20 Jun 2012 12:53:01 -0700 Subject: mm: correctly synchronize rss-counters at exit/exec do_exit() and exec_mmap() call sync_mm_rss() before mm_release() does put_user(clear_child_tid) which can update task->rss_stat and thus make mm->rss_stat inconsistent. This triggers the "BUG:" printk in check_mm(). Let's fix this bug in the safest way, and optimize/cleanup this later. Reported-by: Markus Trippelsdorf Signed-off-by: Konstantin Khlebnikov Cc: Oleg Nesterov Cc: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- kernel/exit.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index a79786a8d2c8..da27b91ff1e8 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -819,10 +819,10 @@ static int exec_mmap(struct mm_struct *mm) /* Notify parent that we're no longer interested in the old VM */ tsk = current; old_mm = current->mm; - sync_mm_rss(old_mm); mm_release(tsk, old_mm); if (old_mm) { + sync_mm_rss(old_mm); /* * Make sure that if there is a core dump in progress * for the old mm, we get out and die instead of going diff --git a/kernel/exit.c b/kernel/exit.c index 34867cc5b42a..c0277d3f1aaa 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -643,6 +643,7 @@ static void exit_mm(struct task_struct * tsk) mm_release(tsk, mm); if (!mm) return; + sync_mm_rss(mm); /* * Serialize with any possible pending coredump. * We must hold mmap_sem around checking core_state -- cgit v1.2.3 From 1c8f52a5e9539600543347bcdefafa1854e07986 Mon Sep 17 00:00:00 2001 From: Alexander Block Date: Tue, 19 Jun 2012 07:42:25 -0600 Subject: Btrfs: introduce btrfs_next_old_item We introduce btrfs_next_old_item that uses btrfs_next_old_leaf instead of btrfs_next_leaf. btrfs_next_item is also changed to simply call btrfs_next_old_item with time_seq being 0. 
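A caller that is already positioned on a valid item and wants to walk forward as the tree looked at a given tree-mod-log sequence would use the new helper roughly like this (illustrative only, not taken from the patch):

/* Illustrative walk: count the items reachable from the current path position. */
static int example_count_items(struct btrfs_root *root, struct btrfs_path *path,
			       u64 time_seq, u64 *nr)
{
	int ret = 0;

	*nr = 0;
	while (!ret) {
		(*nr)++;	/* path->slots[0] points at a valid item here */
		ret = btrfs_next_old_item(root, path, time_seq);
	}
	/* ret > 0 just means "no more items" */
	return ret < 0 ? ret : 0;
}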
Signed-off-by: Alexander Block Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index db15e9e7b91c..84ac723f58f8 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2755,13 +2755,18 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq); -static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) +static inline int btrfs_next_old_item(struct btrfs_root *root, + struct btrfs_path *p, u64 time_seq) { ++p->slots[0]; if (p->slots[0] >= btrfs_header_nritems(p->nodes[0])) - return btrfs_next_leaf(root, p); + return btrfs_next_old_leaf(root, p, time_seq); return 0; } +static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) +{ + return btrfs_next_old_item(root, p, 0); +} int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); int __must_check btrfs_drop_snapshot(struct btrfs_root *root, -- cgit v1.2.3 From 69bca40d41c613927b150c5392505f1894fe3010 Mon Sep 17 00:00:00 2001 From: Alexander Block Date: Tue, 19 Jun 2012 07:42:26 -0600 Subject: Btrfs: don't assume to be on the correct extent in add_all_parents add_all_parents assumed that path is already at a correct extent data item, which may not be true in case of data extents that were partly rewritten and split. We need to check if we're on a matching extent for every item and only for the ones after the first. The loop is changed to do this now. This patch also fixes a bug introduced with commit 3b127fd8 "Btrfs: remove obsolete btrfs_next_leaf call from __resolve_indirect_ref". The removal of next_leaf sometimes resulted in slot==nritems when the case described above happens, thus resulting in invalid values (e.g. wanted_objectid) in add_all_parents (leading to missed backrefs or even crashes).
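The heart of the rework is that every item the loop lands on is re-checked against the search key before it is trusted; schematically:

/* Schematic version of the per-item check the new loop performs. */
static bool example_item_matches(const struct btrfs_key *key,
				 const struct btrfs_key *key_for_search)
{
	return key->objectid == key_for_search->objectid &&
	       key->type == BTRFS_EXTENT_DATA_KEY;
}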
Signed-off-by: Alexander Block Signed-off-by: Jan Schmidt Signed-off-by: Chris Mason --- fs/btrfs/backref.c | 94 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 52 insertions(+), 42 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 8f7d1237b7a0..7301cdb4b2cb 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -179,61 +179,74 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id, static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, struct ulist *parents, int level, - struct btrfs_key *key, u64 time_seq, + struct btrfs_key *key_for_search, u64 time_seq, u64 wanted_disk_byte, const u64 *extent_item_pos) { - int ret; - int slot = path->slots[level]; - struct extent_buffer *eb = path->nodes[level]; + int ret = 0; + int slot; + struct extent_buffer *eb; + struct btrfs_key key; struct btrfs_file_extent_item *fi; struct extent_inode_elem *eie = NULL; u64 disk_byte; - u64 wanted_objectid = key->objectid; -add_parent: - if (level == 0 && extent_item_pos) { - fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); - ret = check_extent_in_eb(key, eb, fi, *extent_item_pos, &eie); + if (level != 0) { + eb = path->nodes[level]; + ret = ulist_add(parents, eb->start, 0, GFP_NOFS); if (ret < 0) return ret; - } - ret = ulist_add(parents, eb->start, (unsigned long)eie, GFP_NOFS); - if (ret < 0) - return ret; - - if (level != 0) return 0; + } /* - * if the current leaf is full with EXTENT_DATA items, we must - * check the next one if that holds a reference as well. - * ref->count cannot be used to skip this check. - * repeat this until we don't find any additional EXTENT_DATA items. + * We normally enter this function with the path already pointing to + * the first item to check. But sometimes, we may enter it with + * slot==nritems. In that case, go to the next leaf before we continue. 
*/ - while (1) { - eie = NULL; + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) ret = btrfs_next_old_leaf(root, path, time_seq); - if (ret < 0) - return ret; - if (ret) - return 0; + while (!ret) { eb = path->nodes[0]; - for (slot = 0; slot < btrfs_header_nritems(eb); ++slot) { - btrfs_item_key_to_cpu(eb, key, slot); - if (key->objectid != wanted_objectid || - key->type != BTRFS_EXTENT_DATA_KEY) - return 0; - fi = btrfs_item_ptr(eb, slot, - struct btrfs_file_extent_item); - disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); - if (disk_byte == wanted_disk_byte) - goto add_parent; + slot = path->slots[0]; + + btrfs_item_key_to_cpu(eb, &key, slot); + + if (key.objectid != key_for_search->objectid || + key.type != BTRFS_EXTENT_DATA_KEY) + break; + + fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); + disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); + + if (disk_byte == wanted_disk_byte) { + eie = NULL; + if (extent_item_pos) { + ret = check_extent_in_eb(&key, eb, fi, + *extent_item_pos, + &eie); + if (ret < 0) + break; + } + if (!ret) { + ret = ulist_add(parents, eb->start, + (unsigned long)eie, GFP_NOFS); + if (ret < 0) + break; + if (!extent_item_pos) { + ret = btrfs_next_old_leaf(root, path, + time_seq); + continue; + } + } } + ret = btrfs_next_old_item(root, path, time_seq); } - return 0; + if (ret > 0) + ret = 0; + return ret; } /* @@ -250,7 +263,6 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, struct btrfs_path *path; struct btrfs_root *root; struct btrfs_key root_key; - struct btrfs_key key = {0}; struct extent_buffer *eb; int ret = 0; int root_level; @@ -295,11 +307,9 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, goto out; } - if (level == 0) - btrfs_item_key_to_cpu(eb, &key, path->slots[0]); - - ret = add_all_parents(root, path, parents, level, &key, time_seq, - ref->wanted_disk_byte, extent_item_pos); + ret = add_all_parents(root, path, parents, level, &ref->key_for_search, + time_seq, ref->wanted_disk_byte, + extent_item_pos); out: btrfs_free_path(path); return ret; -- cgit v1.2.3 From e18fca734278784bd6591de63ca148cc27344ca9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 18 Jun 2012 07:23:18 -0600 Subject: Btrfs: add a missing spin_lock When fixing up the locking in the delayed ref destruction work I accidently broke the locking myself ;(. Add back a spin_lock that should be there and we are now all set. Thanks, Btrfs: add a missing spin_lock When fixing up the locking in the delayed ref destruction work I accidently broke the locking myself ;(. Add back a spin_lock that should be there and we are now all set. Thanks, Reported-by: Dan Carpenter Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e22c5bbf0223..7d7bc8eace86 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3426,6 +3426,7 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, mutex_unlock(&head->mutex); btrfs_put_delayed_ref(ref); + spin_lock(&delayed_refs->lock); continue; } -- cgit v1.2.3 From cb77fcd88569cd2b7b25ecd4086ea932a53be9b3 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 15 Jun 2012 12:19:48 -0600 Subject: Btrfs: delay iput with async extents There is some concern that these iput()'s could be the final iputs and could induce lockups on people waiting on writeback. 
This would happen in the rare case that we don't create ordered extents because of an error, but it is theoretically possible and we already have a mechanism to deal with this so just make them delayed iputs to negate any worry. Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3f2c8cbe5ba6..4a4f2d59a64b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -987,7 +987,7 @@ static noinline void async_cow_start(struct btrfs_work *work) async_cow->start, async_cow->end, async_cow, &num_added); if (num_added == 0) { - iput(async_cow->inode); + btrfs_add_delayed_iput(async_cow->inode); async_cow->inode = NULL; } } @@ -1023,7 +1023,7 @@ static noinline void async_cow_free(struct btrfs_work *work) struct async_cow *async_cow; async_cow = container_of(work, struct async_cow, work); if (async_cow->inode) - iput(async_cow->inode); + btrfs_add_delayed_iput(async_cow->inode); kfree(async_cow); } -- cgit v1.2.3 From 9a3a5dab63461b84213052888bf38a962b22d035 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 11 Jun 2012 10:39:43 -0400 Subject: xfs: check for stale inode before acquiring iflock on push An inode in the AIL can be flush locked and marked stale if a cluster free transaction occurs at the right time. The inode item is then marked as flushing, which causes xfsaild to spin and leaves the filesystem stalled. This is reproduced by running xfstests 273 in a loop for an extended period of time. Check for stale inodes before the flush lock. This marks the inode as pinned, leads to a log flush and allows the filesystem to proceed. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_inode_item.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 6cdbf90c6f7b..d041d47d9d86 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -504,6 +504,14 @@ xfs_inode_item_push( goto out_unlock; } + /* + * Stale inode items should force out the iclog. + */ + if (ip->i_flags & XFS_ISTALE) { + rval = XFS_ITEM_PINNED; + goto out_unlock; + } + /* * Someone else is already flushing the inode. Nothing we can do * here but wait for the flush to finish and remove the item from @@ -514,15 +522,6 @@ xfs_inode_item_push( goto out_unlock; } - /* - * Stale inode items should force out the iclog. - */ - if (ip->i_flags & XFS_ISTALE) { - xfs_ifunlock(ip); - xfs_iunlock(ip, XFS_ILOCK_SHARED); - return XFS_ITEM_PINNED; - } - ASSERT(iip->ili_fields != 0 || XFS_FORCED_SHUTDOWN(ip->i_mount)); ASSERT(iip->ili_logged == 0 || XFS_FORCED_SHUTDOWN(ip->i_mount)); -- cgit v1.2.3 From 76d095388b040229ea1aad7dea45be0cfa20f589 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 12 Jun 2012 14:20:26 +1000 Subject: xfs: fix allocbt cursor leak in xfs_alloc_ag_vextent_near When we fail to find an matching extent near the requested extent specification during a left-right distance search in xfs_alloc_ag_vextent_near, we fail to free the original cursor that we used to look up the XFS_BTNUM_CNT tree and hence leak it. 
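The invariant being restored is that every btree cursor allocated in the function must be deleted on every return path. In simplified form, the previously leaking exit now looks like this (the actual one-line fix is in the diff below):

/* Simplified: the "nothing suitable found on either side" exit path. */
static int example_exit_no_match(struct xfs_btree_cur *cnt_cur,
				 struct xfs_alloc_arg *args)
{
	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);	/* was leaked before */
	args->agbno = NULLAGBLOCK;
	return 0;
}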
Reported-by: Chris J Arges Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_alloc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index a996e398692b..9d1aeb7e2734 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -1080,6 +1080,7 @@ restart: goto restart; } + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); trace_xfs_alloc_size_neither(args); args->agbno = NULLAGBLOCK; return 0; -- cgit v1.2.3 From 59c84ed0ddc11f1823b4a33ace4fbcc948261bb2 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 6 Jun 2012 00:32:26 +0200 Subject: xfs: Fix overallocation in xfs_buf_allocate_memory() Commit de1cbee which removed b_file_offset in favor of b_bn introduced a bug causing xfs_buf_allocate_memory() to overestimate the number of necessary pages. The problem is that xfs_buf_alloc() sets b_bn to -1 and thus effectively every buffer is straddling a page boundary which causes xfs_buf_allocate_memory() to allocate two pages and use vmalloc() for access which is unnecessary. Dave says xfs_buf_alloc() doesn't need to set b_bn to -1 anymore since the buffer is inserted into the cache only after being fully initialized now. So just make xfs_buf_alloc() fill in proper block number from the beginning. CC: David Chinner Signed-off-by: Jan Kara Reviewed-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_buf.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 172d3cc8f8cb..a4beb421018a 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -201,14 +201,7 @@ xfs_buf_alloc( bp->b_length = numblks; bp->b_io_length = numblks; bp->b_flags = flags; - - /* - * We do not set the block number here in the buffer because we have not - * finished initialising the buffer. We insert the buffer into the cache - * in this state, so this ensures that we are unable to do IO on a - * buffer that hasn't been fully initialised. - */ - bp->b_bn = XFS_BUF_DADDR_NULL; + bp->b_bn = blkno; atomic_set(&bp->b_pin_count, 0); init_waitqueue_head(&bp->b_waiters); @@ -567,11 +560,6 @@ xfs_buf_get( if (bp != new_bp) xfs_buf_free(new_bp); - /* - * Now we have a workable buffer, fill in the block number so - * that we can do IO on it. - */ - bp->b_bn = blkno; bp->b_io_length = bp->b_length; found: @@ -772,7 +760,7 @@ xfs_buf_get_uncached( int error, i; xfs_buf_t *bp; - bp = xfs_buf_alloc(target, 0, numblks, 0); + bp = xfs_buf_alloc(target, XFS_BUF_DADDR_NULL, numblks, 0); if (unlikely(bp == NULL)) goto fail; -- cgit v1.2.3 From 8866fc6fa55e31b2bce931b7963ff16641b39dc7 Mon Sep 17 00:00:00 2001 From: Ben Myers Date: Fri, 25 May 2012 15:45:36 -0500 Subject: xfs: shutdown xfs_sync_worker before the log Revert commit 1307bbd, which uses the s_umount semaphore to provide exclusion between xfs_sync_worker and unmount, in favor of shutting down the sync worker before freeing the log in xfs_log_unmount. This is a cleaner way of resolving the race between xfs_sync_worker and unmount than using s_umount. 
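The ordering is the standard one for delayed work that dereferences a structure being torn down: cancel and wait for the worker first, free the structure second. A minimal sketch with made-up names:

#include <linux/workqueue.h>

struct example_mount {
	struct delayed_work	sync_work;	/* periodic sync worker */
	void			*log;		/* state the worker touches */
};

static void example_unmount(struct example_mount *mp, void (*free_log)(void *))
{
	/* once this returns, no instance of sync_work can still be running */
	cancel_delayed_work_sync(&mp->sync_work);
	free_log(mp->log);			/* now safe to tear down */
	mp->log = NULL;
}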
Signed-off-by: Ben Myers Reviewed-by: Mark Tinguely Reviewed-by: Dave Chinner --- fs/xfs/xfs_log.c | 1 + fs/xfs/xfs_sync.c | 32 ++++++++++++++++---------------- 2 files changed, 17 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index f30d9807dc48..0e1a64f0439c 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -810,6 +810,7 @@ xfs_log_unmount_write(xfs_mount_t *mp) void xfs_log_unmount(xfs_mount_t *mp) { + cancel_delayed_work_sync(&mp->m_sync_work); xfs_trans_ail_destroy(mp); xlog_dealloc_log(mp->m_log); } diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index c9d3409c5ca3..1e9ee064dbb2 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -386,23 +386,23 @@ xfs_sync_worker( * We shouldn't write/force the log if we are in the mount/unmount * process or on a read only filesystem. The workqueue still needs to be * active in both cases, however, because it is used for inode reclaim - * during these times. Use the s_umount semaphore to provide exclusion - * with unmount. + * during these times. Use the MS_ACTIVE flag to avoid doing anything + * during mount. Doing work during unmount is avoided by calling + * cancel_delayed_work_sync on this work queue before tearing down + * the ail and the log in xfs_log_unmount. */ - if (down_read_trylock(&mp->m_super->s_umount)) { - if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { - /* dgc: errors ignored here */ - if (mp->m_super->s_frozen == SB_UNFROZEN && - xfs_log_need_covered(mp)) - error = xfs_fs_log_dummy(mp); - else - xfs_log_force(mp, 0); - - /* start pushing all the metadata that is currently - * dirty */ - xfs_ail_push_all(mp->m_ail); - } - up_read(&mp->m_super->s_umount); + if (!(mp->m_super->s_flags & MS_ACTIVE) && + !(mp->m_flags & XFS_MOUNT_RDONLY)) { + /* dgc: errors ignored here */ + if (mp->m_super->s_frozen == SB_UNFROZEN && + xfs_log_need_covered(mp)) + error = xfs_fs_log_dummy(mp); + else + xfs_log_force(mp, 0); + + /* start pushing all the metadata that is currently + * dirty */ + xfs_ail_push_all(mp->m_ail); } /* queue us up again */ -- cgit v1.2.3 From f7bdf03a99efc083608cd9c0c3e03abff311c79e Mon Sep 17 00:00:00 2001 From: Mark Tinguely Date: Thu, 14 Jun 2012 09:22:15 -0500 Subject: xfs: rename log structure to xlog Rename the XFS log structure to xlog to help crash distinguish it from the other logs in Linux.
Signed-off-by: Mark Tinguely Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_log.c | 76 ++++++++++++++++++++++++++++-------------------- fs/xfs/xfs_log_cil.c | 22 +++++++------- fs/xfs/xfs_log_priv.h | 46 +++++++++++++++++++---------- fs/xfs/xfs_log_recover.c | 38 ++++++++++++------------ fs/xfs/xfs_mount.h | 4 +-- fs/xfs/xfs_trace.h | 18 ++++++------ 6 files changed, 116 insertions(+), 88 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 0e1a64f0439c..d90d4a388609 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -38,13 +38,21 @@ kmem_zone_t *xfs_log_ticket_zone; /* Local miscellaneous function prototypes */ -STATIC int xlog_commit_record(struct log *log, struct xlog_ticket *ticket, - xlog_in_core_t **, xfs_lsn_t *); +STATIC int +xlog_commit_record( + struct xlog *log, + struct xlog_ticket *ticket, + struct xlog_in_core **iclog, + xfs_lsn_t *commitlsnp); + STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp, xfs_buftarg_t *log_target, xfs_daddr_t blk_offset, int num_bblks); -STATIC int xlog_space_left(struct log *log, atomic64_t *head); +STATIC int +xlog_space_left( + struct xlog *log, + atomic64_t *head); STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); STATIC void xlog_dealloc_log(xlog_t *log); @@ -64,8 +72,10 @@ STATIC void xlog_state_switch_iclogs(xlog_t *log, int eventual_size); STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog); -STATIC void xlog_grant_push_ail(struct log *log, - int need_bytes); +STATIC void +xlog_grant_push_ail( + struct xlog *log, + int need_bytes); STATIC void xlog_regrant_reserve_log_space(xlog_t *log, xlog_ticket_t *ticket); STATIC void xlog_ungrant_log_space(xlog_t *log, @@ -73,7 +83,9 @@ STATIC void xlog_ungrant_log_space(xlog_t *log, #if defined(DEBUG) STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr); -STATIC void xlog_verify_grant_tail(struct log *log); +STATIC void +xlog_verify_grant_tail( + struct xlog *log); STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog, int count, boolean_t syncing); STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog, @@ -89,9 +101,9 @@ STATIC int xlog_iclogs_empty(xlog_t *log); static void xlog_grant_sub_space( - struct log *log, - atomic64_t *head, - int bytes) + struct xlog *log, + atomic64_t *head, + int bytes) { int64_t head_val = atomic64_read(head); int64_t new, old; @@ -115,9 +127,9 @@ xlog_grant_sub_space( static void xlog_grant_add_space( - struct log *log, - atomic64_t *head, - int bytes) + struct xlog *log, + atomic64_t *head, + int bytes) { int64_t head_val = atomic64_read(head); int64_t new, old; @@ -165,7 +177,7 @@ xlog_grant_head_wake_all( static inline int xlog_ticket_reservation( - struct log *log, + struct xlog *log, struct xlog_grant_head *head, struct xlog_ticket *tic) { @@ -182,7 +194,7 @@ xlog_ticket_reservation( STATIC bool xlog_grant_head_wake( - struct log *log, + struct xlog *log, struct xlog_grant_head *head, int *free_bytes) { @@ -204,7 +216,7 @@ xlog_grant_head_wake( STATIC int xlog_grant_head_wait( - struct log *log, + struct xlog *log, struct xlog_grant_head *head, struct xlog_ticket *tic, int need_bytes) @@ -256,7 +268,7 @@ shutdown: */ STATIC int xlog_grant_head_check( - struct log *log, + struct xlog *log, struct xlog_grant_head *head, struct xlog_ticket *tic, int *need_bytes) @@ -323,7 +335,7 @@ xfs_log_regrant( struct xfs_mount *mp, struct xlog_ticket *tic) { - struct log *log = mp->m_log; + struct xlog *log = mp->m_log; int need_bytes; int error = 0; @@ -389,7 +401,7 @@ 
xfs_log_reserve( bool permanent, uint t_type) { - struct log *log = mp->m_log; + struct xlog *log = mp->m_log; struct xlog_ticket *tic; int need_bytes; int error = 0; @@ -465,7 +477,7 @@ xfs_log_done( struct xlog_in_core **iclog, uint flags) { - struct log *log = mp->m_log; + struct xlog *log = mp->m_log; xfs_lsn_t lsn = 0; if (XLOG_FORCED_SHUTDOWN(log) || @@ -839,7 +851,7 @@ void xfs_log_space_wake( struct xfs_mount *mp) { - struct log *log = mp->m_log; + struct xlog *log = mp->m_log; int free_bytes; if (XLOG_FORCED_SHUTDOWN(log)) @@ -917,7 +929,7 @@ xfs_lsn_t xlog_assign_tail_lsn_locked( struct xfs_mount *mp) { - struct log *log = mp->m_log; + struct xlog *log = mp->m_log; struct xfs_log_item *lip; xfs_lsn_t tail_lsn; @@ -966,7 +978,7 @@ xlog_assign_tail_lsn( */ STATIC int xlog_space_left( - struct log *log, + struct xlog *log, atomic64_t *head) { int free_bytes; @@ -1278,7 +1290,7 @@ out: */ STATIC int xlog_commit_record( - struct log *log, + struct xlog *log, struct xlog_ticket *ticket, struct xlog_in_core **iclog, xfs_lsn_t *commitlsnp) @@ -1312,7 +1324,7 @@ xlog_commit_record( */ STATIC void xlog_grant_push_ail( - struct log *log, + struct xlog *log, int need_bytes) { xfs_lsn_t threshold_lsn = 0; @@ -1791,7 +1803,7 @@ xlog_write_start_rec( static xlog_op_header_t * xlog_write_setup_ophdr( - struct log *log, + struct xlog *log, struct xlog_op_header *ophdr, struct xlog_ticket *ticket, uint flags) @@ -1874,7 +1886,7 @@ xlog_write_setup_copy( static int xlog_write_copy_finish( - struct log *log, + struct xlog *log, struct xlog_in_core *iclog, uint flags, int *record_cnt, @@ -1959,7 +1971,7 @@ xlog_write_copy_finish( */ int xlog_write( - struct log *log, + struct xlog *log, struct xfs_log_vec *log_vector, struct xlog_ticket *ticket, xfs_lsn_t *start_lsn, @@ -2822,7 +2834,7 @@ _xfs_log_force( uint flags, int *log_flushed) { - struct log *log = mp->m_log; + struct xlog *log = mp->m_log; struct xlog_in_core *iclog; xfs_lsn_t lsn; @@ -2970,7 +2982,7 @@ _xfs_log_force_lsn( uint flags, int *log_flushed) { - struct log *log = mp->m_log; + struct xlog *log = mp->m_log; struct xlog_in_core *iclog; int already_slept = 0; @@ -3148,7 +3160,7 @@ xfs_log_ticket_get( */ xlog_ticket_t * xlog_ticket_alloc( - struct log *log, + struct xlog *log, int unit_bytes, int cnt, char client, @@ -3279,7 +3291,7 @@ xlog_ticket_alloc( */ void xlog_verify_dest_ptr( - struct log *log, + struct xlog *log, char *ptr) { int i; @@ -3308,7 +3320,7 @@ xlog_verify_dest_ptr( */ STATIC void xlog_verify_grant_tail( - struct log *log) + struct xlog *log) { int tail_cycle, tail_blocks; int cycle, space; diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 7d6197c58493..ddc4529d07d3 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -44,7 +44,7 @@ */ static struct xlog_ticket * xlog_cil_ticket_alloc( - struct log *log) + struct xlog *log) { struct xlog_ticket *tic; @@ -72,7 +72,7 @@ xlog_cil_ticket_alloc( */ void xlog_cil_init_post_recovery( - struct log *log) + struct xlog *log) { log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log); log->l_cilp->xc_ctx->sequence = 1; @@ -182,7 +182,7 @@ xlog_cil_prepare_log_vecs( */ STATIC void xfs_cil_prepare_item( - struct log *log, + struct xlog *log, struct xfs_log_vec *lv, int *len, int *diff_iovecs) @@ -231,7 +231,7 @@ xfs_cil_prepare_item( */ static void xlog_cil_insert_items( - struct log *log, + struct xlog *log, struct xfs_log_vec *log_vector, struct xlog_ticket *ticket) { @@ -373,7 +373,7 @@ xlog_cil_committed( */ STATIC int xlog_cil_push( - struct log *log) 
+ struct xlog *log) { struct xfs_cil *cil = log->l_cilp; struct xfs_log_vec *lv; @@ -601,7 +601,7 @@ xlog_cil_push_work( */ static void xlog_cil_push_background( - struct log *log) + struct xlog *log) { struct xfs_cil *cil = log->l_cilp; @@ -629,7 +629,7 @@ xlog_cil_push_background( static void xlog_cil_push_foreground( - struct log *log, + struct xlog *log, xfs_lsn_t push_seq) { struct xfs_cil *cil = log->l_cilp; @@ -683,7 +683,7 @@ xfs_log_commit_cil( xfs_lsn_t *commit_lsn, int flags) { - struct log *log = mp->m_log; + struct xlog *log = mp->m_log; int log_flags = 0; struct xfs_log_vec *log_vector; @@ -754,7 +754,7 @@ xfs_log_commit_cil( */ xfs_lsn_t xlog_cil_force_lsn( - struct log *log, + struct xlog *log, xfs_lsn_t sequence) { struct xfs_cil *cil = log->l_cilp; @@ -833,7 +833,7 @@ xfs_log_item_in_current_chkpt( */ int xlog_cil_init( - struct log *log) + struct xlog *log) { struct xfs_cil *cil; struct xfs_cil_ctx *ctx; @@ -869,7 +869,7 @@ xlog_cil_init( void xlog_cil_destroy( - struct log *log) + struct xlog *log) { if (log->l_cilp->xc_ctx) { if (log->l_cilp->xc_ctx->ticket) diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 5bc33261f5be..72eba2201b14 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -19,7 +19,7 @@ #define __XFS_LOG_PRIV_H__ struct xfs_buf; -struct log; +struct xlog; struct xlog_ticket; struct xfs_mount; @@ -352,7 +352,7 @@ typedef struct xlog_in_core { struct xlog_in_core *ic_next; struct xlog_in_core *ic_prev; struct xfs_buf *ic_bp; - struct log *ic_log; + struct xlog *ic_log; int ic_size; int ic_offset; int ic_bwritecnt; @@ -409,7 +409,7 @@ struct xfs_cil_ctx { * operations almost as efficient as the old logging methods. */ struct xfs_cil { - struct log *xc_log; + struct xlog *xc_log; struct list_head xc_cil; spinlock_t xc_cil_lock; struct xfs_cil_ctx *xc_ctx; @@ -487,7 +487,7 @@ struct xlog_grant_head { * overflow 31 bits worth of byte offset, so using a byte number will mean * that round off problems won't occur when releasing partial reservations. 
*/ -typedef struct log { +typedef struct xlog { /* The following fields don't need locking */ struct xfs_mount *l_mp; /* mount point */ struct xfs_ail *l_ailp; /* AIL log is working with */ @@ -553,9 +553,14 @@ extern int xlog_recover_finish(xlog_t *log); extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); extern kmem_zone_t *xfs_log_ticket_zone; -struct xlog_ticket *xlog_ticket_alloc(struct log *log, int unit_bytes, - int count, char client, bool permanent, - xfs_km_flags_t alloc_flags); +struct xlog_ticket * +xlog_ticket_alloc( + struct xlog *log, + int unit_bytes, + int count, + char client, + bool permanent, + xfs_km_flags_t alloc_flags); static inline void @@ -567,9 +572,14 @@ xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes) } void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket); -int xlog_write(struct log *log, struct xfs_log_vec *log_vector, - struct xlog_ticket *tic, xfs_lsn_t *start_lsn, - xlog_in_core_t **commit_iclog, uint flags); +int +xlog_write( + struct xlog *log, + struct xfs_log_vec *log_vector, + struct xlog_ticket *tic, + xfs_lsn_t *start_lsn, + struct xlog_in_core **commit_iclog, + uint flags); /* * When we crack an atomic LSN, we sample it first so that the value will not @@ -629,17 +639,23 @@ xlog_assign_grant_head(atomic64_t *head, int cycle, int space) /* * Committed Item List interfaces */ -int xlog_cil_init(struct log *log); -void xlog_cil_init_post_recovery(struct log *log); -void xlog_cil_destroy(struct log *log); +int +xlog_cil_init(struct xlog *log); +void +xlog_cil_init_post_recovery(struct xlog *log); +void +xlog_cil_destroy(struct xlog *log); /* * CIL force routines */ -xfs_lsn_t xlog_cil_force_lsn(struct log *log, xfs_lsn_t sequence); +xfs_lsn_t +xlog_cil_force_lsn( + struct xlog *log, + xfs_lsn_t sequence); static inline void -xlog_cil_force(struct log *log) +xlog_cil_force(struct xlog *log) { xlog_cil_force_lsn(log, log->l_cilp->xc_current_sequence); } diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index ca386909131a..a7be98abd6a9 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1471,8 +1471,8 @@ xlog_recover_add_item( STATIC int xlog_recover_add_to_cont_trans( - struct log *log, - xlog_recover_t *trans, + struct xlog *log, + struct xlog_recover *trans, xfs_caddr_t dp, int len) { @@ -1517,8 +1517,8 @@ xlog_recover_add_to_cont_trans( */ STATIC int xlog_recover_add_to_trans( - struct log *log, - xlog_recover_t *trans, + struct xlog *log, + struct xlog_recover *trans, xfs_caddr_t dp, int len) { @@ -1588,8 +1588,8 @@ xlog_recover_add_to_trans( */ STATIC int xlog_recover_reorder_trans( - struct log *log, - xlog_recover_t *trans, + struct xlog *log, + struct xlog_recover *trans, int pass) { xlog_recover_item_t *item, *n; @@ -1642,8 +1642,8 @@ xlog_recover_reorder_trans( */ STATIC int xlog_recover_buffer_pass1( - struct log *log, - xlog_recover_item_t *item) + struct xlog *log, + struct xlog_recover_item *item) { xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; struct list_head *bucket; @@ -1696,7 +1696,7 @@ xlog_recover_buffer_pass1( */ STATIC int xlog_check_buffer_cancelled( - struct log *log, + struct xlog *log, xfs_daddr_t blkno, uint len, ushort flags) @@ -2689,9 +2689,9 @@ xlog_recover_free_trans( STATIC int xlog_recover_commit_pass1( - struct log *log, - struct xlog_recover *trans, - xlog_recover_item_t *item) + struct xlog *log, + struct xlog_recover *trans, + struct xlog_recover_item *item) { trace_xfs_log_recover_item_recover(log, trans, item, 
					     XLOG_RECOVER_PASS1);
@@ -2716,10 +2716,10 @@ xlog_recover_commit_pass1(
 STATIC int
 xlog_recover_commit_pass2(
-	struct log		*log,
-	struct xlog_recover	*trans,
-	struct list_head	*buffer_list,
-	xlog_recover_item_t	*item)
+	struct xlog		*log,
+	struct xlog_recover	*trans,
+	struct list_head	*buffer_list,
+	struct xlog_recover_item *item)
 {
 	trace_xfs_log_recover_item_recover(log, trans, item,
 					   XLOG_RECOVER_PASS2);
@@ -2753,7 +2753,7 @@ xlog_recover_commit_pass2(
  */
 STATIC int
 xlog_recover_commit_trans(
-	struct log		*log,
+	struct xlog		*log,
 	struct xlog_recover	*trans,
 	int			pass)
 {
@@ -2793,8 +2793,8 @@ out:
 
 STATIC int
 xlog_recover_unmount_trans(
-	struct log		*log,
-	xlog_recover_t		*trans)
+	struct xlog		*log,
+	struct xlog_recover	*trans)
 {
 	/* Do nothing now */
 	xfs_warn(log->l_mp, "%s: Unmount LR", __func__);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 8b89c5ac72d9..90c1fc9eaea4 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -53,7 +53,7 @@ typedef struct xfs_trans_reservations {
 
 #include "xfs_sync.h"
 
-struct log;
+struct xlog;
 struct xfs_mount_args;
 struct xfs_inode;
 struct xfs_bmbt_irec;
@@ -133,7 +133,7 @@ typedef struct xfs_mount {
 	uint			m_readio_blocks; /* min read size blocks */
 	uint			m_writeio_log;	/* min write size log bytes */
 	uint			m_writeio_blocks; /* min write size blocks */
-	struct log		*m_log;		/* log specific stuff */
+	struct xlog		*m_log;		/* log specific stuff */
 	int			m_logbufs;	/* number of log buffers */
 	int			m_logbsize;	/* size of each log buffer */
 	uint			m_rsumlevels;	/* rt summary levels */
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 7cf9d3529e51..caf5dabfd553 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -32,7 +32,7 @@ struct xfs_da_node_entry;
 struct xfs_dquot;
 struct xfs_log_item;
 struct xlog_ticket;
-struct log;
+struct xlog;
 struct xlog_recover;
 struct xlog_recover_item;
 struct xfs_buf_log_format;
@@ -762,7 +762,7 @@ DEFINE_DQUOT_EVENT(xfs_dqflush_force);
 DEFINE_DQUOT_EVENT(xfs_dqflush_done);
 
 DECLARE_EVENT_CLASS(xfs_loggrant_class,
-	TP_PROTO(struct log *log, struct xlog_ticket *tic),
+	TP_PROTO(struct xlog *log, struct xlog_ticket *tic),
 	TP_ARGS(log, tic),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
@@ -830,7 +830,7 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
 
 #define DEFINE_LOGGRANT_EVENT(name) \
 DEFINE_EVENT(xfs_loggrant_class, name, \
-	TP_PROTO(struct log *log, struct xlog_ticket *tic), \
+	TP_PROTO(struct xlog *log, struct xlog_ticket *tic), \
 	TP_ARGS(log, tic))
 DEFINE_LOGGRANT_EVENT(xfs_log_done_nonperm);
 DEFINE_LOGGRANT_EVENT(xfs_log_done_perm);
@@ -1664,7 +1664,7 @@ DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before);
 DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after);
 
 DECLARE_EVENT_CLASS(xfs_log_recover_item_class,
-	TP_PROTO(struct log *log, struct xlog_recover *trans,
+	TP_PROTO(struct xlog *log, struct xlog_recover *trans,
 		struct xlog_recover_item *item, int pass),
 	TP_ARGS(log, trans, item, pass),
 	TP_STRUCT__entry(
@@ -1698,7 +1698,7 @@ DECLARE_EVENT_CLASS(xfs_log_recover_item_class,
 
 #define DEFINE_LOG_RECOVER_ITEM(name) \
 DEFINE_EVENT(xfs_log_recover_item_class, name, \
-	TP_PROTO(struct log *log, struct xlog_recover *trans, \
+	TP_PROTO(struct xlog *log, struct xlog_recover *trans, \
 		struct xlog_recover_item *item, int pass), \
 	TP_ARGS(log, trans, item, pass))
 
@@ -1709,7 +1709,7 @@ DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_tail);
 DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_recover);
 
 DECLARE_EVENT_CLASS(xfs_log_recover_buf_item_class,
-	TP_PROTO(struct log *log, struct xfs_buf_log_format *buf_f),
+	TP_PROTO(struct xlog *log, struct xfs_buf_log_format *buf_f),
 	TP_ARGS(log, buf_f),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
@@ -1739,7 +1739,7 @@ DECLARE_EVENT_CLASS(xfs_log_recover_buf_item_class,
 
 #define DEFINE_LOG_RECOVER_BUF_ITEM(name) \
 DEFINE_EVENT(xfs_log_recover_buf_item_class, name, \
-	TP_PROTO(struct log *log, struct xfs_buf_log_format *buf_f), \
+	TP_PROTO(struct xlog *log, struct xfs_buf_log_format *buf_f), \
 	TP_ARGS(log, buf_f))
 
 DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_not_cancel);
@@ -1752,7 +1752,7 @@ DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_reg_buf);
 DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_dquot_buf);
 
 DECLARE_EVENT_CLASS(xfs_log_recover_ino_item_class,
-	TP_PROTO(struct log *log, struct xfs_inode_log_format *in_f),
+	TP_PROTO(struct xlog *log, struct xfs_inode_log_format *in_f),
 	TP_ARGS(log, in_f),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
@@ -1790,7 +1790,7 @@ DECLARE_EVENT_CLASS(xfs_log_recover_ino_item_class,
 )
 
 #define DEFINE_LOG_RECOVER_INO_ITEM(name) \
 DEFINE_EVENT(xfs_log_recover_ino_item_class, name, \
-	TP_PROTO(struct log *log, struct xfs_inode_log_format *in_f), \
+	TP_PROTO(struct xlog *log, struct xfs_inode_log_format *in_f), \
 	TP_ARGS(log, in_f))
 
 DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover);
-- cgit v1.2.3

From 2d4cf5ae123e4a7bd26a88e600581aa809bcad7f Mon Sep 17 00:00:00 2001
From: Brian Norris
Date: Mon, 18 Jun 2012 16:31:22 -0700
Subject: UBIFS: correct usage of IS_ENABLED()

Commit "818039c UBIFS: fix debugfs-less systems support" fixed one
regression but introduced a different regression - the debugfs is now
always compiled out. Root cause: IS_ENABLED() arguments should be used
with the CONFIG_* prefix.

Signed-off-by: Brian Norris
Signed-off-by: Artem Bityutskiy
---
 fs/ubifs/debug.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 84a7e6f3c046..92df3b081539 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -2918,7 +2918,7 @@ int dbg_debugfs_init_fs(struct ubifs_info *c)
 	struct dentry *dent;
 	struct ubifs_debug_info *d = c->dbg;
 
-	if (!IS_ENABLED(DEBUG_FS))
+	if (!IS_ENABLED(CONFIG_DEBUG_FS))
 		return 0;
 
 	n = snprintf(d->dfs_dir_name, UBIFS_DFS_DIR_LEN + 1, UBIFS_DFS_DIR_NAME,
@@ -3013,7 +3013,7 @@ out:
  */
 void dbg_debugfs_exit_fs(struct ubifs_info *c)
 {
-	if (IS_ENABLED(DEBUG_FS))
+	if (IS_ENABLED(CONFIG_DEBUG_FS))
 		debugfs_remove_recursive(c->dbg->dfs_dir);
 }
 
@@ -3099,7 +3099,7 @@ int dbg_debugfs_init(void)
 	const char *fname;
 	struct dentry *dent;
 
-	if (!IS_ENABLED(DEBUG_FS))
+	if (!IS_ENABLED(CONFIG_DEBUG_FS))
 		return 0;
 
 	fname = "ubifs";
@@ -3166,7 +3166,7 @@ out:
  */
 void dbg_debugfs_exit(void)
 {
-	if (IS_ENABLED(DEBUG_FS))
+	if (IS_ENABLED(CONFIG_DEBUG_FS))
 		debugfs_remove_recursive(dfs_rootdir);
 }
-- cgit v1.2.3

From cb14d340ef1737c24125dd663eff77734a482d47 Mon Sep 17 00:00:00 2001
From: Jan Kara
Date: Wed, 27 Jun 2012 20:08:44 +0200
Subject: udf: Use 'ret' instead of abusing 'i' in udf_load_logicalvol()

Signed-off-by: Jan Kara
---
 fs/udf/super.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/super.c b/fs/udf/super.c
index ac8a348dcb69..9da6f4ee112c 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1233,11 +1233,9 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
 	BUG_ON(ident != TAG_IDENT_LVD);
 	lvd = (struct logicalVolDesc *)bh->b_data;
 
-	i = udf_sb_alloc_partition_maps(sb, le32_to_cpu(lvd->numPartitionMaps));
-	if (i != 0) {
-		ret = i;
+	ret = udf_sb_alloc_partition_maps(sb, le32_to_cpu(lvd->numPartitionMaps));
+	if (ret)
 		goto out_bh;
-	}
 
 	for (i = 0, offset = 0;
 	     i < sbi->s_partitions && offset < le32_to_cpu(lvd->mapTableLength);
-- cgit v1.2.3

From adee11b2085bee90bd8f4f52123ffb07882d6256 Mon Sep 17 00:00:00 2001
From: Jan Kara
Date: Wed, 27 Jun 2012 20:20:22 +0200
Subject: udf: Avoid run away loop when partition table length is corrupted

Check provided length of partition table so that (possibly maliciously)
corrupted partition table cannot cause accessing data beyond current buffer.

Signed-off-by: Jan Kara
---
 fs/udf/super.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/udf/super.c b/fs/udf/super.c
index 9da6f4ee112c..ce911f50b39d 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1225,6 +1225,7 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
 	struct genericPartitionMap *gpm;
 	uint16_t ident;
 	struct buffer_head *bh;
+	unsigned int table_len;
 	int ret = 0;
 
 	bh = udf_read_tagged(sb, block, block, &ident);
@@ -1232,13 +1233,20 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
 		return 1;
 	BUG_ON(ident != TAG_IDENT_LVD);
 	lvd = (struct logicalVolDesc *)bh->b_data;
+	table_len = le32_to_cpu(lvd->mapTableLength);
+	if (sizeof(*lvd) + table_len > sb->s_blocksize) {
+		udf_err(sb, "error loading logical volume descriptor: "
+			"Partition table too long (%u > %lu)\n", table_len,
+			sb->s_blocksize - sizeof(*lvd));
+		goto out_bh;
+	}
 
 	ret = udf_sb_alloc_partition_maps(sb, le32_to_cpu(lvd->numPartitionMaps));
 	if (ret)
 		goto out_bh;
 
 	for (i = 0, offset = 0;
-	     i < sbi->s_partitions && offset < le32_to_cpu(lvd->mapTableLength);
+	     i < sbi->s_partitions && offset < table_len;
 	     i++, offset += gpm->partitionMapLength) {
 		struct udf_part_map *map = &sbi->s_partmaps[i];
 		gpm = (struct genericPartitionMap *)
-- cgit v1.2.3

From 1df2ae31c724e57be9d7ac00d78db8a5dabdd050 Mon Sep 17 00:00:00 2001
From: Jan Kara
Date: Wed, 27 Jun 2012 21:23:07 +0200
Subject: udf: Fortify loading of sparing table

Add sanity checks when loading sparing table from disk to avoid accessing
unallocated memory or writing to it.

Signed-off-by: Jan Kara
---
 fs/udf/super.c | 86 ++++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 53 insertions(+), 33 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/super.c b/fs/udf/super.c
index ce911f50b39d..8d86a8706c0e 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -56,6 +56,7 @@
 #include
 #include
 #include
+#include
 #include
 
 #include "udf_sb.h"
@@ -1215,11 +1216,59 @@ out_bh:
 	return ret;
 }
 
+static int udf_load_sparable_map(struct super_block *sb,
+				 struct udf_part_map *map,
+				 struct sparablePartitionMap *spm)
+{
+	uint32_t loc;
+	uint16_t ident;
+	struct sparingTable *st;
+	struct udf_sparing_data *sdata = &map->s_type_specific.s_sparing;
+	int i;
+	struct buffer_head *bh;
+
+	map->s_partition_type = UDF_SPARABLE_MAP15;
+	sdata->s_packet_len = le16_to_cpu(spm->packetLength);
+	if (!is_power_of_2(sdata->s_packet_len)) {
+		udf_err(sb, "error loading logical volume descriptor: "
+			"Invalid packet length %u\n",
+			(unsigned)sdata->s_packet_len);
+		return -EIO;
+	}
+	if (spm->numSparingTables > 4) {
+		udf_err(sb, "error loading logical volume descriptor: "
+			"Too many sparing tables (%d)\n",
+			(int)spm->numSparingTables);
+		return -EIO;
+	}
+
+	for (i = 0; i < spm->numSparingTables; i++) {
+		loc = le32_to_cpu(spm->locSparingTable[i]);
+		bh = udf_read_tagged(sb, loc, loc, &ident);
+		if (!bh)
+			continue;
+
+		st = (struct sparingTable *)bh->b_data;
+		if (ident != 0 ||
+		    strncmp(st->sparingIdent.ident, UDF_ID_SPARING,
+			    strlen(UDF_ID_SPARING)) ||
+		    sizeof(*st) + le16_to_cpu(st->reallocationTableLen) >
+			sb->s_blocksize) {
+			brelse(bh);
+			continue;
+		}
+
+		sdata->s_spar_map[i] = bh;
+	}
+	map->s_partition_func = udf_get_pblock_spar15;
+	return 0;
+}
+
 static int udf_load_logicalvol(struct super_block *sb, sector_t block,
 			       struct kernel_lb_addr *fileset)
 {
 	struct logicalVolDesc *lvd;
-	int i, j, offset;
+	int i, offset;
 	uint8_t type;
 	struct udf_sb_info *sbi = UDF_SB(sb);
 	struct genericPartitionMap *gpm;
@@ -1281,38 +1330,9 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
 			} else if (!strncmp(upm2->partIdent.ident,
 						UDF_ID_SPARABLE,
 						strlen(UDF_ID_SPARABLE))) {
-				uint32_t loc;
-				struct sparingTable *st;
-				struct sparablePartitionMap *spm =
-					(struct sparablePartitionMap *)gpm;
-
-				map->s_partition_type = UDF_SPARABLE_MAP15;
-				map->s_type_specific.s_sparing.s_packet_len =
-					le16_to_cpu(spm->packetLength);
-				for (j = 0; j < spm->numSparingTables; j++) {
-					struct buffer_head *bh2;
-
-					loc = le32_to_cpu(
-						spm->locSparingTable[j]);
-					bh2 = udf_read_tagged(sb, loc, loc,
-						&ident);
-					map->s_type_specific.s_sparing.
-						s_spar_map[j] = bh2;
-
-					if (bh2 == NULL)
-						continue;
-
-					st = (struct sparingTable *)bh2->b_data;
-					if (ident != 0 || strncmp(
-						st->sparingIdent.ident,
-						UDF_ID_SPARING,
-						strlen(UDF_ID_SPARING))) {
-						brelse(bh2);
-						map->s_type_specific.s_sparing.
-							s_spar_map[j] = NULL;
-					}
-				}
-				map->s_partition_func = udf_get_pblock_spar15;
+				if (udf_load_sparable_map(sb, map,
+				    (struct sparablePartitionMap *)gpm) < 0)
+					goto out_bh;
 			} else if (!strncmp(upm2->partIdent.ident,
 						UDF_ID_METADATA,
 						strlen(UDF_ID_METADATA))) {
-- cgit v1.2.3
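
Editor's note: the two UDF hardening patches above share one defensive pattern: read the
on-disk length field, verify that the structure it describes still fits inside the single
block that was actually read, and only then walk it. Below is a minimal stand-alone sketch
of that check; the names validate_ondisk_table, hdr_size, table_len and blocksize are
illustrative only and are not part of the kernel code above.

#include <stdint.h>
#include <stddef.h>

/*
 * Hypothetical helper: a fixed header of 'hdr_size' bytes is followed by
 * 'table_len' bytes of table data, and the whole descriptor must fit in
 * the single 'blocksize' block read from (possibly corrupted) media.
 */
static int validate_ondisk_table(size_t hdr_size, uint32_t table_len,
				 size_t blocksize)
{
	/* Equivalent to hdr_size + table_len > blocksize, written to avoid overflow. */
	if (table_len > blocksize || hdr_size > blocksize - table_len)
		return -1;	/* corrupted descriptor: caller should stop parsing */
	return 0;		/* safe to walk table_len bytes after the header */
}

The kernel patches apply the same idea with udf_err() reporting and goto out_bh or an
-EIO return instead of a plain error code, and then bound the partition-map loop by the
validated table_len rather than by the raw on-disk value.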